From 96e467bbd6beffe48bc8b7f8c6f655b3c542c847 Mon Sep 17 00:00:00 2001 From: guo zebin Date: Tue, 27 Jan 2026 16:12:53 +0800 Subject: [PATCH] =?UTF-8?q?=E7=9F=A5=E8=AF=86=E5=BA=93=E6=A8=A1=E5=9D=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 优化yueshen知识库初始化逻辑; 2. 更新yueshen知识库默认使用fay透传的embedding模型; 3.补充知识库目录“新知识库”。 --- faymcp/data/mcp_prestart_tools.json | 16 +++++++ faymcp/data/mcp_servers.json | 74 ++++++++++++++++++++++++----- faymcp/data/mcp_tool_states.json | 6 ++- mcp_servers/yueshen_rag/server.py | 54 ++++++++++++++++++--- 新知识库/README.md | 31 ++++++++++++ 5 files changed, 163 insertions(+), 18 deletions(-) create mode 100644 faymcp/data/mcp_prestart_tools.json create mode 100644 新知识库/README.md diff --git a/faymcp/data/mcp_prestart_tools.json b/faymcp/data/mcp_prestart_tools.json new file mode 100644 index 0000000..129afaa --- /dev/null +++ b/faymcp/data/mcp_prestart_tools.json @@ -0,0 +1,16 @@ +{ + "4": { + "query_yueshen": { + "params": { + "embedding_api_key": "", + "embedding_base_url": "", + "embedding_model": "", + "query": "{{question}}", + "top_k": 5, + "where": {} + }, + "include_history": true, + "allow_function_call": false + } + } +} \ No newline at end of file diff --git a/faymcp/data/mcp_servers.json b/faymcp/data/mcp_servers.json index 0dfb388..b5ab3ba 100644 --- a/faymcp/data/mcp_servers.json +++ b/faymcp/data/mcp_servers.json @@ -3,7 +3,7 @@ "id": 1, "name": "tools", "ip": "", - "connection_time": "2025-12-10 21:16:35", + "connection_time": "2026-01-22 15:59:12", "key": "", "transport": "stdio", "command": "python", @@ -47,7 +47,7 @@ "id": 4, "name": "yueshen rag", "ip": "", - "connection_time": "2025-12-10 21:16:44", + "connection_time": "2026-01-27 16:01:24", "key": "", "transport": "stdio", "command": "C:\\Users\\Lenovo\\anaconda3\\envs\\rag\\python.exe", @@ -55,20 +55,13 @@ "mcp_servers/yueshen_rag/server.py" ], "cwd": "", - "env": { - "YUESHEN_AUTO_INGEST": "1", - "YUESHEN_AUTO_INTERVAL": "300", - "YUESHEN_AUTO_RESET_ON_START": "0", - "YUESHEN_EMBED_API_KEY": "sk-izmvqrzyhjghzyghiofqfpusxprmfljntxzggkcovtneqpas", - "YUESHEN_EMBED_BASE_URL": "https://api.siliconflow.cn/v1", - "YUESHEN_EMBED_MODEL": "Qwen/Qwen3-Embedding-8B" - } + "env": {} }, { "id": 5, "name": "window capture", "ip": "", - "connection_time": "2025-12-10 21:16:45", + "connection_time": "2025-12-17 22:26:55", "key": "", "transport": "stdio", "command": "python", @@ -77,5 +70,64 @@ ], "cwd": "", "env": {} + }, + { + "id": 6, + "name": "broswermcp", + "ip": "", + "connection_time": "2026-01-15 14:18:38", + "key": "", + "transport": "stdio", + "command": "npx", + "args": [ + "@browsermcp/mcp@latest" + ], + "cwd": "", + "env": {} + }, + { + "id": 7, + "name": "Todo Server", + "ip": "", + "connection_time": "2025-12-17 16:06:29", + "key": "", + "transport": "stdio", + "command": "python", + "args": [ + "-u", + "server.py" + ], + "cwd": "mcp_servers/mcp-todo-server", + "env": {} + }, + { + "id": 8, + "name": "KinEcho", + "ip": "", + "connection_time": "2025-12-24 11:20:24", + "key": "", + "transport": "stdio", + "command": "python", + "args": [ + "server.py" + ], + "cwd": "mcp_servers\\elderly_mcp", + "env": {} + }, + { + "id": 9, + "name": "easydeal-trading", + "ip": "", + "connection_time": "2026-01-15 20:30:00", + "key": "", + "transport": "stdio", + "command": "C:\\Users\\Lenovo\\anaconda3\\envs\\fxai\\python.exe", + "args": [ + "D:/Projects/easy_deal_agent/easy-deal/easydeal_mcp_server.py" + ], + "cwd": "D:/Projects/easy_deal_agent/easy-deal", + "env": { + "EA_PROFILE_PATH": "D:\\Projects\\easy_deal_agent\\config.set" + } } ] \ No newline at end of file diff --git a/faymcp/data/mcp_tool_states.json b/faymcp/data/mcp_tool_states.json index ab80d07..d683a78 100644 --- a/faymcp/data/mcp_tool_states.json +++ b/faymcp/data/mcp_tool_states.json @@ -37,7 +37,11 @@ "search_files": false, "write_file": false }, - "4": {}, + "4": { + "ingest_yueshen": false, + "yueshen_stats": false, + "query_yueshen": false + }, "6": { "list_windows": true } diff --git a/mcp_servers/yueshen_rag/server.py b/mcp_servers/yueshen_rag/server.py index c6a3a9a..bce8f15 100644 --- a/mcp_servers/yueshen_rag/server.py +++ b/mcp_servers/yueshen_rag/server.py @@ -62,9 +62,10 @@ DEFAULT_PERSIST_DIR = os.getenv( os.path.join(PROJECT_ROOT, "cache_data", "chromadb_yueshen"), ) COLLECTION_NAME = "yueshen_kb" -DEFAULT_EMBED_BASE_URL = os.getenv("YUESHEN_EMBED_BASE_URL") -DEFAULT_EMBED_API_KEY = os.getenv("YUESHEN_EMBED_API_KEY") -DEFAULT_EMBED_MODEL = os.getenv("YUESHEN_EMBED_MODEL", "text-embedding-3-small") +# Default to Fay's OpenAI-compatible passthrough embedding endpoint. +DEFAULT_EMBED_BASE_URL = os.getenv("YUESHEN_EMBED_BASE_URL", "http://127.0.0.1:5000/v1") +DEFAULT_EMBED_API_KEY = os.getenv("YUESHEN_EMBED_API_KEY", "sk-fay") +DEFAULT_EMBED_MODEL = os.getenv("YUESHEN_EMBED_MODEL", "embedding") AUTO_INGEST_ENABLED = os.getenv("YUESHEN_AUTO_INGEST", "1") != "0" AUTO_INGEST_INTERVAL = int(os.getenv("YUESHEN_AUTO_INTERVAL", "300")) AUTO_RESET_ON_START = os.getenv("YUESHEN_AUTO_RESET_ON_START", "0") != "0" @@ -469,6 +470,7 @@ class AutoIngestor: self._stop = threading.Event() self._thread: Optional[threading.Thread] = None self._snapshot: Dict[str, Tuple[float, int]] = {} + self._last_ingest_ok = False def _take_snapshot(self) -> Dict[str, Tuple[float, int]]: snap: Dict[str, Tuple[float, int]] = {} @@ -490,7 +492,13 @@ class AutoIngestor: return True return False - def _ingest_once(self, reset: bool = False): + def _vector_count(self) -> Optional[int]: + try: + return self.km.store.collection.count() + except Exception: + return None + + def _ingest_once(self, reset: bool = False) -> bool: try: res = self.km.ingest( corpus_dir=self.km.corpus_dir, @@ -500,8 +508,12 @@ class AutoIngestor: embedding_model=self.km.embedder.model, ) logger.info("Auto-ingest done: %s", json.dumps(res, ensure_ascii=False)) + self._last_ingest_ok = True + return True except Exception as exc: logger.error("Auto-ingest failed: %s", exc) + self._last_ingest_ok = False + return False def _loop(self): # initial snapshot and optional first ingest @@ -514,8 +526,17 @@ class AutoIngestor: self._ingest_once(reset=False) while not self._stop.wait(self.interval): - if self._has_changes(): - logger.info("Detected corpus change, auto-ingest...") + changed = self._has_changes() + vectors = self._vector_count() + needs_bootstrap = (vectors == 0) or (vectors is None and not self._last_ingest_ok) + if changed or needs_bootstrap: + if changed: + reason = "corpus change" + elif vectors == 0: + reason = "empty vector store" + else: + reason = "retry after ingest failure" + logger.info("Auto-ingest trigger: %s", reason) self._ingest_once(reset=False) def start(self): @@ -636,6 +657,27 @@ async def call_tool(name: str, arguments: Dict[str, Any]) -> List[TextContent]: if name == "query_yueshen": query_text = arguments.get("query", "") + try: + vector_count = manager.store.collection.count() + except Exception: + vector_count = None + if vector_count == 0: + return [ + TextContent( + type="text", + text=json.dumps( + { + "results": [], + "count": 0, + "skipped": True, + "reason": "vector store empty; run ingest_yueshen first", + "stats": manager.stats(), + }, + ensure_ascii=False, + indent=2, + ), + ) + ] # 跳过常见问候和简单回复,不进行知识库查询 if _is_trivial_query(query_text): return [TextContent(type="text", text=json.dumps({ diff --git a/新知识库/README.md b/新知识库/README.md new file mode 100644 index 0000000..26b8b34 --- /dev/null +++ b/新知识库/README.md @@ -0,0 +1,31 @@ +# 新知识库(YueShen MCP 知识库目录) + +这个目录是 `yueshen_rag` MCP 服务器默认扫描的知识库目录(默认等价于项目根目录下的 `新知识库`)。 + +## 怎么用(最少配置版) + +1) 把文档放进本目录(可放子目录)。 +2) 启动 Fay(确保 `http://127.0.0.1:5000` 可用)。 +3) 在 Fay 的 MCP 中启用 `yueshen_rag` 服务器。 +4) 首次使用建议先执行一次工具:`ingest_yueshen`(建立索引)。 +5) 之后用工具:`query_yueshen` 进行检索。 + +> 提示:默认会把向量索引持久化到 `cache_data/chromadb_yueshen`。 + +## 支持的文件格式(以代码为准) + +当前 `yueshen_rag` 实际会扫描并处理: + +- `.pdf` +- `.docx` + +不在上述列表中的文件会被忽略(例如 `.doc` / `.txt`)。如需使用 `.doc`,请先转换为 `.docx`。 + +## 常用工具说明 + +- `ingest_yueshen`:扫描本目录文档并写入向量库 +- `query_yueshen`:向量检索(输入 `query`) +- `yueshen_stats`:查看当前向量库状态(例如向量数量) + +如果检索返回提示 “vector store empty; run ingest_yueshen first”,说明还没建立索引,先执行一次 `ingest_yueshen` 即可。 +