知识库模块

1. 优化 yueshen 知识库初始化逻辑； 2. 更新 yueshen 知识库，默认使用 Fay 透传的 embedding 模型； 3. 补充知识库目录“新知识库”。
2026-03-12 17:51:28 +08:00 · 2026-01-27 16:12:53 +08:00
parent e692fb9f6d
commit 96e467bbd6
5 changed files with 163 additions and 18 deletions
--- a/faymcp/data/mcp_prestart_tools.json
+++ b/faymcp/data/mcp_prestart_tools.json
@@ -0,0 +1,16 @@
+{
+    "4": {
+        "query_yueshen": {
+            "params": {
+                "embedding_api_key": "",
+                "embedding_base_url": "",
+                "embedding_model": "",
+                "query": "{{question}}",
+                "top_k": 5,
+                "where": {}
+            },
+            "include_history": true,
+            "allow_function_call": false
+        }
+    }
+}
--- a/faymcp/data/mcp_servers.json
+++ b/faymcp/data/mcp_servers.json
@@ -3,7 +3,7 @@
        "id": 1,
        "name": "tools",
        "ip": "",
-        "connection_time": "2025-12-10 21:16:35",
+        "connection_time": "2026-01-22 15:59:12",
        "key": "",
        "transport": "stdio",
        "command": "python",
@@ -47,7 +47,7 @@
        "id": 4,
        "name": "yueshen rag",
        "ip": "",
-        "connection_time": "2025-12-10 21:16:44",
+        "connection_time": "2026-01-27 16:01:24",
        "key": "",
        "transport": "stdio",
        "command": "C:\\Users\\Lenovo\\anaconda3\\envs\\rag\\python.exe",
@@ -55,20 +55,13 @@
            "mcp_servers/yueshen_rag/server.py"
        ],
        "cwd": "",
-        "env": {
-            "YUESHEN_AUTO_INGEST": "1",
-            "YUESHEN_AUTO_INTERVAL": "300",
-            "YUESHEN_AUTO_RESET_ON_START": "0",
-            "YUESHEN_EMBED_API_KEY": "sk-izmvqrzyhjghzyghiofqfpusxprmfljntxzggkcovtneqpas",
-            "YUESHEN_EMBED_BASE_URL": "https://api.siliconflow.cn/v1",
-            "YUESHEN_EMBED_MODEL": "Qwen/Qwen3-Embedding-8B"
-        }
+        "env": {}
    },
    {
        "id": 5,
        "name": "window capture",
        "ip": "",
-        "connection_time": "2025-12-10 21:16:45",
+        "connection_time": "2025-12-17 22:26:55",
        "key": "",
        "transport": "stdio",
        "command": "python",
@@ -77,5 +70,64 @@
        ],
        "cwd": "",
        "env": {}
+    },
+    {
+        "id": 6,
+        "name": "broswermcp",
+        "ip": "",
+        "connection_time": "2026-01-15 14:18:38",
+        "key": "",
+        "transport": "stdio",
+        "command": "npx",
+        "args": [
+            "@browsermcp/mcp@latest"
+        ],
+        "cwd": "",
+        "env": {}
+    },
+    {
+        "id": 7,
+        "name": "Todo Server",
+        "ip": "",
+        "connection_time": "2025-12-17 16:06:29",
+        "key": "",
+        "transport": "stdio",
+        "command": "python",
+        "args": [
+            "-u",
+            "server.py"
+        ],
+        "cwd": "mcp_servers/mcp-todo-server",
+        "env": {}
+    },
+    {
+        "id": 8,
+        "name": "KinEcho",
+        "ip": "",
+        "connection_time": "2025-12-24 11:20:24",
+        "key": "",
+        "transport": "stdio",
+        "command": "python",
+        "args": [
+            "server.py"
+        ],
+        "cwd": "mcp_servers\\elderly_mcp",
+        "env": {}
+    },
+    {
+        "id": 9,
+        "name": "easydeal-trading",
+        "ip": "",
+        "connection_time": "2026-01-15 20:30:00",
+        "key": "",
+        "transport": "stdio",
+        "command": "C:\\Users\\Lenovo\\anaconda3\\envs\\fxai\\python.exe",
+        "args": [
+            "D:/Projects/easy_deal_agent/easy-deal/easydeal_mcp_server.py"
+        ],
+        "cwd": "D:/Projects/easy_deal_agent/easy-deal",
+        "env": {
+            "EA_PROFILE_PATH": "D:\\Projects\\easy_deal_agent\\config.set"
+        }
    }
 ]
--- a/faymcp/data/mcp_tool_states.json
+++ b/faymcp/data/mcp_tool_states.json
@@ -37,7 +37,11 @@
        "search_files": false,
        "write_file": false
    },
-    "4": {},
+    "4": {
+        "ingest_yueshen": false,
+        "yueshen_stats": false,
+        "query_yueshen": false
+    },
    "6": {
        "list_windows": true
    }
--- a/mcp_servers/yueshen_rag/server.py
+++ b/mcp_servers/yueshen_rag/server.py
@@ -62,9 +62,10 @@ DEFAULT_PERSIST_DIR = os.getenv(
    os.path.join(PROJECT_ROOT, "cache_data", "chromadb_yueshen"),
 )
 COLLECTION_NAME = "yueshen_kb"
-DEFAULT_EMBED_BASE_URL = os.getenv("YUESHEN_EMBED_BASE_URL")
-DEFAULT_EMBED_API_KEY = os.getenv("YUESHEN_EMBED_API_KEY")
-DEFAULT_EMBED_MODEL = os.getenv("YUESHEN_EMBED_MODEL", "text-embedding-3-small")
+# Default to Fay's OpenAI-compatible passthrough embedding endpoint.
+DEFAULT_EMBED_BASE_URL = os.getenv("YUESHEN_EMBED_BASE_URL", "http://127.0.0.1:5000/v1")
+DEFAULT_EMBED_API_KEY = os.getenv("YUESHEN_EMBED_API_KEY", "sk-fay")
+DEFAULT_EMBED_MODEL = os.getenv("YUESHEN_EMBED_MODEL", "embedding")
 AUTO_INGEST_ENABLED = os.getenv("YUESHEN_AUTO_INGEST", "1") != "0"
 AUTO_INGEST_INTERVAL = int(os.getenv("YUESHEN_AUTO_INTERVAL", "300"))
 AUTO_RESET_ON_START = os.getenv("YUESHEN_AUTO_RESET_ON_START", "0") != "0"
@@ -469,6 +470,7 @@ class AutoIngestor:
        self._stop = threading.Event()
        self._thread: Optional[threading.Thread] = None
        self._snapshot: Dict[str, Tuple[float, int]] = {}
+        self._last_ingest_ok = False

    def _take_snapshot(self) -> Dict[str, Tuple[float, int]]:
        snap: Dict[str, Tuple[float, int]] = {}
@@ -490,7 +492,13 @@ class AutoIngestor:
            return True
        return False

-    def _ingest_once(self, reset: bool = False):
+    def _vector_count(self) -> Optional[int]:
+        try:
+            return self.km.store.collection.count()
+        except Exception:
+            return None
+
+    def _ingest_once(self, reset: bool = False) -> bool:
        try:
            res = self.km.ingest(
                corpus_dir=self.km.corpus_dir,
@@ -500,8 +508,12 @@ class AutoIngestor:
                embedding_model=self.km.embedder.model,
            )
            logger.info("Auto-ingest done: %s", json.dumps(res, ensure_ascii=False))
+            self._last_ingest_ok = True
+            return True
        except Exception as exc:
            logger.error("Auto-ingest failed: %s", exc)
+            self._last_ingest_ok = False
+            return False

    def _loop(self):
        # initial snapshot and optional first ingest
@@ -514,8 +526,17 @@ class AutoIngestor:
            self._ingest_once(reset=False)

        while not self._stop.wait(self.interval):
-            if self._has_changes():
-                logger.info("Detected corpus change, auto-ingest...")
+            changed = self._has_changes()
+            vectors = self._vector_count()
+            needs_bootstrap = (vectors == 0) or (vectors is None and not self._last_ingest_ok)
+            if changed or needs_bootstrap:
+                if changed:
+                    reason = "corpus change"
+                elif vectors == 0:
+                    reason = "empty vector store"
+                else:
+                    reason = "retry after ingest failure"
+                logger.info("Auto-ingest trigger: %s", reason)
                self._ingest_once(reset=False)

    def start(self):
@@ -636,6 +657,27 @@ async def call_tool(name: str, arguments: Dict[str, Any]) -> List[TextContent]:

        if name == "query_yueshen":
            query_text = arguments.get("query", "")
+            try:
+                vector_count = manager.store.collection.count()
+            except Exception:
+                vector_count = None
+            if vector_count == 0:
+                return [
+                    TextContent(
+                        type="text",
+                        text=json.dumps(
+                            {
+                                "results": [],
+                                "count": 0,
+                                "skipped": True,
+                                "reason": "vector store empty; run ingest_yueshen first",
+                                "stats": manager.stats(),
+                            },
+                            ensure_ascii=False,
+                            indent=2,
+                        ),
+                    )
+                ]
            # 跳过常见问候和简单回复，不进行知识库查询
            if _is_trivial_query(query_text):
                return [TextContent(type="text", text=json.dumps({
--- a/新知识库/README.md
+++ b/新知识库/README.md
@@ -0,0 +1,31 @@
+# 新知识库（YueShen MCP 知识库目录）
+
+这个目录是 `yueshen_rag` MCP 服务器默认扫描的知识库目录（默认路径为项目根目录下的 `新知识库`）。
+
+## 怎么用（最少配置版）
+
+1) 把文档放进本目录（可放子目录）。
+2) 启动 Fay（确保 `http://127.0.0.1:5000` 可用；知识库默认通过 Fay 透传的 embedding 接口 `http://127.0.0.1:5000/v1` 生成向量）。
+3) 在 Fay 的 MCP 中启用 `yueshen rag` 服务器（对应 `mcp_servers/yueshen_rag/server.py`）。
+4) 首次使用建议先手动执行一次工具：`ingest_yueshen`（建立索引）。自动入库默认开启（`YUESHEN_AUTO_INGEST` 默认为 `1`），会按 `YUESHEN_AUTO_INTERVAL`（默认 300 秒）定期检查；手动执行可以不必等待下一次检查。
+5) 之后用工具：`query_yueshen` 进行检索。
+
+> 提示：默认会把向量索引持久化到 `cache_data/chromadb_yueshen`。
+
+## 支持的文件格式（以代码为准）
+
+当前 `yueshen_rag` 实际会扫描并处理：
+
+- `.pdf`
+- `.docx`
+
+不在上述列表中的文件会被忽略（例如 `.doc` / `.txt`）。如需使用 `.doc`，请先转换为 `.docx`。
+
+## 常用工具说明
+
+- `ingest_yueshen`：扫描本目录文档并写入向量库
+- `query_yueshen`：向量检索（输入 `query`）
+- `yueshen_stats`：查看当前向量库状态（例如向量数量）
+
+如果检索返回提示 “vector store empty; run ingest_yueshen first”，说明还没建立索引，先执行一次 `ingest_yueshen` 即可。
+