知识库模块

1. 优化yueshen知识库初始化逻辑;
2. 更新yueshen知识库默认使用fay透传的embedding模型;
3. 补充知识库目录“新知识库”。
This commit is contained in:
guo zebin
2026-01-27 16:12:53 +08:00
parent e692fb9f6d
commit 96e467bbd6
5 changed files with 163 additions and 18 deletions

View File

@@ -0,0 +1,16 @@
{
"4": {
"query_yueshen": {
"params": {
"embedding_api_key": "",
"embedding_base_url": "",
"embedding_model": "",
"query": "{{question}}",
"top_k": 5,
"where": {}
},
"include_history": true,
"allow_function_call": false
}
}
}

View File

@@ -3,7 +3,7 @@
"id": 1,
"name": "tools",
"ip": "",
"connection_time": "2025-12-10 21:16:35",
"connection_time": "2026-01-22 15:59:12",
"key": "",
"transport": "stdio",
"command": "python",
@@ -47,7 +47,7 @@
"id": 4,
"name": "yueshen rag",
"ip": "",
"connection_time": "2025-12-10 21:16:44",
"connection_time": "2026-01-27 16:01:24",
"key": "",
"transport": "stdio",
"command": "C:\\Users\\Lenovo\\anaconda3\\envs\\rag\\python.exe",
@@ -55,20 +55,13 @@
"mcp_servers/yueshen_rag/server.py"
],
"cwd": "",
"env": {
"YUESHEN_AUTO_INGEST": "1",
"YUESHEN_AUTO_INTERVAL": "300",
"YUESHEN_AUTO_RESET_ON_START": "0",
"YUESHEN_EMBED_API_KEY": "sk-izmvqrzyhjghzyghiofqfpusxprmfljntxzggkcovtneqpas",
"YUESHEN_EMBED_BASE_URL": "https://api.siliconflow.cn/v1",
"YUESHEN_EMBED_MODEL": "Qwen/Qwen3-Embedding-8B"
}
"env": {}
},
{
"id": 5,
"name": "window capture",
"ip": "",
"connection_time": "2025-12-10 21:16:45",
"connection_time": "2025-12-17 22:26:55",
"key": "",
"transport": "stdio",
"command": "python",
@@ -77,5 +70,64 @@
],
"cwd": "",
"env": {}
},
{
"id": 6,
"name": "broswermcp",
"ip": "",
"connection_time": "2026-01-15 14:18:38",
"key": "",
"transport": "stdio",
"command": "npx",
"args": [
"@browsermcp/mcp@latest"
],
"cwd": "",
"env": {}
},
{
"id": 7,
"name": "Todo Server",
"ip": "",
"connection_time": "2025-12-17 16:06:29",
"key": "",
"transport": "stdio",
"command": "python",
"args": [
"-u",
"server.py"
],
"cwd": "mcp_servers/mcp-todo-server",
"env": {}
},
{
"id": 8,
"name": "KinEcho",
"ip": "",
"connection_time": "2025-12-24 11:20:24",
"key": "",
"transport": "stdio",
"command": "python",
"args": [
"server.py"
],
"cwd": "mcp_servers\\elderly_mcp",
"env": {}
},
{
"id": 9,
"name": "easydeal-trading",
"ip": "",
"connection_time": "2026-01-15 20:30:00",
"key": "",
"transport": "stdio",
"command": "C:\\Users\\Lenovo\\anaconda3\\envs\\fxai\\python.exe",
"args": [
"D:/Projects/easy_deal_agent/easy-deal/easydeal_mcp_server.py"
],
"cwd": "D:/Projects/easy_deal_agent/easy-deal",
"env": {
"EA_PROFILE_PATH": "D:\\Projects\\easy_deal_agent\\config.set"
}
}
]

View File

@@ -37,7 +37,11 @@
"search_files": false,
"write_file": false
},
"4": {},
"4": {
"ingest_yueshen": false,
"yueshen_stats": false,
"query_yueshen": false
},
"6": {
"list_windows": true
}

View File

@@ -62,9 +62,10 @@ DEFAULT_PERSIST_DIR = os.getenv(
os.path.join(PROJECT_ROOT, "cache_data", "chromadb_yueshen"),
)
COLLECTION_NAME = "yueshen_kb"
DEFAULT_EMBED_BASE_URL = os.getenv("YUESHEN_EMBED_BASE_URL")
DEFAULT_EMBED_API_KEY = os.getenv("YUESHEN_EMBED_API_KEY")
DEFAULT_EMBED_MODEL = os.getenv("YUESHEN_EMBED_MODEL", "text-embedding-3-small")
# Default to Fay's OpenAI-compatible passthrough embedding endpoint.
DEFAULT_EMBED_BASE_URL = os.getenv("YUESHEN_EMBED_BASE_URL", "http://127.0.0.1:5000/v1")
DEFAULT_EMBED_API_KEY = os.getenv("YUESHEN_EMBED_API_KEY", "sk-fay")
DEFAULT_EMBED_MODEL = os.getenv("YUESHEN_EMBED_MODEL", "embedding")
AUTO_INGEST_ENABLED = os.getenv("YUESHEN_AUTO_INGEST", "1") != "0"
AUTO_INGEST_INTERVAL = int(os.getenv("YUESHEN_AUTO_INTERVAL", "300"))
AUTO_RESET_ON_START = os.getenv("YUESHEN_AUTO_RESET_ON_START", "0") != "0"
@@ -469,6 +470,7 @@ class AutoIngestor:
self._stop = threading.Event()
self._thread: Optional[threading.Thread] = None
self._snapshot: Dict[str, Tuple[float, int]] = {}
self._last_ingest_ok = False
def _take_snapshot(self) -> Dict[str, Tuple[float, int]]:
snap: Dict[str, Tuple[float, int]] = {}
@@ -490,7 +492,13 @@ class AutoIngestor:
return True
return False
def _ingest_once(self, reset: bool = False):
def _vector_count(self) -> Optional[int]:
try:
return self.km.store.collection.count()
except Exception:
return None
def _ingest_once(self, reset: bool = False) -> bool:
try:
res = self.km.ingest(
corpus_dir=self.km.corpus_dir,
@@ -500,8 +508,12 @@ class AutoIngestor:
embedding_model=self.km.embedder.model,
)
logger.info("Auto-ingest done: %s", json.dumps(res, ensure_ascii=False))
self._last_ingest_ok = True
return True
except Exception as exc:
logger.error("Auto-ingest failed: %s", exc)
self._last_ingest_ok = False
return False
def _loop(self):
# initial snapshot and optional first ingest
@@ -514,8 +526,17 @@ class AutoIngestor:
self._ingest_once(reset=False)
while not self._stop.wait(self.interval):
if self._has_changes():
logger.info("Detected corpus change, auto-ingest...")
changed = self._has_changes()
vectors = self._vector_count()
needs_bootstrap = (vectors == 0) or (vectors is None and not self._last_ingest_ok)
if changed or needs_bootstrap:
if changed:
reason = "corpus change"
elif vectors == 0:
reason = "empty vector store"
else:
reason = "retry after ingest failure"
logger.info("Auto-ingest trigger: %s", reason)
self._ingest_once(reset=False)
def start(self):
@@ -636,6 +657,27 @@ async def call_tool(name: str, arguments: Dict[str, Any]) -> List[TextContent]:
if name == "query_yueshen":
query_text = arguments.get("query", "")
try:
vector_count = manager.store.collection.count()
except Exception:
vector_count = None
if vector_count == 0:
return [
TextContent(
type="text",
text=json.dumps(
{
"results": [],
"count": 0,
"skipped": True,
"reason": "vector store empty; run ingest_yueshen first",
"stats": manager.stats(),
},
ensure_ascii=False,
indent=2,
),
)
]
# 跳过常见问候和简单回复,不进行知识库查询
if _is_trivial_query(query_text):
return [TextContent(type="text", text=json.dumps({

31
新知识库/README.md Normal file
View File

@@ -0,0 +1,31 @@
# 新知识库(YueShen MCP 知识库目录)
这个目录是 `yueshen_rag` MCP 服务器默认扫描的知识库目录(默认等价于项目根目录下的 `新知识库`)。
## 怎么用(最少配置版)
1) 把文档放进本目录(可放子目录)。
2) 启动 Fay(确保 `http://127.0.0.1:5000` 可用)。
3) 在 Fay 的 MCP 中启用 `yueshen_rag` 服务器。
4) 首次使用建议先执行一次工具:`ingest_yueshen`(建立索引)。
5) 之后用工具:`query_yueshen` 进行检索。
> 提示:默认会把向量索引持久化到 `cache_data/chromadb_yueshen`。
## 支持的文件格式(以代码为准)
当前 `yueshen_rag` 实际会扫描并处理:
- `.pdf`
- `.docx`
不在上述列表中的文件会被忽略(例如 `.doc` / `.txt`)。如需使用 `.doc`,请先转换为 `.docx`。
## 常用工具说明
- `ingest_yueshen`:扫描本目录文档并写入向量库
- `query_yueshen`:向量检索(输入 `query`)
- `yueshen_stats`:查看当前向量库状态(例如向量数量)
如果检索返回提示 “vector store empty; run ingest_yueshen first”,说明还没建立索引,先执行一次 `ingest_yueshen` 即可。