feat: file send prompt

fix: workspace bootstrap
fix: web_fetch encoding
2026-03-12 18:01:30 +08:00 · 2026-03-12 00:11:34 +08:00 · 2026-03-11 23:35:42 +08:00 · 2026-03-11 19:42:37 +08:00 · 2026-03-11 17:47:15 +08:00
6 changed files with 119 additions and 26 deletions
--- a/agent/prompt/builder.py
+++ b/agent/prompt/builder.py
@@ -386,10 +386,24 @@ def _build_workspace_section(workspace_dir: str, language: str) -> List[str]:
        "- 例如用自然表达例如「我已记住」而不是「已更新 MEMORY.md」",
        "",
    ]
+
+    # Cloud deployment: inject websites directory info and access URL
+    cloud_website_lines = _build_cloud_website_section(workspace_dir)
+    if cloud_website_lines:
+        lines.extend(cloud_website_lines)
    
    return lines


+def _build_cloud_website_section(workspace_dir: str) -> List[str]:
+    """Return the cloud website prompt lines for *workspace_dir*.
+
+    Delegates to ``common.cloud_client.build_website_prompt``. The import is
+    performed lazily inside the function, and any exception raised by the
+    import or by the call is swallowed, so the caller receives an empty list
+    instead of an error.
+    """
+    section: List[str] = []
+    try:
+        from common.cloud_client import build_website_prompt
+        section = build_website_prompt(workspace_dir)
+    except Exception:
+        # Best effort: on any failure the section is simply left out.
+        section = []
+    return section
+
+
 def _build_context_files_section(context_files: List[ContextFile], language: str) -> List[str]:
    """构建项目上下文文件section"""
    if not context_files:
--- a/agent/prompt/workspace.py
+++ b/agent/prompt/workspace.py
@@ -42,14 +42,16 @@ def ensure_workspace(workspace_dir: str, create_templates: bool = True) -> Works
    Returns:
        WorkspaceFiles对象，包含所有文件路径
    """
-    # Check if this is a brand new workspace (before creating the directory)
-    is_new_workspace = not os.path.exists(workspace_dir)
+    # Check if this is a brand new workspace (AGENT.md not yet created).
+    # Cannot rely on directory existence because other modules (e.g. ConversationStore)
+    # may create the workspace directory before ensure_workspace is called.
+    agent_path = os.path.join(workspace_dir, DEFAULT_AGENT_FILENAME)
+    is_new_workspace = not os.path.exists(agent_path)
    
    # 确保目录存在
    os.makedirs(workspace_dir, exist_ok=True)
    
    # 定义文件路径
-    agent_path = os.path.join(workspace_dir, DEFAULT_AGENT_FILENAME)
    user_path = os.path.join(workspace_dir, DEFAULT_USER_FILENAME)
    rule_path = os.path.join(workspace_dir, DEFAULT_RULE_FILENAME)
    memory_path = os.path.join(workspace_dir, DEFAULT_MEMORY_FILENAME)  # MEMORY.md 在根目录
@@ -61,6 +63,10 @@ def ensure_workspace(workspace_dir: str, create_templates: bool = True) -> Works
    # 创建skills子目录 (for workspace-level skills installed by agent)
    skills_dir = os.path.join(workspace_dir, "skills")
    os.makedirs(skills_dir, exist_ok=True)
+
+    # 创建websites子目录 (for web pages / sites generated by agent)
+    websites_dir = os.path.join(workspace_dir, "websites")
+    os.makedirs(websites_dir, exist_ok=True)
    
    # 如果需要，创建模板文件
    if create_templates:
--- a/agent/tools/web_fetch/web_fetch.py
+++ b/agent/tools/web_fetch/web_fetch.py
@@ -29,7 +29,7 @@ DEFAULT_HEADERS = {

 # Supported document file extensions
 PDF_SUFFIXES: Set[str] = {".pdf"}
-WORD_SUFFIXES: Set[str] = {".doc", ".docx"}
+WORD_SUFFIXES: Set[str] = {".docx"}
 TEXT_SUFFIXES: Set[str] = {".txt", ".md", ".markdown", ".rst", ".csv", ".tsv", ".log"}
 SPREADSHEET_SUFFIXES: Set[str] = {".xls", ".xlsx"}
 PPT_SUFFIXES: Set[str] = {".ppt", ".pptx"}
@@ -56,7 +56,7 @@ class WebFetch(BaseTool):
    description: str = (
        "Fetch content from a URL. For web pages, extracts readable text. "
        "For document files (PDF, Word, TXT, Markdown, Excel, PPT), downloads and parses the file content. "
-        "Supported file types: .pdf, .doc, .docx, .txt, .md, .csv, .xls, .xlsx, .ppt, .pptx"
+        "Supported file types: .pdf, .docx, .txt, .md, .csv, .xls, .xlsx, .ppt, .pptx"
    )

    params: dict = {
@@ -114,6 +114,14 @@ class WebFetch(BaseTool):
        if self._is_binary_content_type(content_type) and not _is_document_url(url):
            return self._handle_download_by_content_type(url, response, content_type)

+        # Fix encoding: use apparent_encoding to auto-detect, but keep Windows encodings as-is
+        if response.apparent_encoding and response.apparent_encoding.lower().startswith("windows"):
+            response.encoding = response.encoding
+        else:
+            response.encoding = response.apparent_encoding
+        if not response.encoding:
+            response.encoding = "utf-8"
+
        html = response.text
        title = self._extract_title(html)
        text = self._extract_text(html)
@@ -226,29 +234,16 @@ class WebFetch(BaseTool):
        return "\n\n".join(text_parts)

    def _parse_word(self, file_path: str) -> str:
-        """Extract text from Word documents (.doc/.docx)."""
-        suffix = os.path.splitext(file_path)[-1].lower()
-
-        if suffix == ".docx":
-            try:
-                from docx import Document
-            except ImportError:
-                raise ImportError(
-                    "python-docx library is required for .docx parsing. Install with: pip install python-docx"
-                )
-            doc = Document(file_path)
-            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
-            return "\n\n".join(paragraphs)
-
-        # .doc format - try textract or fallback
+        """Extract text from Word documents (.docx)."""
        try:
-            import textract
-            text = textract.process(file_path).decode("utf-8")
-            return text
+            from docx import Document
        except ImportError:
            raise ImportError(
-                "textract library is required for .doc parsing. Install with: pip install textract"
+                "python-docx library is required for .docx parsing. Install with: pip install python-docx"
            )
+        doc = Document(file_path)
+        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
+        return "\n\n".join(paragraphs)

    def _parse_text(self, file_path: str) -> str:
        """Read plain text files (txt, md, csv, etc.)."""
@@ -344,7 +339,6 @@ class WebFetch(BaseTool):
        """Check if Content-Type indicates a binary/document response."""
        binary_types = [
            "application/pdf",
-            "application/msword",
            "application/vnd.openxmlformats",
            "application/vnd.ms-excel",
            "application/vnd.ms-powerpoint",
@@ -358,7 +352,6 @@ class WebFetch(BaseTool):
        ct_lower = content_type.lower()
        suffix_map = {
            "application/pdf": ".pdf",
-            "application/msword": ".doc",
            "application/vnd.openxmlformats-officedocument.wordprocessingml": ".docx",
            "application/vnd.ms-excel": ".xls",
            "application/vnd.openxmlformats-officedocument.spreadsheetml": ".xlsx",
--- a/common/cloud_client.py
+++ b/common/cloud_client.py
@@ -516,6 +516,79 @@ class CloudClient(LinkAIClient):
            logger.error(f"[CloudClient] Failed to save configuration to config.json: {e}")


+def get_root_domain(host: str = "") -> str:
+    """Extract root domain from a hostname.
+
+    If *host* is empty, reads CLOUD_HOST env var / cloud_host config.
+
+    *host* may be a bare hostname, ``host:port`` or a full URL; the scheme,
+    path and port are discarded. The root domain is the last two
+    dot-separated labels (``client.link-ai.tech`` -> ``link-ai.tech``), and a
+    single-label host is returned unchanged. A trailing dot of a
+    fully-qualified name is ignored.
+
+    IP address literals (IPv4, or IPv6 either bare or in brackets) have no
+    root domain, so they are returned whole instead of being cut down to
+    their last two components.
+
+    NOTE: multi-label public suffixes such as ``co.uk`` are not recognised;
+    handling them would require a public-suffix list.
+
+    Returns an empty string when no host is available.
+    """
+    import ipaddress
+
+    def _is_ip(value: str) -> bool:
+        # True when *value* parses as an IPv4 or IPv6 address literal.
+        try:
+            ipaddress.ip_address(value)
+        except ValueError:
+            return False
+        return True
+
+    if not host:
+        host = os.environ.get("CLOUD_HOST") or conf().get("cloud_host", "")
+    if not host:
+        return ""
+    host = host.strip().rstrip("/")
+    if "://" in host:
+        host = host.split("://", 1)[1]
+    netloc = host.split("/", 1)[0]
+
+    # IPv6 literals contain ":" themselves, so test for them before the
+    # port separator is cut off below.
+    literal = netloc[1:].split("]", 1)[0] if netloc.startswith("[") else netloc
+    if _is_ip(literal):
+        return literal
+
+    hostname = netloc.split(":")[0]
+    if _is_ip(hostname):
+        return hostname
+
+    # "example.com." would otherwise produce an empty last label.
+    hostname = hostname.rstrip(".")
+    parts = hostname.split(".")
+    if len(parts) >= 2:
+        return ".".join(parts[-2:])
+    return hostname
+
+
+def get_deployment_id() -> str:
+    """Return the cloud deployment id.
+
+    A non-empty CLOUD_DEPLOYMENT_ID environment variable takes precedence;
+    otherwise the ``cloud_deployment_id`` config entry is used, defaulting to
+    an empty string.
+    """
+    from_env = os.environ.get("CLOUD_DEPLOYMENT_ID")
+    if from_env:
+        return from_env
+    return conf().get("cloud_deployment_id", "")
+
+
+def get_website_base_url() -> str:
+    """Return the public URL prefix that maps to the workspace websites/ dir.
+
+    The prefix is ``https://<domain>/<deployment_id>``. The domain is the
+    CLOUD_WEBSITES_DOMAIN env var / ``cloud_websites_domain`` config entry
+    when one is set; otherwise it is ``app.`` followed by the value of
+    ``get_root_domain()``.
+
+    Returns empty string when cloud deployment is not configured.
+    """
+    deployment_id = get_deployment_id()
+    if not deployment_id:
+        return ""
+
+    websites_domain = os.environ.get("CLOUD_WEBSITES_DOMAIN") or conf().get("cloud_websites_domain", "")
+    if websites_domain:
+        websites_domain = websites_domain.strip()
+        # Tolerate a value given as a URL: without this, "https://host"
+        # would be rendered as "https://https://host/<id>".
+        if "://" in websites_domain:
+            websites_domain = websites_domain.split("://", 1)[1]
+        websites_domain = websites_domain.rstrip("/")
+        # A whitespace-only or scheme-only value falls through to the
+        # root-domain fallback instead of yielding "https:///<id>".
+        if websites_domain:
+            return f"https://{websites_domain}/{deployment_id}"
+
+    domain = get_root_domain()
+    if not domain:
+        return ""
+    return f"https://app.{domain}/{deployment_id}"
+
+
+def build_website_prompt(workspace_dir: str) -> list:
+    """Build the system prompt lines describing cloud website/file sharing.
+
+    The lines tell the agent to place generated web pages and shareable
+    files under the workspace ``websites/`` directory and to send the
+    resulting public link, using the prefix from ``get_website_base_url()``.
+    *workspace_dir* is accepted but not read by this function.
+
+    Returns an empty list when cloud deployment is not configured,
+    so callers can safely do ``lines.extend(build_website_prompt(...))``.
+    """
+    url_prefix = get_website_base_url()
+    if not url_prefix:
+        return []
+
+    # Title and the public URL prefix of the websites/ directory.
+    header = [
+        "**文件分享与网页生成规则** (非常重要 — 当前为云部署模式):",
+        "",
+        f"云端已为工作空间的 `websites/` 目录配置好公网路由映射，访问地址前缀为: `{url_prefix}`",
+        "",
+    ]
+    # Rule 1: web pages must live under websites/.
+    web_rule = [
+        "1. **网页/网站**: 编写网页、H5页面等前端代码时，**必须**将文件放到 `websites/` 目录中",
+        f"   - 例如: `websites/index.html` → `{url_prefix}/index.html`",
+        f"   - 例如: `websites/my-app/index.html` → `{url_prefix}/my-app/index.html`",
+        "",
+    ]
+    # Rule 2: generated files may be shared through websites/ as well.
+    file_rule = [
+        "2. **生成文件分享** (PPT、PDF、图片、音视频等): 当你为用户生成了需要下载或查看的文件时，**可以**将文件保存到 `websites/` 目录中",
+        f"   - 例如: 生成的PPT保存到 `websites/files/report.pptx` → 下载链接为 `{url_prefix}/files/report.pptx`",
+        "   - 你仍然可以同时使用 `send` 工具发送文件（在飞书、钉钉等IM渠道中有效），但**必须同时在回复文本中提供下载链接**作为兜底，因为部分渠道（如网页端）无法通过 send 接收本地文件",
+        "",
+    ]
+    # Rules 3 and 4: always send the link; keep one sub-directory per project.
+    closing_rules = [
+        "3. **必须发送链接**: 无论是网页还是文件，生成后**必须将完整的访问/下载链接直接写在回复文本中发送给用户**",
+        "",
+        "4. 建议为每个独立项目在 `websites/` 下创建子目录，保持结构清晰",
+        "",
+    ]
+    return header + web_rule + file_rule + closing_rules
+
 def start(channel, channel_mgr=None):
    global chat_client
    chat_client = CloudClient(api_key=conf().get("linkai_api_key"), host=conf().get("cloud_host", ""), channel=channel)
--- a/config.py
+++ b/config.py
@@ -188,6 +188,7 @@ available_setting = {
    "linkai_app_code": "",
    "linkai_api_base": "https://api.link-ai.tech",  # linkAI服务地址
    "cloud_host": "client.link-ai.tech",
+    "cloud_deployment_id": "",
    "minimax_api_key": "",
    "Minimax_group_id": "",
    "Minimax_base_url": "",
--- a/requirements-optional.txt
+++ b/requirements-optional.txt
@@ -29,3 +29,9 @@ google-generativeai

 # tencentcloud sdk
 tencentcloud-sdk-python>=3.0.0
+
+# file parsing (web_fetch document support)
+pypdf
+python-docx
+openpyxl
+python-pptx
Author	SHA1	Message	Date
zhayujie	ee0c47ac1e	feat: file send prompt	2026-03-12 00:11:34 +08:00
zhayujie	eba90e9343	fix: workspace bootstrap	2026-03-11 23:35:42 +08:00
zhayujie	d8374d0fa5	fix: web_fetch encoding	2026-03-11 19:42:37 +08:00
zhayujie	fa61744c6d	feat(web_fetch): support downloading and parsing remote document files (PDF, Word, Excel, PPT)	2026-03-11 17:47:15 +08:00