feat(web_fetch): support downloading and parsing remote document files (PDF, Word, Excel, PPT)

2026-03-12 18:01:30 +08:00 · 2026-03-11 17:47:15 +08:00
parent 4fec55cc01
commit fa61744c6d
2 changed files with 14 additions and 23 deletions
--- a/agent/tools/web_fetch/web_fetch.py
+++ b/agent/tools/web_fetch/web_fetch.py
@@ -29,7 +29,7 @@ DEFAULT_HEADERS = {

 # Supported document file extensions
 PDF_SUFFIXES: Set[str] = {".pdf"}
-WORD_SUFFIXES: Set[str] = {".doc", ".docx"}
+WORD_SUFFIXES: Set[str] = {".docx"}
 TEXT_SUFFIXES: Set[str] = {".txt", ".md", ".markdown", ".rst", ".csv", ".tsv", ".log"}
 SPREADSHEET_SUFFIXES: Set[str] = {".xls", ".xlsx"}
 PPT_SUFFIXES: Set[str] = {".ppt", ".pptx"}
@@ -56,7 +56,7 @@ class WebFetch(BaseTool):
    description: str = (
        "Fetch content from a URL. For web pages, extracts readable text. "
        "For document files (PDF, Word, TXT, Markdown, Excel, PPT), downloads and parses the file content. "
-        "Supported file types: .pdf, .doc, .docx, .txt, .md, .csv, .xls, .xlsx, .ppt, .pptx"
+        "Supported file types: .pdf, .docx, .txt, .md, .csv, .xls, .xlsx, .ppt, .pptx"
    )

    params: dict = {
@@ -226,29 +226,16 @@ class WebFetch(BaseTool):
        return "\n\n".join(text_parts)

    def _parse_word(self, file_path: str) -> str:
-        """Extract text from Word documents (.doc/.docx)."""
-        suffix = os.path.splitext(file_path)[-1].lower()
-
-        if suffix == ".docx":
-            try:
-                from docx import Document
-            except ImportError:
-                raise ImportError(
-                    "python-docx library is required for .docx parsing. Install with: pip install python-docx"
-                )
-            doc = Document(file_path)
-            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
-            return "\n\n".join(paragraphs)
-
-        # .doc format - try textract or fallback
+        """Extract text from Word documents (.docx)."""
        try:
-            import textract
-            text = textract.process(file_path).decode("utf-8")
-            return text
+            from docx import Document
        except ImportError:
            raise ImportError(
-                "textract library is required for .doc parsing. Install with: pip install textract"
+                "python-docx library is required for .docx parsing. Install with: pip install python-docx"
            )
+        doc = Document(file_path)
+        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
+        return "\n\n".join(paragraphs)

    def _parse_text(self, file_path: str) -> str:
        """Read plain text files (txt, md, csv, etc.)."""
@@ -344,7 +331,6 @@ class WebFetch(BaseTool):
        """Check if Content-Type indicates a binary/document response."""
        binary_types = [
            "application/pdf",
-            "application/msword",
            "application/vnd.openxmlformats",
            "application/vnd.ms-excel",
            "application/vnd.ms-powerpoint",
@@ -358,7 +344,6 @@ class WebFetch(BaseTool):
        ct_lower = content_type.lower()
        suffix_map = {
            "application/pdf": ".pdf",
-            "application/msword": ".doc",
            "application/vnd.openxmlformats-officedocument.wordprocessingml": ".docx",
            "application/vnd.ms-excel": ".xls",
            "application/vnd.openxmlformats-officedocument.spreadsheetml": ".xlsx",
--- a/requirements-optional.txt
+++ b/requirements-optional.txt
@@ -29,3 +29,9 @@ google-generativeai

 # tencentcloud sdk
 tencentcloud-sdk-python>=3.0.0
+
+# document parsing for the web_fetch tool (PDF, Word, Excel, PPT)
+pypdf
+python-docx
+openpyxl
+python-pptx