feat: web_fetch tool supports remote file URLs

This commit is contained in:
zhayujie
2026-03-11 17:16:39 +08:00
parent 1767413712
commit 4fec55cc01
2 changed files with 321 additions and 14 deletions

View File

@@ -1,31 +1,62 @@
"""
Web Fetch tool - Fetch and extract readable content from web pages.
Web Fetch tool - Fetch and extract readable content from web pages and remote files.
Supports:
- HTML web pages: extracts readable text content
- Document files (PDF, Word, TXT, Markdown, etc.): downloads to workspace/tmp and parses content
"""
import os
import re
from typing import Dict, Any
from urllib.parse import urlparse
import uuid
from typing import Dict, Any, Optional, Set
from urllib.parse import urlparse, unquote
import requests
from agent.tools.base_tool import BaseTool, ToolResult
from agent.tools.utils.truncate import truncate_head, format_size
from common.log import logger
DEFAULT_TIMEOUT = 10
DEFAULT_TIMEOUT = 30
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
DEFAULT_HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept": "*/*",
}
# Supported document file extensions
PDF_SUFFIXES: Set[str] = {".pdf"}
WORD_SUFFIXES: Set[str] = {".doc", ".docx"}
TEXT_SUFFIXES: Set[str] = {".txt", ".md", ".markdown", ".rst", ".csv", ".tsv", ".log"}
SPREADSHEET_SUFFIXES: Set[str] = {".xls", ".xlsx"}
PPT_SUFFIXES: Set[str] = {".ppt", ".pptx"}
ALL_DOC_SUFFIXES = PDF_SUFFIXES | WORD_SUFFIXES | TEXT_SUFFIXES | SPREADSHEET_SUFFIXES | PPT_SUFFIXES
def _get_url_suffix(url: str) -> str:
"""Extract file extension from URL path, ignoring query params."""
path = urlparse(url).path
return os.path.splitext(path)[-1].lower()
def _is_document_url(url: str) -> bool:
    """Return True when the URL path ends with a supported document extension."""
    return _get_url_suffix(url) in ALL_DOC_SUFFIXES
class WebFetch(BaseTool):
"""Tool for fetching and extracting readable content from web pages"""
"""Tool for fetching web pages and remote document files"""
name: str = "web_fetch"
description: str = (
"Fetch and extract readable text content from a web page URL. "
"Fetch content from a URL. For web pages, extracts readable text. "
"For document files (PDF, Word, TXT, Markdown, Excel, PPT), downloads and parses the file content. "
"Supported file types: .pdf, .doc, .docx, .txt, .md, .csv, .xls, .xlsx, .ppt, .pptx"
)
params: dict = {
@@ -33,7 +64,7 @@ class WebFetch(BaseTool):
"properties": {
"url": {
"type": "string",
"description": "The HTTP/HTTPS URL to fetch"
"description": "The HTTP/HTTPS URL to fetch (web page or document file link)"
}
},
"required": ["url"]
@@ -41,6 +72,7 @@ class WebFetch(BaseTool):
def __init__(self, config: dict = None):
    """Store the tool configuration and resolve the workspace directory.

    Args:
        config: Optional tool configuration dict; the ``cwd`` key selects the
            workspace root (defaults to the process working directory).
    """
    cfg = config or {}
    self.config = cfg
    self.cwd = cfg.get("cwd", os.getcwd())
def execute(self, args: Dict[str, Any]) -> ToolResult:
url = args.get("url", "").strip()
@@ -51,6 +83,16 @@ class WebFetch(BaseTool):
if parsed.scheme not in ("http", "https"):
return ToolResult.fail("Error: Invalid URL (must start with http:// or https://)")
if _is_document_url(url):
return self._fetch_document(url)
return self._fetch_webpage(url)
# ---- Web page fetching ----
def _fetch_webpage(self, url: str) -> ToolResult:
"""Fetch and extract readable text from an HTML web page."""
parsed = urlparse(url)
try:
response = requests.get(
url,
@@ -68,12 +110,282 @@ class WebFetch(BaseTool):
except Exception as e:
return ToolResult.fail(f"Error: Failed to fetch URL: {e}")
content_type = response.headers.get("Content-Type", "")
if self._is_binary_content_type(content_type) and not _is_document_url(url):
return self._handle_download_by_content_type(url, response, content_type)
html = response.text
title = self._extract_title(html)
text = self._extract_text(html)
return ToolResult.success(f"Title: {title}\n\nContent:\n{text}")
# ---- Document fetching ----
def _fetch_document(self, url: str) -> ToolResult:
    """Download a document file into workspace/tmp and extract its text.

    Args:
        url: HTTP/HTTPS URL whose path ends with a supported document suffix.

    Returns:
        ToolResult carrying the (possibly truncated) extracted text plus a
        header with filename, size and saved path; or a failure result
        describing the download/parse error.
    """
    suffix = _get_url_suffix(url)
    parsed = urlparse(url)
    filename = self._extract_filename(url)
    tmp_dir = self._ensure_tmp_dir()
    local_path = os.path.join(tmp_dir, filename)
    logger.info(f"[WebFetch] Downloading document: {url} -> {local_path}")
    try:
        response = requests.get(
            url,
            headers=DEFAULT_HEADERS,
            timeout=DEFAULT_TIMEOUT,
            stream=True,
            allow_redirects=True,
        )
        response.raise_for_status()
        # A malformed Content-Length must not abort the download; the
        # streaming size check below still enforces the limit.
        try:
            content_length = int(response.headers.get("Content-Length", 0))
        except (TypeError, ValueError):
            content_length = 0
        if content_length > MAX_FILE_SIZE:
            return ToolResult.fail(
                f"Error: File too large ({format_size(content_length)} > {format_size(MAX_FILE_SIZE)})"
            )
        downloaded = 0
        with open(local_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                downloaded += len(chunk)
                if downloaded > MAX_FILE_SIZE:
                    # Close before removing so the delete also works on Windows.
                    f.close()
                    os.remove(local_path)
                    return ToolResult.fail(
                        f"Error: File too large (>{format_size(MAX_FILE_SIZE)}), download aborted"
                    )
                f.write(chunk)
    except requests.Timeout:
        # Timeouts can also fire mid-stream: drop any partially written file.
        self._cleanup_file(local_path)
        return ToolResult.fail(f"Error: Download timed out after {DEFAULT_TIMEOUT}s")
    except requests.ConnectionError:
        self._cleanup_file(local_path)
        return ToolResult.fail(f"Error: Failed to connect to {parsed.netloc}")
    except requests.HTTPError as e:
        return ToolResult.fail(f"Error: HTTP {e.response.status_code} for URL: {url}")
    except Exception as e:
        self._cleanup_file(local_path)
        return ToolResult.fail(f"Error: Failed to download file: {e}")
    try:
        text = self._parse_document(local_path, suffix)
    except Exception as e:
        self._cleanup_file(local_path)
        return ToolResult.fail(f"Error: Failed to parse document: {e}")
    file_size = os.path.getsize(local_path)
    if not text or not text.strip():
        # Keep the file on disk so the caller can still inspect it manually.
        return ToolResult.success(
            f"File downloaded to: {local_path} ({format_size(file_size)})\n"
            f"No text content could be extracted. The file may contain only images or be encrypted."
        )
    truncation = truncate_head(text)
    # Report the actual downloaded filename in the result header.
    header = f"[Document: {filename} | Size: {format_size(file_size)} | Saved to: {local_path}]\n\n"
    if truncation.truncated:
        header += f"[Content truncated: showing {truncation.output_lines} of {truncation.total_lines} lines]\n\n"
    return ToolResult.success(header + truncation.content)
def _parse_document(self, file_path: str, suffix: str) -> str:
    """Dispatch to the parser matching *suffix* and return the extracted text.

    Unrecognized suffixes fall back to plain-text reading.
    """
    for suffixes, parser in (
        (PDF_SUFFIXES, self._parse_pdf),
        (WORD_SUFFIXES, self._parse_word),
        (TEXT_SUFFIXES, self._parse_text),
        (SPREADSHEET_SUFFIXES, self._parse_spreadsheet),
        (PPT_SUFFIXES, self._parse_ppt),
    ):
        if suffix in suffixes:
            return parser(file_path)
    # Fallback: treat anything unknown as plain text.
    return self._parse_text(file_path)
def _parse_pdf(self, file_path: str) -> str:
    """Extract text from a PDF, one labelled section per non-empty page."""
    try:
        from pypdf import PdfReader
    except ImportError:
        raise ImportError("pypdf library is required for PDF parsing. Install with: pip install pypdf")
    reader = PdfReader(file_path)
    total = len(reader.pages)
    sections = []
    for number, page in enumerate(reader.pages, 1):
        extracted = page.extract_text()
        if extracted and extracted.strip():
            sections.append(f"--- Page {number}/{total} ---\n{extracted}")
    return "\n\n".join(sections)
def _parse_word(self, file_path: str) -> str:
    """Extract text from Word documents (.docx via python-docx, legacy .doc via textract)."""
    extension = os.path.splitext(file_path)[-1].lower()
    if extension == ".docx":
        try:
            from docx import Document
        except ImportError:
            raise ImportError(
                "python-docx library is required for .docx parsing. Install with: pip install python-docx"
            )
        non_empty = [para.text for para in Document(file_path).paragraphs if para.text.strip()]
        return "\n\n".join(non_empty)
    # Legacy .doc format: delegate entirely to textract.
    try:
        import textract
        return textract.process(file_path).decode("utf-8")
    except ImportError:
        raise ImportError(
            "textract library is required for .doc parsing. Install with: pip install textract"
        )
def _parse_text(self, file_path: str) -> str:
    """Read a plain-text file, trying a list of common encodings in order."""
    encodings = ["utf-8", "utf-8-sig", "gbk", "gb2312", "latin-1"]
    for candidate in encodings:
        try:
            with open(file_path, "r", encoding=candidate) as handle:
                return handle.read()
        except (UnicodeDecodeError, UnicodeError):
            # Wrong guess -- try the next encoding.
            continue
    raise ValueError(f"Unable to decode file with any supported encoding: {encodings}")
def _parse_spreadsheet(self, file_path: str) -> str:
    """Extract cell text from Excel workbooks, one labelled section per non-empty sheet."""
    try:
        import openpyxl
    except ImportError:
        raise ImportError(
            "openpyxl library is required for .xlsx parsing. Install with: pip install openpyxl"
        )
    workbook = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
    sections = []
    for name in workbook.sheetnames:
        sheet = workbook[name]
        lines = []
        for row in sheet.iter_rows(values_only=True):
            values = ["" if cell is None else str(cell) for cell in row]
            if any(values):
                lines.append(" | ".join(values))
        if lines:
            sections.append(f"--- Sheet: {name} ---\n" + "\n".join(lines))
    workbook.close()
    return "\n\n".join(sections)
def _parse_ppt(self, file_path: str) -> str:
    """Extract text from PowerPoint files, one labelled section per non-empty slide."""
    try:
        from pptx import Presentation
    except ImportError:
        raise ImportError(
            "python-pptx library is required for .pptx parsing. Install with: pip install python-pptx"
        )
    presentation = Presentation(file_path)
    slide_total = len(presentation.slides)
    sections = []
    for number, slide in enumerate(presentation.slides, 1):
        fragments = []
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for para in shape.text_frame.paragraphs:
                stripped = para.text.strip()
                if stripped:
                    fragments.append(stripped)
        if fragments:
            sections.append(f"--- Slide {number}/{slide_total} ---\n" + "\n".join(fragments))
    return "\n\n".join(sections)
# ---- Helper methods ----
def _ensure_tmp_dir(self) -> str:
    """Create the workspace ``tmp`` directory if missing and return its path."""
    directory = os.path.join(self.cwd, "tmp")
    os.makedirs(directory, exist_ok=True)
    return directory
def _extract_filename(self, url: str) -> str:
    """Derive a collision-safe local filename from a URL.

    The basename of the URL path is percent-decoded, sanitized to word
    characters, dots and dashes, and prefixed with a short random hex id so
    repeated downloads never overwrite each other.

    Args:
        url: The source URL.

    Returns:
        A filename of the form ``<8-hex-chars>_<sanitized-basename>``.
    """
    path = urlparse(url).path
    basename = os.path.basename(unquote(path))
    # os.path.basename never returns "/" (basename("/") == ""), so an empty
    # string is the only case that needs the fallback name.
    if not basename:
        basename = "downloaded_file"
    # Sanitize: keep only safe chars
    basename = re.sub(r'[^\w.\-]', '_', basename)
    short_id = uuid.uuid4().hex[:8]
    return f"{short_id}_{basename}"
@staticmethod
def _cleanup_file(path: str):
    """Best-effort removal of *path*; missing files and OS errors are ignored."""
    if not os.path.exists(path):
        return
    try:
        os.remove(path)
    except Exception:
        # Deliberately silent: cleanup must never mask the original error.
        pass
@staticmethod
def _is_binary_content_type(content_type: str) -> bool:
    """Return True when the Content-Type header indicates a binary/document payload."""
    lowered = content_type.lower()
    for marker in (
        "application/pdf",
        "application/msword",
        "application/vnd.openxmlformats",
        "application/vnd.ms-excel",
        "application/vnd.ms-powerpoint",
        "application/octet-stream",
    ):
        if marker in lowered:
            return True
    return False
def _handle_download_by_content_type(self, url: str, response: requests.Response, content_type: str) -> ToolResult:
    """Handle a URL that returned binary content instead of HTML.

    When the Content-Type maps to a supported document suffix, the URL is
    re-fetched through the document pipeline (its path suffix is rewritten
    first if suffix detection would otherwise fail); anything else fails.
    """
    lowered = content_type.lower()
    suffix_by_type = {
        "application/pdf": ".pdf",
        "application/msword": ".doc",
        "application/vnd.openxmlformats-officedocument.wordprocessingml": ".docx",
        "application/vnd.ms-excel": ".xls",
        "application/vnd.openxmlformats-officedocument.spreadsheetml": ".xlsx",
        "application/vnd.ms-powerpoint": ".ppt",
        "application/vnd.openxmlformats-officedocument.presentationml": ".pptx",
    }
    detected = next(
        (ext for prefix, ext in suffix_by_type.items() if prefix in lowered),
        None,
    )
    if detected and detected in ALL_DOC_SUFFIXES:
        # Re-fetch as document, keeping the URL unchanged when its own
        # suffix is already recognizable.
        if _get_url_suffix(url) in ALL_DOC_SUFFIXES:
            target = url
        else:
            target = self._rewrite_url_with_suffix(url, detected)
        return self._fetch_document(target)
    return ToolResult.fail(f"Error: URL returned binary content ({content_type}), not a supported document type")
@staticmethod
def _rewrite_url_with_suffix(url: str, suffix: str) -> str:
    """Return *url* with *suffix* appended to its path so suffix detection works."""
    parts = urlparse(url)
    amended_path = parts.path.rstrip("/") + suffix
    return parts._replace(path=amended_path).geturl()
# ---- HTML extraction (unchanged) ----
@staticmethod
def _extract_title(html: str) -> str:
match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
@@ -81,18 +393,13 @@ class WebFetch(BaseTool):
@staticmethod
def _extract_text(html: str) -> str:
    """Strip an HTML document down to readable plain text.

    Removes script/style blocks and all tags, decodes a handful of common
    HTML entities, collapses excess whitespace, and trims each line.
    """
    flags = re.IGNORECASE | re.DOTALL
    stripped = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=flags)
    stripped = re.sub(r"<style[^>]*>.*?</style>", "", stripped, flags=flags)
    stripped = re.sub(r"<[^>]+>", "", stripped)
    # Decode common HTML entities (fixed replacement order).
    for entity, char in (
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", '"'),
        ("&#39;", "'"),
        ("&nbsp;", " "),
    ):
        stripped = stripped.replace(entity, char)
    # Collapse runs of spaces/tabs and cap blank lines at one.
    stripped = re.sub(r"[^\S\n]+", " ", stripped)
    stripped = re.sub(r"\n{3,}", "\n\n", stripped)
    trimmed = [line.strip() for line in stripped.splitlines()]
    return "\n".join(trimmed).strip()

View File

@@ -366,7 +366,7 @@ class AgentInitializer:
if tool:
# Apply workspace config to file operation tools
if tool_name in ['read', 'write', 'edit', 'bash', 'grep', 'find', 'ls']:
if tool_name in ['read', 'write', 'edit', 'bash', 'grep', 'find', 'ls', 'web_fetch']:
tool.config = file_config
tool.cwd = file_config.get("cwd", getattr(tool, 'cwd', None))
if 'memory_manager' in file_config: