refactor: convert image vision from skill to native tool

This commit is contained in:
zhayujie
2026-03-09 16:01:56 +08:00
parent 3c6781d240
commit a50fafaca2
5 changed files with 267 additions and 362 deletions

View File

@@ -64,6 +64,15 @@ def _import_optional_tools():
except Exception as e:
logger.error(f"[Tools] WebFetch failed to load: {e}")
# Vision Tool (conditionally loaded based on API key availability)
try:
from agent.tools.vision.vision import Vision
tools['Vision'] = Vision
except ImportError as e:
logger.error(f"[Tools] Vision not loaded - missing dependency: {e}")
except Exception as e:
logger.error(f"[Tools] Vision failed to load: {e}")
return tools
# Load optional tools
@@ -72,6 +81,7 @@ EnvConfig = _optional_tools.get('EnvConfig')
# Bind each optional tool at module level for direct import by consumers.
# Every name is None when the tool failed to load (missing dependency or
# load-time error) — presumably populated by _import_optional_tools above;
# callers must check for None before use.
SchedulerTool = _optional_tools.get('SchedulerTool')
WebSearch = _optional_tools.get('WebSearch')
WebFetch = _optional_tools.get('WebFetch')
Vision = _optional_tools.get('Vision')
GoogleSearch = _optional_tools.get('GoogleSearch')
FileSave = _optional_tools.get('FileSave')
Terminal = _optional_tools.get('Terminal')
@@ -113,6 +123,7 @@ __all__ = [
'SchedulerTool',
'WebSearch',
'WebFetch',
'Vision',
# Optional tools (may be None if dependencies not available)
# 'BrowserTool'
]

View File

@@ -0,0 +1 @@
from agent.tools.vision.vision import Vision

View File

@@ -0,0 +1,255 @@
"""
Vision tool - Analyze images using OpenAI-compatible Vision API.
Supports local files (auto base64-encoded) and HTTP URLs.
Providers: OpenAI (preferred) > LinkAI (fallback).
"""
import base64
import os
import subprocess
import tempfile
from typing import Any, Dict, Optional, Tuple
import requests
from agent.tools.base_tool import BaseTool, ToolResult
from common.log import logger
from config import conf
# Model used when the caller does not pass an explicit "model" argument.
DEFAULT_MODEL = "gpt-4.1-mini"
# HTTP timeout (seconds) for the Vision API request.
DEFAULT_TIMEOUT = 60
# Upper bound on tokens generated by the model per request.
MAX_TOKENS = 1000
COMPRESS_THRESHOLD = 1_048_576  # 1 MB; local files above this are compressed first
# Accepted local-file extensions mapped to their data-URL MIME types.
SUPPORTED_EXTENSIONS = {
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "png": "image/png",
    "gif": "image/gif",
    "webp": "image/webp",
}
class Vision(BaseTool):
    """Analyze images using an OpenAI-compatible Vision (chat completions) API.

    Accepts a local file path (auto base64-encoded, compressed when larger
    than COMPRESS_THRESHOLD) or an HTTP(S) URL (passed through untouched).
    Provider priority: OpenAI (preferred) > LinkAI (fallback).
    """

    name: str = "vision"
    description: str = (
        "Analyze an image (local file or URL) using Vision API. "
        "Can describe content, extract text, identify objects, colors, etc. "
        "Requires OPENAI_API_KEY or LINKAI_API_KEY."
    )
    params: dict = {
        "type": "object",
        "properties": {
            "image": {
                "type": "string",
                "description": "Local file path or HTTP(S) URL of the image to analyze",
            },
            "question": {
                "type": "string",
                "description": "Question to ask about the image",
            },
            "model": {
                "type": "string",
                "description": (
                    f"Vision model to use (default: {DEFAULT_MODEL}). "
                    "Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4o"
                ),
            },
        },
        "required": ["image", "question"],
    }

    def __init__(self, config: Optional[dict] = None):
        # config is stored for interface parity with other tools; this tool
        # does not read it yet.
        self.config = config or {}

    @staticmethod
    def is_available() -> bool:
        """Return True if an OpenAI or LinkAI API key is configured."""
        return bool(
            conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
            or conf().get("linkai_api_key") or os.environ.get("LINKAI_API_KEY")
        )

    def execute(self, args: Dict[str, Any]) -> ToolResult:
        """Validate arguments, resolve a provider, and run the vision request.

        Args:
            args: dict with 'image' (path or URL, required), 'question'
                (required), and optional 'model'.

        Returns:
            ToolResult.success with {model, content, usage} on success, or
            ToolResult.fail with an actionable error message.
        """
        # `or ""` guards against an explicit None value, which the plain
        # args.get(key, default) form would pass through and crash .strip().
        image = (args.get("image") or "").strip()
        question = (args.get("question") or "").strip()
        model = (args.get("model") or "").strip() or DEFAULT_MODEL
        if not image:
            return ToolResult.fail("Error: 'image' parameter is required")
        if not question:
            return ToolResult.fail("Error: 'question' parameter is required")
        api_key, api_base = self._resolve_provider()
        if not api_key:
            return ToolResult.fail(
                "Error: No API key configured for Vision.\n"
                "Please configure one of the following using env_config tool:\n"
                " 1. OPENAI_API_KEY (preferred): env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n"
                " 2. LINKAI_API_KEY (fallback): env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")\n\n"
                "Get your key at: https://platform.openai.com/api-keys or https://link-ai.tech"
            )
        try:
            image_content = self._build_image_content(image)
        except Exception as e:
            return ToolResult.fail(f"Error: {e}")
        try:
            return self._call_api(api_key, api_base, model, question, image_content)
        except requests.Timeout:
            return ToolResult.fail(f"Error: Vision API request timed out after {DEFAULT_TIMEOUT}s")
        except requests.ConnectionError:
            return ToolResult.fail("Error: Failed to connect to Vision API")
        except Exception as e:
            logger.error(f"[Vision] Unexpected error: {e}", exc_info=True)
            return ToolResult.fail(f"Error: Vision API call failed - {e}")

    def _resolve_provider(self) -> Tuple[Optional[str], str]:
        """Resolve (api_key, api_base). Priority: conf() > env vars; OpenAI > LinkAI."""
        api_key = conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
        if api_key:
            api_base = (conf().get("open_ai_api_base") or os.environ.get("OPENAI_API_BASE", "")).rstrip("/") \
                or "https://api.openai.com/v1"
            return api_key, self._ensure_v1(api_base)
        api_key = conf().get("linkai_api_key") or os.environ.get("LINKAI_API_KEY")
        if api_key:
            api_base = (conf().get("linkai_api_base") or os.environ.get("LINKAI_API_BASE", "")).rstrip("/") \
                or "https://api.link-ai.tech"
            logger.debug("[Vision] Using LinkAI API (OPENAI_API_KEY not set)")
            return api_key, self._ensure_v1(api_base)
        return None, ""

    @staticmethod
    def _ensure_v1(api_base: str) -> str:
        """Append /v1 unless the URL already ends with a version segment.

        A version segment is 'v' followed by a digit (v1, v2, v1beta, ...).
        Checking only startswith("v") would wrongly skip bare hosts such as
        "https://vision.example.com", leaving the path unversioned.
        """
        if not api_base:
            return api_base
        last = api_base.rstrip("/").split("/")[-1]
        if len(last) >= 2 and last[0] == "v" and last[1].isdigit():
            return api_base
        return api_base.rstrip("/") + "/v1"

    def _build_image_content(self, image: str) -> dict:
        """Build the image_url content block for the API request.

        Raises:
            FileNotFoundError: the local path does not exist.
            ValueError: the file extension is not a supported image format.
        """
        if image.startswith(("http://", "https://")):
            # URLs are passed through; the API fetches them server-side.
            return {"type": "image_url", "image_url": {"url": image}}
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Image file not found: {image}")
        ext = image.rsplit(".", 1)[-1].lower() if "." in image else ""
        mime_type = SUPPORTED_EXTENSIONS.get(ext)
        if not mime_type:
            raise ValueError(
                f"Unsupported image format '.{ext}'. "
                f"Supported: {', '.join(SUPPORTED_EXTENSIONS.keys())}"
            )
        file_path = self._maybe_compress(image)
        if file_path != image:
            # Compression re-encodes to JPEG (see _maybe_compress), so the
            # data URL must advertise the new format, not the original one.
            mime_type = "image/jpeg"
        try:
            with open(file_path, "rb") as f:
                b64 = base64.b64encode(f.read()).decode("ascii")
        finally:
            # Delete only the temporary compressed copy, never the user's file.
            if file_path != image and os.path.exists(file_path):
                os.remove(file_path)
        data_url = f"data:{mime_type};base64,{b64}"
        return {"type": "image_url", "image_url": {"url": data_url}}

    @staticmethod
    def _maybe_compress(path: str) -> str:
        """Return a path to a JPEG resized to <=800px if the file exceeds 1 MB.

        Tries sips (macOS) first, then ImageMagick convert (Linux). If no
        compressor is available, the original path is returned unchanged.
        The caller owns the returned temp file when it differs from `path`.
        """
        file_size = os.path.getsize(path)
        if file_size <= COMPRESS_THRESHOLD:
            return path
        tmp = tempfile.NamedTemporaryFile(suffix=".jpg", delete=False)
        tmp.close()
        try:
            # macOS: sips. Force JPEG output so the .jpg suffix (and the
            # data-URL MIME type used by the caller) matches the actual
            # bytes; a plain -Z resize keeps the source format.
            subprocess.run(
                ["sips", "-s", "format", "jpeg", "-Z", "800", path, "--out", tmp.name],
                capture_output=True, check=True,
            )
            logger.debug(f"[Vision] Compressed image ({file_size // 1024}KB -> {os.path.getsize(tmp.name) // 1024}KB)")
            return tmp.name
        except (FileNotFoundError, subprocess.CalledProcessError):
            pass
        try:
            # Linux: ImageMagick. '800x800>' only shrinks, never enlarges;
            # the .jpg output suffix selects JPEG encoding.
            subprocess.run(
                ["convert", path, "-resize", "800x800>", tmp.name],
                capture_output=True, check=True,
            )
            logger.debug(f"[Vision] Compressed image ({file_size // 1024}KB -> {os.path.getsize(tmp.name) // 1024}KB)")
            return tmp.name
        except (FileNotFoundError, subprocess.CalledProcessError):
            pass
        # No compressor available: clean up the unused temp file and send
        # the original bytes as-is.
        os.remove(tmp.name)
        return path

    def _call_api(self, api_key: str, api_base: str, model: str,
                  question: str, image_content: dict) -> ToolResult:
        """POST the question + image to <api_base>/chat/completions.

        Returns ToolResult.success with {model, content, usage}, or
        ToolResult.fail describing the HTTP / API-level error.
        """
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        image_content,
                    ],
                }
            ],
            "max_tokens": MAX_TOKENS,
        }
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        resp = requests.post(
            f"{api_base}/chat/completions",
            headers=headers,
            json=payload,
            timeout=DEFAULT_TIMEOUT,
        )
        # Map the common failure statuses to actionable messages.
        if resp.status_code == 401:
            return ToolResult.fail("Error: Invalid API key. Please check your configuration.")
        if resp.status_code == 429:
            return ToolResult.fail("Error: API rate limit reached. Please try again later.")
        if resp.status_code != 200:
            return ToolResult.fail(f"Error: Vision API returned HTTP {resp.status_code}: {resp.text[:200]}")
        data = resp.json()
        # Some providers return HTTP 200 with an error object in the body.
        if "error" in data:
            msg = data["error"].get("message", "Unknown API error")
            return ToolResult.fail(f"Error: Vision API error - {msg}")
        content = ""
        choices = data.get("choices", [])
        if choices:
            content = choices[0].get("message", {}).get("content", "")
        usage = data.get("usage", {})
        result = {
            "model": model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
        return ToolResult.success(result)

View File

@@ -1,119 +0,0 @@
---
name: openai-image-vision
description: Analyze images using OpenAI's Vision API. Use bash command to execute the vision script like 'bash <base_dir>/scripts/vision.sh <image> <question>'. Can understand image content, objects, text, colors, and answer questions about images.
homepage: https://platform.openai.com/docs/guides/vision
metadata:
emoji: 👁️
requires:
bins: ["curl", "base64"]
anyEnv: ["OPENAI_API_KEY", "LINKAI_API_KEY"]
---
# OpenAI Image Vision
Analyze images using OpenAI's GPT-4 Vision API. The model can understand visual elements including objects, shapes, colors, textures, and text within images.
## Setup
This skill requires at least one of the following API keys (OpenAI is preferred when both are set):
1. **OpenAI** (preferred): `env_config(action="set", key="OPENAI_API_KEY", value="your-key")`
2. **LinkAI** (fallback): `env_config(action="set", key="LINKAI_API_KEY", value="your-key")`
Optional: Set custom API base URL:
```bash
env_config(action="set", key="OPENAI_API_BASE", value="your-base-url")
```
## Usage
**Important**: Scripts are located relative to this skill's base directory.
When you see this skill in `<available_skills>`, note the `<base_dir>` path.
**CRITICAL**: Always use `bash` command to execute the script:
```bash
# General pattern (MUST start with bash):
bash "<base_dir>/scripts/vision.sh" "<image_path_or_url>" "<question>" [model]
# DO NOT execute the script directly like this (WRONG):
# "<base_dir>/scripts/vision.sh" ...
# Parameters:
# - image_path_or_url: Local image file path or HTTP(S) URL (required)
# - question: Question to ask about the image (required)
# - model: OpenAI model to use (default: gpt-4.1-mini)
# Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4-turbo
```
## Examples
### Analyze a local image
```bash
bash "<base_dir>/scripts/vision.sh" "/path/to/image.jpg" "What's in this image?"
```
### Analyze an image from URL
```bash
bash "<base_dir>/scripts/vision.sh" "https://example.com/image.jpg" "Describe this image in detail"
```
### Use specific model
```bash
bash "<base_dir>/scripts/vision.sh" "/path/to/photo.png" "What colors are prominent?" "gpt-4o-mini"
```
### Extract text from image
```bash
bash "<base_dir>/scripts/vision.sh" "/path/to/document.jpg" "Extract all text from this image"
```
### Analyze multiple aspects
```bash
bash "<base_dir>/scripts/vision.sh" "image.jpg" "List all objects you can see and describe the overall scene"
```
## Supported Image Formats
- JPEG (.jpg, .jpeg)
- PNG (.png)
- GIF (.gif)
- WebP (.webp)
**Performance Optimization**: Files larger than 1MB are automatically compressed to 800px (longest side) to avoid command-line parameter limits. This happens transparently without affecting analysis quality.
## Response Format
The script returns a JSON response:
```json
{
"model": "gpt-4.1-mini",
"content": "The image shows...",
"usage": {
"prompt_tokens": 1234,
"completion_tokens": 567,
"total_tokens": 1801
}
}
```
Or in case of error:
```json
{
"error": "Error description",
"details": "Additional error information"
}
```
## Notes
- **Image size**: Images are automatically resized if too large
- **Timeout**: 60 seconds for API calls
- **Rate limits**: Subject to your OpenAI API plan limits
- **Privacy**: Images are sent to OpenAI's servers for processing
- **Local files**: Automatically converted to base64 for API submission
- **URLs**: Can be passed directly to the API without downloading

View File

@@ -1,243 +0,0 @@
#!/usr/bin/env bash
# OpenAI Vision API wrapper
# API Docs: https://platform.openai.com/docs/guides/vision
#
# Usage: bash vision.sh <image_path_or_url> <question> [model]
# Prints JSON to stdout: {"model","content","usage"} on success, or an
# {"error", ...} object on failure. Diagnostics go to stderr.
#
# NOTE(review): with `set -e` a failing plain command (sips/convert below,
# or a failing command substitution assignment) aborts the script before
# the subsequent `$?` checks run, so those checks can only ever observe
# success — confirm whether best-effort fallback was intended there.
set -euo pipefail

image_input="${1:-}"
question="${2:-}"
model="${3:-gpt-4.1-mini}"

# Both positional arguments are required; errors are reported as JSON.
if [ -z "$image_input" ]; then
    echo '{"error": "Image path or URL is required", "usage": "bash vision.sh <image_path_or_url> <question> [model]"}'
    exit 1
fi
if [ -z "$question" ]; then
    echo '{"error": "Question is required", "usage": "bash vision.sh <image_path_or_url> <question> [model]"}'
    exit 1
fi

# Determine API key and base URL (prefer OpenAI, fallback to LinkAI)
api_key="${OPENAI_API_KEY:-}"
api_base="${OPENAI_API_BASE:-https://api.openai.com/v1}"
if [ -z "$api_key" ] && [ -n "${LINKAI_API_KEY:-}" ]; then
    api_key="$LINKAI_API_KEY"
    api_base="${LINKAI_API_BASE:-https://api.link-ai.tech}/v1"
    # stderr so stdout stays machine-readable JSON
    >&2 echo "[vision.sh] Using LinkAI API (OPENAI_API_KEY not set)"
fi
if [ -z "$api_key" ]; then
    echo '{"error": "No API key configured. Set OPENAI_API_KEY or LINKAI_API_KEY", "help": "Visit https://platform.openai.com/api-keys or https://link-ai.tech to get an API key"}'
    exit 1
fi

# Remove trailing slash if present
api_base="${api_base%/}"

# Determine if input is a URL or local file
if [[ "$image_input" =~ ^https?:// ]]; then
    # It's a URL - use it directly
    image_url="$image_input"
    # Build JSON request body with URL.
    # NOTE(review): $question is embedded unescaped here, unlike the local
    # branch which escapes double quotes — a question containing '"' breaks
    # the JSON body. Confirm and align with escaped_question below.
    request_body=$(cat <<EOF
{
  "model": "$model",
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "$question"
        },
        {
          "type": "image_url",
          "image_url": {
            "url": "$image_url"
          }
        }
      ]
    }
  ],
  "max_tokens": 1000
}
EOF
)
else
    # It's a local file - need to encode as base64
    if [ ! -f "$image_input" ]; then
        echo "{\"error\": \"Image file not found\", \"path\": \"$image_input\"}"
        exit 1
    fi

    # Check file size and compress if needed to avoid "Argument list too long" error
    # Files larger than 1MB should be compressed
    file_size=$(wc -c < "$image_input" | tr -d ' ')
    max_size=1048576 # 1MB
    image_to_encode="$image_input"
    temp_compressed=""
    if [ "$file_size" -gt "$max_size" ]; then
        # File is too large, compress it
        temp_compressed=$(mktemp "${TMPDIR:-/tmp}/vision_compressed_XXXXXX.jpg")
        # Use sips (macOS) or convert (ImageMagick) to compress
        if command -v sips &> /dev/null; then
            # macOS: resize to max 800px on longest side
            # NOTE(review): $(command -v sips) is a redundant indirection for
            # plain `sips`; and under `set -e` a sips failure exits the script,
            # so the [ $? -eq 0 ] check below is effectively unreachable-false.
            $(command -v sips) -Z 800 "$image_input" --out "$temp_compressed" &> /dev/null
            if [ $? -eq 0 ]; then
                image_to_encode="$temp_compressed"
                >&2 echo "[vision.sh] Compressed large image ($(($file_size / 1024))KB) to avoid parameter limit"
            fi
        elif command -v convert &> /dev/null; then
            # Linux: use ImageMagick
            convert "$image_input" -resize 800x800\> "$temp_compressed" 2>/dev/null
            if [ $? -eq 0 ]; then
                image_to_encode="$temp_compressed"
                >&2 echo "[vision.sh] Compressed large image ($(($file_size / 1024))KB) to avoid parameter limit"
            fi
        fi
    fi

    # Detect image format from file extension.
    # NOTE(review): after compression the temp file always ends in .jpg, so
    # mime_type is image/jpeg regardless of the source format — that matches
    # convert's output (suffix-driven), but sips -Z preserves the source
    # format; verify the emitted bytes match the advertised MIME type.
    extension="${image_to_encode##*.}"
    extension_lower=$(echo "$extension" | tr '[:upper:]' '[:lower:]')
    case "$extension_lower" in
        jpg|jpeg)
            mime_type="image/jpeg"
            ;;
        png)
            mime_type="image/png"
            ;;
        gif)
            mime_type="image/gif"
            ;;
        webp)
            mime_type="image/webp"
            ;;
        *)
            echo "{\"error\": \"Unsupported image format\", \"extension\": \"$extension\", \"supported\": [\"jpg\", \"jpeg\", \"png\", \"gif\", \"webp\"]}"
            # Clean up temp file if exists
            [ -n "$temp_compressed" ] && rm -f "$temp_compressed"
            exit 1
            ;;
    esac

    # Encode image to base64
    # NOTE(review): GNU base64 wraps output at 76 columns by default; any
    # embedded newlines survive the command substitution and would corrupt
    # the JSON body. Consider `base64 -w0` on Linux — confirm on target OS.
    if command -v base64 &> /dev/null; then
        # macOS and most Linux systems
        base64_cmd=$(command -v base64)
        base64_image=$($base64_cmd -i "$image_to_encode" 2>/dev/null || $base64_cmd "$image_to_encode" 2>/dev/null)
    else
        echo '{"error": "base64 command not found", "help": "Please install base64 utility"}'
        # Clean up temp file if exists
        [ -n "$temp_compressed" ] && rm -f "$temp_compressed"
        exit 1
    fi

    # Clean up temp compressed file
    [ -n "$temp_compressed" ] && rm -f "$temp_compressed"

    if [ -z "$base64_image" ]; then
        echo "{\"error\": \"Failed to encode image to base64\", \"path\": \"$image_input\"}"
        exit 1
    fi

    # Escape question for JSON (replace " with \")
    # NOTE(review): backslashes and control characters are not escaped;
    # a question containing a lone backslash still yields invalid JSON.
    escaped_question=$(echo "$question" | sed 's/"/\\"/g')

    # Build JSON request body with base64 image
    # Note: Using printf to avoid issues with special characters
    request_body=$(cat <<EOF
{
  "model": "$model",
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "$escaped_question"
        },
        {
          "type": "image_url",
          "image_url": {
            "url": "data:$mime_type;base64,$base64_image"
          }
        }
      ]
    }
  ],
  "max_tokens": 1000
}
EOF
)
fi

# Call OpenAI API
# NOTE(review): under `set -e`, a non-zero curl exit aborts at this
# assignment, so the curl_exit_code branch below never fires.
curl_cmd=$(command -v curl)
response=$($curl_cmd -sS --max-time 60 \
    -X POST \
    -H "Authorization: Bearer $api_key" \
    -H "Content-Type: application/json" \
    -d "$request_body" \
    "$api_base/chat/completions" 2>&1)
curl_exit_code=$?

if [ $curl_exit_code -ne 0 ]; then
    echo "{\"error\": \"Failed to call OpenAI API\", \"details\": \"$response\"}"
    exit 1
fi

# Simple JSON validation - check if response starts with { or [
if [[ ! "$response" =~ ^[[:space:]]*[\{\[] ]]; then
    echo "{\"error\": \"Invalid JSON response from API\", \"response\": \"$response\"}"
    exit 1
fi

# Check for API error (look for "error" field in response)
if echo "$response" | grep -q '"error"[[:space:]]*:[[:space:]]*{'; then
    # Extract error message if possible
    error_msg=$(echo "$response" | grep -o '"message"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/"message"[[:space:]]*:[[:space:]]*"\(.*\)"/\1/' | head -1)
    if [ -z "$error_msg" ]; then
        error_msg="Unknown API error"
    fi
    echo "{\"error\": \"OpenAI API error\", \"message\": \"$error_msg\", \"response\": $response}"
    exit 1
fi

# Extract the content from the response
# The response structure is: choices[0].message.content
# NOTE(review): grep-based extraction stops at the first '"' and cannot
# handle escaped quotes or multi-line content; jq would be robust here.
content=$(echo "$response" | grep -o '"content"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/"content"[[:space:]]*:[[:space:]]*"\(.*\)"/\1/' | head -1)

# Extract usage information
prompt_tokens=$(echo "$response" | grep -o '"prompt_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1)
completion_tokens=$(echo "$response" | grep -o '"completion_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1)
total_tokens=$(echo "$response" | grep -o '"total_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1)

# Build simplified response
if [ -n "$content" ]; then
    # Unescape JSON content (basic unescaping)
    # NOTE(review): substituting literal newlines back into "content" makes
    # the emitted JSON invalid for multi-line answers — confirm consumers
    # tolerate this before relying on the output as strict JSON.
    content=$(echo "$content" | sed 's/\\n/\n/g' | sed 's/\\"/"/g')
    cat <<EOF
{
  "model": "$model",
  "content": "$content",
  "usage": {
    "prompt_tokens": ${prompt_tokens:-0},
    "completion_tokens": ${completion_tokens:-0},
    "total_tokens": ${total_tokens:-0}
  }
}
EOF
else
    # If we can't extract content, return the full response
    echo "$response"
fi