From a50fafaca2c800a7df90a4e2124ea1737126dad9 Mon Sep 17 00:00:00 2001 From: zhayujie Date: Mon, 9 Mar 2026 16:01:56 +0800 Subject: [PATCH] refactor: convert image vision from skill to native tool --- agent/tools/__init__.py | 11 + agent/tools/vision/__init__.py | 1 + agent/tools/vision/vision.py | 255 +++++++++++++++++++ skills/openai-image-vision/SKILL.md | 119 --------- skills/openai-image-vision/scripts/vision.sh | 243 ------------------ 5 files changed, 267 insertions(+), 362 deletions(-) create mode 100644 agent/tools/vision/__init__.py create mode 100644 agent/tools/vision/vision.py delete mode 100644 skills/openai-image-vision/SKILL.md delete mode 100755 skills/openai-image-vision/scripts/vision.sh diff --git a/agent/tools/__init__.py b/agent/tools/__init__.py index 5c2cc20..106d7a1 100644 --- a/agent/tools/__init__.py +++ b/agent/tools/__init__.py @@ -64,6 +64,15 @@ def _import_optional_tools(): except Exception as e: logger.error(f"[Tools] WebFetch failed to load: {e}") + # Vision Tool (conditionally loaded based on API key availability) + try: + from agent.tools.vision.vision import Vision + tools['Vision'] = Vision + except ImportError as e: + logger.error(f"[Tools] Vision not loaded - missing dependency: {e}") + except Exception as e: + logger.error(f"[Tools] Vision failed to load: {e}") + return tools # Load optional tools @@ -72,6 +81,7 @@ EnvConfig = _optional_tools.get('EnvConfig') SchedulerTool = _optional_tools.get('SchedulerTool') WebSearch = _optional_tools.get('WebSearch') WebFetch = _optional_tools.get('WebFetch') +Vision = _optional_tools.get('Vision') GoogleSearch = _optional_tools.get('GoogleSearch') FileSave = _optional_tools.get('FileSave') Terminal = _optional_tools.get('Terminal') @@ -113,6 +123,7 @@ __all__ = [ 'SchedulerTool', 'WebSearch', 'WebFetch', + 'Vision', # Optional tools (may be None if dependencies not available) # 'BrowserTool' ] diff --git a/agent/tools/vision/__init__.py b/agent/tools/vision/__init__.py new file mode 
"""
Vision tool - Analyze images using OpenAI-compatible Vision API.
Supports local files (auto base64-encoded) and HTTP URLs.
Providers: OpenAI (preferred) > LinkAI (fallback).
"""

import base64
import os
import re
import subprocess
import tempfile
from typing import Any, Dict, Optional, Tuple
from urllib.parse import urlsplit

import requests

from agent.tools.base_tool import BaseTool, ToolResult
from common.log import logger
from config import conf

DEFAULT_MODEL = "gpt-4.1-mini"
DEFAULT_TIMEOUT = 60  # seconds, passed to requests.post()
MAX_TOKENS = 1000
COMPRESS_THRESHOLD = 1_048_576  # 1 MB
COMPRESS_TIMEOUT = 30  # seconds allowed for one external resize command

SUPPORTED_EXTENSIONS = {
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "png": "image/png",
    "gif": "image/gif",
    "webp": "image/webp",
}

# A path segment that is an API version: v1, v2, v1beta, v1beta1, v1.1 ...
# Used with fullmatch() so words that merely start with "v" ("vendor",
# "vision.example.com") are not mistaken for a version.
_VERSION_SEGMENT = re.compile(r"v\d+(?:\.\d+)*(?:[a-z]+\d*)?", re.IGNORECASE)


class Vision(BaseTool):
    """Analyze images using OpenAI-compatible Vision API.

    ``name``, ``description`` and ``params`` (a JSON-Schema-style object)
    describe the tool and its arguments; how ``BaseTool`` consumes them is
    defined outside this module.
    """

    name: str = "vision"
    description: str = (
        "Analyze an image (local file or URL) using Vision API. "
        "Can describe content, extract text, identify objects, colors, etc. "
        "Requires OPENAI_API_KEY or LINKAI_API_KEY."
    )

    params: dict = {
        "type": "object",
        "properties": {
            "image": {
                "type": "string",
                "description": "Local file path or HTTP(S) URL of the image to analyze",
            },
            "question": {
                "type": "string",
                "description": "Question to ask about the image",
            },
            "model": {
                "type": "string",
                "description": (
                    f"Vision model to use (default: {DEFAULT_MODEL}). "
                    "Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4o"
                ),
            },
        },
        "required": ["image", "question"],
    }

    def __init__(self, config: Optional[dict] = None):
        """Store the optional tool configuration (an empty dict when omitted)."""
        self.config = config or {}

    @staticmethod
    def is_available() -> bool:
        """Return True when an OpenAI or LinkAI key is set in conf() or the environment."""
        return bool(
            conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
            or conf().get("linkai_api_key") or os.environ.get("LINKAI_API_KEY")
        )

    @staticmethod
    def _clean_arg(args: Dict[str, Any], key: str) -> str:
        """Return ``args[key]`` as a stripped string; missing or None becomes ""."""
        value = args.get(key)
        if value is None:
            return ""
        return str(value).strip()

    def execute(self, args: Dict[str, Any]) -> ToolResult:
        """Run one vision query.

        Args:
            args: Tool arguments; reads ``image`` (local path or HTTP(S) URL),
                ``question`` and the optional ``model``.

        Returns:
            The ToolResult produced by ``_call_api`` on success, otherwise a
            failed ToolResult whose message describes the problem. Exceptions
            from image preparation and from the HTTP call are converted to
            failed results rather than propagated.
        """
        # Arguments may arrive as explicit nulls or non-strings; normalise them
        # before stripping so a malformed call yields a clean error result
        # instead of an AttributeError.
        image = self._clean_arg(args, "image")
        question = self._clean_arg(args, "question")
        model = self._clean_arg(args, "model") or DEFAULT_MODEL

        if not image:
            return ToolResult.fail("Error: 'image' parameter is required")
        if not question:
            return ToolResult.fail("Error: 'question' parameter is required")

        api_key, api_base = self._resolve_provider()
        if not api_key:
            return ToolResult.fail(
                "Error: No API key configured for Vision.\n"
                "Please configure one of the following using env_config tool:\n"
                "  1. OPENAI_API_KEY (preferred): env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n"
                "  2. LINKAI_API_KEY (fallback): env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")\n\n"
                "Get your key at: https://platform.openai.com/api-keys or https://link-ai.tech"
            )

        try:
            image_content = self._build_image_content(image)
        except Exception as e:
            return ToolResult.fail(f"Error: {e}")

        try:
            return self._call_api(api_key, api_base, model, question, image_content)
        except requests.Timeout:
            return ToolResult.fail(f"Error: Vision API request timed out after {DEFAULT_TIMEOUT}s")
        except requests.ConnectionError:
            return ToolResult.fail("Error: Failed to connect to Vision API")
        except Exception as e:
            logger.error(f"[Vision] Unexpected error: {e}", exc_info=True)
            return ToolResult.fail(f"Error: Vision API call failed - {e}")

    def _resolve_provider(self) -> Tuple[Optional[str], str]:
        """Resolve API key and base URL.

        OpenAI is tried first, then LinkAI. For each setting the value from
        conf() wins over the environment variable.

        Returns:
            ``(api_key, api_base)``; ``(None, "")`` when no key is configured.
        """
        api_key = conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
        if api_key:
            api_base = (conf().get("open_ai_api_base") or os.environ.get("OPENAI_API_BASE", "")).rstrip("/") \
                or "https://api.openai.com/v1"
            return api_key, self._ensure_v1(api_base)

        api_key = conf().get("linkai_api_key") or os.environ.get("LINKAI_API_KEY")
        if api_key:
            api_base = (conf().get("linkai_api_base") or os.environ.get("LINKAI_API_BASE", "")).rstrip("/") \
                or "https://api.link-ai.tech"
            logger.debug("[Vision] Using LinkAI API (OPENAI_API_KEY not set)")
            return api_key, self._ensure_v1(api_base)

        return None, ""

    @staticmethod
    def _ensure_v1(api_base: str) -> str:
        """Append /v1 unless the URL path already ends with a version segment.

        Only the last *path* segment is inspected and it must look like a
        version (``v`` followed by digits), so a base such as
        ``https://vision.example.com`` or ``https://host/vendor`` still gets
        ``/v1`` appended. An empty ``api_base`` is returned unchanged.
        """
        if not api_base:
            return api_base
        trimmed = api_base.rstrip("/")
        # With a scheme present, look at the URL path only so the host name is
        # never treated as a version segment.
        path = urlsplit(trimmed).path if "://" in trimmed else trimmed
        last_segment = path.rsplit("/", 1)[-1]
        if _VERSION_SEGMENT.fullmatch(last_segment):
            return trimmed
        return trimmed + "/v1"

    @staticmethod
    def _sniff_mime(data: bytes) -> Optional[str]:
        """Return the MIME type indicated by the leading magic bytes, or None."""
        if data.startswith(b"\xff\xd8\xff"):
            return "image/jpeg"
        if data.startswith(b"\x89PNG\r\n\x1a\n"):
            return "image/png"
        if data[:6] in (b"GIF87a", b"GIF89a"):
            return "image/gif"
        if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
            return "image/webp"
        return None

    def _build_image_content(self, image: str) -> dict:
        """Build the image_url content block for the API request.

        HTTP(S) URLs are passed through untouched. A local file is validated,
        optionally compressed, and embedded as a base64 ``data:`` URL.

        Raises:
            FileNotFoundError: ``image`` is not an existing file.
            ValueError: the file extension is not in SUPPORTED_EXTENSIONS.
        """
        if image.startswith(("http://", "https://")):
            return {"type": "image_url", "image_url": {"url": image}}

        if not os.path.isfile(image):
            raise FileNotFoundError(f"Image file not found: {image}")

        # Take the extension from the file name only; a dot in a directory
        # name must not be mistaken for the extension separator.
        file_name = os.path.basename(image)
        ext = file_name.rsplit(".", 1)[-1].lower() if "." in file_name else ""
        mime_type = SUPPORTED_EXTENSIONS.get(ext)
        if not mime_type:
            raise ValueError(
                f"Unsupported image format '.{ext}'. "
                f"Supported: {', '.join(SUPPORTED_EXTENSIONS.keys())}"
            )

        file_path = self._maybe_compress(image)
        try:
            with open(file_path, "rb") as f:
                raw = f.read()
        finally:
            # Remove the temporary compressed copy, never the caller's file.
            if file_path != image and os.path.exists(file_path):
                os.remove(file_path)

        # Compression writes to a ".jpg" temp file and may change the encoding,
        # so trust the actual bytes over the original extension when possible.
        mime_type = self._sniff_mime(raw) or mime_type
        b64 = base64.b64encode(raw).decode("ascii")

        data_url = f"data:{mime_type};base64,{b64}"
        return {"type": "image_url", "image_url": {"url": data_url}}

    @staticmethod
    def _maybe_compress(path: str) -> str:
        """Compress image if larger than threshold; return path to use.

        Tries ``sips`` and then ImageMagick ``convert`` to resize the image to
        at most 800px. Returns the path of a temporary file on success (the
        caller is responsible for deleting it) or ``path`` itself when the
        file is small enough or no tool produced a usable result.
        """
        file_size = os.path.getsize(path)
        if file_size <= COMPRESS_THRESHOLD:
            return path

        tmp = tempfile.NamedTemporaryFile(suffix=".jpg", delete=False)
        tmp.close()

        # An absolute path can never be parsed as a command-line option by the
        # external tools, even if the original path starts with "-".
        source = os.path.abspath(path)
        commands = (
            ["sips", "-Z", "800", source, "--out", tmp.name],        # macOS
            ["convert", source, "-resize", "800x800>", tmp.name],    # ImageMagick
        )

        for command in commands:
            try:
                subprocess.run(
                    command,
                    capture_output=True, check=True, timeout=COMPRESS_TIMEOUT,
                )
            except (OSError, subprocess.SubprocessError):
                # Tool missing, failed, or timed out: try the next one.
                continue

            try:
                new_size = os.path.getsize(tmp.name)
            except OSError:
                new_size = 0
            # A zero-byte output means the tool did not actually write the
            # image there; do not send an empty payload.
            if new_size > 0:
                logger.debug(f"[Vision] Compressed image ({file_size // 1024}KB -> {new_size // 1024}KB)")
                return tmp.name

        try:
            os.remove(tmp.name)
        except OSError:
            pass  # best-effort cleanup; fall back to the original file anyway
        return path

    def _call_api(self, api_key: str, api_base: str, model: str,
                  question: str, image_content: dict) -> ToolResult:
        """POST the question and image to ``{api_base}/chat/completions``.

        Returns:
            A successful ToolResult carrying ``model``, ``content`` and token
            ``usage``, or a failed ToolResult for HTTP 401, 429, any other
            non-200 status, or an ``error`` entry in the response body.

        Raises:
            requests.RequestException: network-level failures (handled by
                ``execute``).
            ValueError: the response body is not valid JSON.
        """
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        image_content,
                    ],
                }
            ],
            "max_tokens": MAX_TOKENS,
        }

        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

        resp = requests.post(
            f"{api_base}/chat/completions",
            headers=headers,
            json=payload,
            timeout=DEFAULT_TIMEOUT,
        )

        if resp.status_code == 401:
            return ToolResult.fail("Error: Invalid API key. Please check your configuration.")
        if resp.status_code == 429:
            return ToolResult.fail("Error: API rate limit reached. Please try again later.")
        if resp.status_code != 200:
            return ToolResult.fail(f"Error: Vision API returned HTTP {resp.status_code}: {resp.text[:200]}")

        data = resp.json()
        if not isinstance(data, dict):
            return ToolResult.fail("Error: Vision API returned an unexpected response format")

        # Treat "error": null as "no error"; tolerate an error given as a
        # plain string instead of an object.
        error = data.get("error")
        if error:
            if isinstance(error, dict):
                msg = error.get("message", "Unknown API error")
            else:
                msg = str(error)
            return ToolResult.fail(f"Error: Vision API error - {msg}")

        content = ""
        choices = data.get("choices") or []
        if choices:
            message = choices[0].get("message") or {}
            content = message.get("content") or ""

        usage = data.get("usage") or {}
        result = {
            "model": model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
        return ToolResult.success(result)
**LinkAI** (fallback): `env_config(action="set", key="LINKAI_API_KEY", value="your-key")` - -Optional: Set custom API base URL: - -```bash -env_config(action="set", key="OPENAI_API_BASE", value="your-base-url") -``` - -## Usage - -**Important**: Scripts are located relative to this skill's base directory. - -When you see this skill in ``, note the `` path. - -**CRITICAL**: Always use `bash` command to execute the script: - -```bash -# General pattern (MUST start with bash): -bash "/scripts/vision.sh" "" "" [model] - -# DO NOT execute the script directly like this (WRONG): -# "/scripts/vision.sh" ... - -# Parameters: -# - image_path_or_url: Local image file path or HTTP(S) URL (required) -# - question: Question to ask about the image (required) -# - model: OpenAI model to use (default: gpt-4.1-mini) -# Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4-turbo -``` - -## Examples - -### Analyze a local image -```bash -bash "/scripts/vision.sh" "/path/to/image.jpg" "What's in this image?" -``` - -### Analyze an image from URL -```bash -bash "/scripts/vision.sh" "https://example.com/image.jpg" "Describe this image in detail" -``` - -### Use specific model -```bash -bash "/scripts/vision.sh" "/path/to/photo.png" "What colors are prominent?" "gpt-4o-mini" -``` - -### Extract text from image -```bash -bash "/scripts/vision.sh" "/path/to/document.jpg" "Extract all text from this image" -``` - -### Analyze multiple aspects -```bash -bash "/scripts/vision.sh" "image.jpg" "List all objects you can see and describe the overall scene" -``` - -## Supported Image Formats - -- JPEG (.jpg, .jpeg) -- PNG (.png) -- GIF (.gif) -- WebP (.webp) - -**Performance Optimization**: Files larger than 1MB are automatically compressed to 800px (longest side) to avoid command-line parameter limits. This happens transparently without affecting analysis quality. 
- -## Response Format - -The script returns a JSON response: - -```json -{ - "model": "gpt-4.1-mini", - "content": "The image shows...", - "usage": { - "prompt_tokens": 1234, - "completion_tokens": 567, - "total_tokens": 1801 - } -} -``` - -Or in case of error: - -```json -{ - "error": "Error description", - "details": "Additional error information" -} -``` - -## Notes - -- **Image size**: Images are automatically resized if too large -- **Timeout**: 60 seconds for API calls -- **Rate limits**: Subject to your OpenAI API plan limits -- **Privacy**: Images are sent to OpenAI's servers for processing -- **Local files**: Automatically converted to base64 for API submission -- **URLs**: Can be passed directly to the API without downloading diff --git a/skills/openai-image-vision/scripts/vision.sh b/skills/openai-image-vision/scripts/vision.sh deleted file mode 100755 index e30e5e1..0000000 --- a/skills/openai-image-vision/scripts/vision.sh +++ /dev/null @@ -1,243 +0,0 @@ -#!/usr/bin/env bash -# OpenAI Vision API wrapper -# API Docs: https://platform.openai.com/docs/guides/vision - -set -euo pipefail - -image_input="${1:-}" -question="${2:-}" -model="${3:-gpt-4.1-mini}" - -if [ -z "$image_input" ]; then - echo '{"error": "Image path or URL is required", "usage": "bash vision.sh [model]"}' - exit 1 -fi - -if [ -z "$question" ]; then - echo '{"error": "Question is required", "usage": "bash vision.sh [model]"}' - exit 1 -fi - -# Determine API key and base URL (prefer OpenAI, fallback to LinkAI) -api_key="${OPENAI_API_KEY:-}" -api_base="${OPENAI_API_BASE:-https://api.openai.com/v1}" - -if [ -z "$api_key" ] && [ -n "${LINKAI_API_KEY:-}" ]; then - api_key="$LINKAI_API_KEY" - api_base="${LINKAI_API_BASE:-https://api.link-ai.tech}/v1" - >&2 echo "[vision.sh] Using LinkAI API (OPENAI_API_KEY not set)" -fi - -if [ -z "$api_key" ]; then - echo '{"error": "No API key configured. 
Set OPENAI_API_KEY or LINKAI_API_KEY", "help": "Visit https://platform.openai.com/api-keys or https://link-ai.tech to get an API key"}' - exit 1 -fi - -# Remove trailing slash if present -api_base="${api_base%/}" - -# Determine if input is a URL or local file -if [[ "$image_input" =~ ^https?:// ]]; then - # It's a URL - use it directly - image_url="$image_input" - - # Build JSON request body with URL - request_body=$(cat < /dev/null; then - # macOS: resize to max 800px on longest side - $(command -v sips) -Z 800 "$image_input" --out "$temp_compressed" &> /dev/null - if [ $? -eq 0 ]; then - image_to_encode="$temp_compressed" - >&2 echo "[vision.sh] Compressed large image ($(($file_size / 1024))KB) to avoid parameter limit" - fi - elif command -v convert &> /dev/null; then - # Linux: use ImageMagick - convert "$image_input" -resize 800x800\> "$temp_compressed" 2>/dev/null - if [ $? -eq 0 ]; then - image_to_encode="$temp_compressed" - >&2 echo "[vision.sh] Compressed large image ($(($file_size / 1024))KB) to avoid parameter limit" - fi - fi - fi - - # Detect image format from file extension - extension="${image_to_encode##*.}" - extension_lower=$(echo "$extension" | tr '[:upper:]' '[:lower:]') - - case "$extension_lower" in - jpg|jpeg) - mime_type="image/jpeg" - ;; - png) - mime_type="image/png" - ;; - gif) - mime_type="image/gif" - ;; - webp) - mime_type="image/webp" - ;; - *) - echo "{\"error\": \"Unsupported image format\", \"extension\": \"$extension\", \"supported\": [\"jpg\", \"jpeg\", \"png\", \"gif\", \"webp\"]}" - # Clean up temp file if exists - [ -n "$temp_compressed" ] && rm -f "$temp_compressed" - exit 1 - ;; - esac - - # Encode image to base64 - if command -v base64 &> /dev/null; then - # macOS and most Linux systems - base64_cmd=$(command -v base64) - base64_image=$($base64_cmd -i "$image_to_encode" 2>/dev/null || $base64_cmd "$image_to_encode" 2>/dev/null) - else - echo '{"error": "base64 command not found", "help": "Please install base64 utility"}' - 
# Clean up temp file if exists - [ -n "$temp_compressed" ] && rm -f "$temp_compressed" - exit 1 - fi - - # Clean up temp compressed file - [ -n "$temp_compressed" ] && rm -f "$temp_compressed" - - if [ -z "$base64_image" ]; then - echo "{\"error\": \"Failed to encode image to base64\", \"path\": \"$image_input\"}" - exit 1 - fi - - # Escape question for JSON (replace " with \") - escaped_question=$(echo "$question" | sed 's/"/\\"/g') - - # Build JSON request body with base64 image - # Note: Using printf to avoid issues with special characters - request_body=$(cat <&1) - -curl_exit_code=$? - -if [ $curl_exit_code -ne 0 ]; then - echo "{\"error\": \"Failed to call OpenAI API\", \"details\": \"$response\"}" - exit 1 -fi - -# Simple JSON validation - check if response starts with { or [ -if [[ ! "$response" =~ ^[[:space:]]*[\{\[] ]]; then - echo "{\"error\": \"Invalid JSON response from API\", \"response\": \"$response\"}" - exit 1 -fi - -# Check for API error (look for "error" field in response) -if echo "$response" | grep -q '"error"[[:space:]]*:[[:space:]]*{'; then - # Extract error message if possible - error_msg=$(echo "$response" | grep -o '"message"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/"message"[[:space:]]*:[[:space:]]*"\(.*\)"/\1/' | head -1) - if [ -z "$error_msg" ]; then - error_msg="Unknown API error" - fi - echo "{\"error\": \"OpenAI API error\", \"message\": \"$error_msg\", \"response\": $response}" - exit 1 -fi - -# Extract the content from the response -# The response structure is: choices[0].message.content -content=$(echo "$response" | grep -o '"content"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/"content"[[:space:]]*:[[:space:]]*"\(.*\)"/\1/' | head -1) - -# Extract usage information -prompt_tokens=$(echo "$response" | grep -o '"prompt_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1) -completion_tokens=$(echo "$response" | grep -o '"completion_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1) 
-total_tokens=$(echo "$response" | grep -o '"total_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1) - -# Build simplified response -if [ -n "$content" ]; then - # Unescape JSON content (basic unescaping) - content=$(echo "$content" | sed 's/\\n/\n/g' | sed 's/\\"/"/g') - - cat <