mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-03-12 18:01:30 +08:00
refactor: convert image vision from skill to native tool
This commit is contained in:
@@ -64,6 +64,15 @@ def _import_optional_tools():
|
||||
except Exception as e:
|
||||
logger.error(f"[Tools] WebFetch failed to load: {e}")
|
||||
|
||||
# Vision Tool (conditionally loaded based on API key availability)
|
||||
try:
|
||||
from agent.tools.vision.vision import Vision
|
||||
tools['Vision'] = Vision
|
||||
except ImportError as e:
|
||||
logger.error(f"[Tools] Vision not loaded - missing dependency: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"[Tools] Vision failed to load: {e}")
|
||||
|
||||
return tools
|
||||
|
||||
# Load optional tools
|
||||
@@ -72,6 +81,7 @@ EnvConfig = _optional_tools.get('EnvConfig')
|
||||
SchedulerTool = _optional_tools.get('SchedulerTool')
|
||||
WebSearch = _optional_tools.get('WebSearch')
|
||||
WebFetch = _optional_tools.get('WebFetch')
|
||||
Vision = _optional_tools.get('Vision')
|
||||
GoogleSearch = _optional_tools.get('GoogleSearch')
|
||||
FileSave = _optional_tools.get('FileSave')
|
||||
Terminal = _optional_tools.get('Terminal')
|
||||
@@ -113,6 +123,7 @@ __all__ = [
|
||||
'SchedulerTool',
|
||||
'WebSearch',
|
||||
'WebFetch',
|
||||
'Vision',
|
||||
# Optional tools (may be None if dependencies not available)
|
||||
# 'BrowserTool'
|
||||
]
|
||||
|
||||
1
agent/tools/vision/__init__.py
Normal file
1
agent/tools/vision/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from agent.tools.vision.vision import Vision
|
||||
255
agent/tools/vision/vision.py
Normal file
255
agent/tools/vision/vision.py
Normal file
@@ -0,0 +1,255 @@
|
||||
"""
|
||||
Vision tool - Analyze images using OpenAI-compatible Vision API.
|
||||
Supports local files (auto base64-encoded) and HTTP URLs.
|
||||
Providers: OpenAI (preferred) > LinkAI (fallback).
|
||||
"""
|
||||
|
||||
import base64
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
|
||||
import requests
|
||||
|
||||
from agent.tools.base_tool import BaseTool, ToolResult
|
||||
from common.log import logger
|
||||
from config import conf
|
||||
|
||||
DEFAULT_MODEL = "gpt-4.1-mini"
|
||||
DEFAULT_TIMEOUT = 60
|
||||
MAX_TOKENS = 1000
|
||||
COMPRESS_THRESHOLD = 1_048_576 # 1 MB
|
||||
|
||||
SUPPORTED_EXTENSIONS = {
|
||||
"jpg": "image/jpeg",
|
||||
"jpeg": "image/jpeg",
|
||||
"png": "image/png",
|
||||
"gif": "image/gif",
|
||||
"webp": "image/webp",
|
||||
}
|
||||
|
||||
|
||||
class Vision(BaseTool):
    """Analyze images using an OpenAI-compatible Vision (chat completions) API.

    Accepts a local file path (auto base64-encoded; files over
    ``COMPRESS_THRESHOLD`` are downscaled first) or an HTTP(S) URL, which is
    passed through to the API untouched. Provider priority: OpenAI first,
    LinkAI as fallback (keys read from conf() or environment variables).
    """

    name: str = "vision"
    description: str = (
        "Analyze an image (local file or URL) using Vision API. "
        "Can describe content, extract text, identify objects, colors, etc. "
        "Requires OPENAI_API_KEY or LINKAI_API_KEY."
    )

    # JSON-schema description of the tool arguments (function-calling style).
    params: dict = {
        "type": "object",
        "properties": {
            "image": {
                "type": "string",
                "description": "Local file path or HTTP(S) URL of the image to analyze",
            },
            "question": {
                "type": "string",
                "description": "Question to ask about the image",
            },
            "model": {
                "type": "string",
                "description": (
                    f"Vision model to use (default: {DEFAULT_MODEL}). "
                    "Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4o"
                ),
            },
        },
        "required": ["image", "question"],
    }

    def __init__(self, config: dict = None):
        # Tool-specific configuration dict; kept for signature parity with
        # the other tools even though nothing reads it yet.
        self.config = config or {}

    @staticmethod
    def is_available() -> bool:
        """Return True if any supported API key is configured (conf or env)."""
        return bool(
            conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
            or conf().get("linkai_api_key") or os.environ.get("LINKAI_API_KEY")
        )

    def execute(self, args: Dict[str, Any]) -> ToolResult:
        """Validate arguments, resolve a provider and run the vision request.

        Args:
            args: Tool arguments — "image" (path or URL, required),
                "question" (required), optional "model".

        Returns:
            ToolResult carrying the model answer and token usage on success,
            or a failure result with a human-readable "Error: ..." message.
        """
        # Coerce defensively: the model may pass null or non-string values,
        # which would otherwise crash on .strip().
        image = str(args.get("image") or "").strip()
        question = str(args.get("question") or "").strip()
        model = str(args.get("model") or "").strip() or DEFAULT_MODEL

        if not image:
            return ToolResult.fail("Error: 'image' parameter is required")
        if not question:
            return ToolResult.fail("Error: 'question' parameter is required")

        api_key, api_base = self._resolve_provider()
        if not api_key:
            return ToolResult.fail(
                "Error: No API key configured for Vision.\n"
                "Please configure one of the following using env_config tool:\n"
                " 1. OPENAI_API_KEY (preferred): env_config(action=\"set\", key=\"OPENAI_API_KEY\", value=\"your-key\")\n"
                " 2. LINKAI_API_KEY (fallback): env_config(action=\"set\", key=\"LINKAI_API_KEY\", value=\"your-key\")\n\n"
                "Get your key at: https://platform.openai.com/api-keys or https://link-ai.tech"
            )

        # File lookup / unsupported-format problems are reported verbatim.
        try:
            image_content = self._build_image_content(image)
        except Exception as e:
            return ToolResult.fail(f"Error: {e}")

        try:
            return self._call_api(api_key, api_base, model, question, image_content)
        except requests.Timeout:
            return ToolResult.fail(f"Error: Vision API request timed out after {DEFAULT_TIMEOUT}s")
        except requests.ConnectionError:
            return ToolResult.fail("Error: Failed to connect to Vision API")
        except Exception as e:
            logger.error(f"[Vision] Unexpected error: {e}", exc_info=True)
            return ToolResult.fail(f"Error: Vision API call failed - {e}")

    def _resolve_provider(self) -> Tuple[Optional[str], str]:
        """Resolve API key and base URL. Priority: conf() > env vars.

        Returns:
            (api_key, api_base) — api_key is None when nothing is configured.
        """
        api_key = conf().get("open_ai_api_key") or os.environ.get("OPENAI_API_KEY")
        if api_key:
            api_base = (conf().get("open_ai_api_base") or os.environ.get("OPENAI_API_BASE", "")).rstrip("/") \
                or "https://api.openai.com/v1"
            return api_key, self._ensure_v1(api_base)

        api_key = conf().get("linkai_api_key") or os.environ.get("LINKAI_API_KEY")
        if api_key:
            api_base = (conf().get("linkai_api_base") or os.environ.get("LINKAI_API_BASE", "")).rstrip("/") \
                or "https://api.link-ai.tech"
            logger.debug("[Vision] Using LinkAI API (OPENAI_API_KEY not set)")
            return api_key, self._ensure_v1(api_base)

        return None, ""

    @staticmethod
    def _ensure_v1(api_base: str) -> str:
        """Append /v1 if the base URL doesn't already end with a versioned path."""
        if not api_base:
            return api_base
        # Only a trailing segment of the form "v<digit>..." (v1, v2, v1beta)
        # counts as a version suffix; a bare startswith("v") check would
        # wrongly skip paths like ".../vision".
        last = api_base.rstrip("/").split("/")[-1]
        if len(last) >= 2 and last[0] == "v" and last[1].isdigit():
            return api_base
        return api_base.rstrip("/") + "/v1"

    def _build_image_content(self, image: str) -> dict:
        """Build the image_url content block for the API request.

        Raises:
            FileNotFoundError: local path does not exist.
            ValueError: local file has an unsupported extension.
        """
        if image.startswith(("http://", "https://")):
            # URLs are forwarded as-is; the API fetches them itself.
            return {"type": "image_url", "image_url": {"url": image}}

        if not os.path.isfile(image):
            raise FileNotFoundError(f"Image file not found: {image}")

        ext = image.rsplit(".", 1)[-1].lower() if "." in image else ""
        mime_type = SUPPORTED_EXTENSIONS.get(ext)
        if not mime_type:
            raise ValueError(
                f"Unsupported image format '.{ext}'. "
                f"Supported: {', '.join(SUPPORTED_EXTENSIONS.keys())}"
            )

        file_path = self._maybe_compress(image)
        compressed = file_path != image
        try:
            with open(file_path, "rb") as f:
                b64 = base64.b64encode(f.read()).decode("ascii")
        finally:
            if compressed and os.path.exists(file_path):
                os.remove(file_path)

        # Compression always yields a JPEG temp file, so the data URL must
        # advertise image/jpeg regardless of the source extension.
        if compressed:
            mime_type = "image/jpeg"

        data_url = f"data:{mime_type};base64,{b64}"
        return {"type": "image_url", "image_url": {"url": data_url}}

    @staticmethod
    def _maybe_compress(path: str) -> str:
        """Compress image if larger than threshold; return path to use.

        The compressed output is always a JPEG temp file (sips is forced to
        JPEG; ImageMagick infers JPEG from the .jpg extension). The caller is
        responsible for deleting the temp file when it differs from `path`.
        """
        file_size = os.path.getsize(path)
        if file_size <= COMPRESS_THRESHOLD:
            return path

        tmp = tempfile.NamedTemporaryFile(suffix=".jpg", delete=False)
        tmp.close()

        # Try platform tools in order; each failure falls through to the next.
        candidates = (
            # macOS: sips — force JPEG so the bytes match the .jpg suffix
            ["sips", "-s", "format", "jpeg", "-Z", "800", path, "--out", tmp.name],
            # Linux: ImageMagick — output format follows the .jpg extension
            ["convert", path, "-resize", "800x800>", tmp.name],
        )
        for cmd in candidates:
            try:
                subprocess.run(cmd, capture_output=True, check=True)
            except (FileNotFoundError, subprocess.CalledProcessError):
                continue
            logger.debug(f"[Vision] Compressed image ({file_size // 1024}KB -> {os.path.getsize(tmp.name) // 1024}KB)")
            return tmp.name

        # No tool available/successful: discard the temp file, send original.
        os.remove(tmp.name)
        return path

    def _call_api(self, api_key: str, api_base: str, model: str,
                  question: str, image_content: dict) -> ToolResult:
        """POST the chat-completions request and normalize the response."""
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        image_content,
                    ],
                }
            ],
            "max_tokens": MAX_TOKENS,
        }

        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

        resp = requests.post(
            f"{api_base}/chat/completions",
            headers=headers,
            json=payload,
            timeout=DEFAULT_TIMEOUT,
        )

        if resp.status_code == 401:
            return ToolResult.fail("Error: Invalid API key. Please check your configuration.")
        if resp.status_code == 429:
            return ToolResult.fail("Error: API rate limit reached. Please try again later.")
        if resp.status_code != 200:
            return ToolResult.fail(f"Error: Vision API returned HTTP {resp.status_code}: {resp.text[:200]}")

        # A 200 with a non-JSON body (proxy error pages etc.) would otherwise
        # surface as an opaque ValueError from resp.json().
        try:
            data = resp.json()
        except ValueError:
            return ToolResult.fail(f"Error: Vision API returned non-JSON response: {resp.text[:200]}")

        if "error" in data:
            msg = data["error"].get("message", "Unknown API error")
            return ToolResult.fail(f"Error: Vision API error - {msg}")

        content = ""
        choices = data.get("choices", [])
        if choices:
            content = choices[0].get("message", {}).get("content", "")

        usage = data.get("usage", {})
        result = {
            "model": model,
            "content": content,
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
        return ToolResult.success(result)
|
||||
@@ -1,119 +0,0 @@
|
||||
---
|
||||
name: openai-image-vision
|
||||
description: Analyze images using OpenAI's Vision API. Use bash command to execute the vision script like 'bash <base_dir>/scripts/vision.sh <image> <question>'. Can understand image content, objects, text, colors, and answer questions about images.
|
||||
homepage: https://platform.openai.com/docs/guides/vision
|
||||
metadata:
|
||||
emoji: 👁️
|
||||
requires:
|
||||
bins: ["curl", "base64"]
|
||||
anyEnv: ["OPENAI_API_KEY", "LINKAI_API_KEY"]
|
||||
---
|
||||
|
||||
# OpenAI Image Vision
|
||||
|
||||
Analyze images using OpenAI's GPT-4 Vision API. The model can understand visual elements including objects, shapes, colors, textures, and text within images.
|
||||
|
||||
## Setup
|
||||
|
||||
This skill requires at least one of the following API keys (OpenAI is preferred when both are set):
|
||||
|
||||
1. **OpenAI** (preferred): `env_config(action="set", key="OPENAI_API_KEY", value="your-key")`
|
||||
2. **LinkAI** (fallback): `env_config(action="set", key="LINKAI_API_KEY", value="your-key")`
|
||||
|
||||
Optional: Set custom API base URL:
|
||||
|
||||
```bash
|
||||
env_config(action="set", key="OPENAI_API_BASE", value="your-base-url")
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
**Important**: Scripts are located relative to this skill's base directory.
|
||||
|
||||
When you see this skill in `<available_skills>`, note the `<base_dir>` path.
|
||||
|
||||
**CRITICAL**: Always use `bash` command to execute the script:
|
||||
|
||||
```bash
|
||||
# General pattern (MUST start with bash):
|
||||
bash "<base_dir>/scripts/vision.sh" "<image_path_or_url>" "<question>" [model]
|
||||
|
||||
# DO NOT execute the script directly like this (WRONG):
|
||||
# "<base_dir>/scripts/vision.sh" ...
|
||||
|
||||
# Parameters:
|
||||
# - image_path_or_url: Local image file path or HTTP(S) URL (required)
|
||||
# - question: Question to ask about the image (required)
|
||||
# - model: OpenAI model to use (default: gpt-4.1-mini)
|
||||
# Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4-turbo
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Analyze a local image
|
||||
```bash
|
||||
bash "<base_dir>/scripts/vision.sh" "/path/to/image.jpg" "What's in this image?"
|
||||
```
|
||||
|
||||
### Analyze an image from URL
|
||||
```bash
|
||||
bash "<base_dir>/scripts/vision.sh" "https://example.com/image.jpg" "Describe this image in detail"
|
||||
```
|
||||
|
||||
### Use specific model
|
||||
```bash
|
||||
bash "<base_dir>/scripts/vision.sh" "/path/to/photo.png" "What colors are prominent?" "gpt-4o-mini"
|
||||
```
|
||||
|
||||
### Extract text from image
|
||||
```bash
|
||||
bash "<base_dir>/scripts/vision.sh" "/path/to/document.jpg" "Extract all text from this image"
|
||||
```
|
||||
|
||||
### Analyze multiple aspects
|
||||
```bash
|
||||
bash "<base_dir>/scripts/vision.sh" "image.jpg" "List all objects you can see and describe the overall scene"
|
||||
```
|
||||
|
||||
## Supported Image Formats
|
||||
|
||||
- JPEG (.jpg, .jpeg)
|
||||
- PNG (.png)
|
||||
- GIF (.gif)
|
||||
- WebP (.webp)
|
||||
|
||||
**Performance Optimization**: Files larger than 1MB are automatically compressed to 800px (longest side) to avoid command-line parameter limits. This happens transparently and has minimal impact on analysis quality for typical images.
|
||||
|
||||
## Response Format
|
||||
|
||||
The script returns a JSON response:
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "gpt-4.1-mini",
|
||||
"content": "The image shows...",
|
||||
"usage": {
|
||||
"prompt_tokens": 1234,
|
||||
"completion_tokens": 567,
|
||||
"total_tokens": 1801
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Or in case of error:
|
||||
|
||||
```json
|
||||
{
|
||||
"error": "Error description",
|
||||
"details": "Additional error information"
|
||||
}
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- **Image size**: Images are automatically resized if too large
|
||||
- **Timeout**: 60 seconds for API calls
|
||||
- **Rate limits**: Subject to your OpenAI API plan limits
|
||||
- **Privacy**: Images are sent to OpenAI's servers for processing
|
||||
- **Local files**: Automatically converted to base64 for API submission
|
||||
- **URLs**: Can be passed directly to the API without downloading
|
||||
@@ -1,243 +0,0 @@
|
||||
#!/usr/bin/env bash
# OpenAI Vision API wrapper
# API Docs: https://platform.openai.com/docs/guides/vision
#
# Usage:  bash vision.sh <image_path_or_url> <question> [model]
# Output: JSON on stdout — either {"model","content","usage"} on success
#         or {"error", ...} on failure. Progress/diagnostics go to stderr.

set -euo pipefail

# Positional arguments; model defaults to gpt-4.1-mini.
image_input="${1:-}"
question="${2:-}"
model="${3:-gpt-4.1-mini}"

if [ -z "$image_input" ]; then
    echo '{"error": "Image path or URL is required", "usage": "bash vision.sh <image_path_or_url> <question> [model]"}'
    exit 1
fi

if [ -z "$question" ]; then
    echo '{"error": "Question is required", "usage": "bash vision.sh <image_path_or_url> <question> [model]"}'
    exit 1
fi

# Determine API key and base URL (prefer OpenAI, fallback to LinkAI)
api_key="${OPENAI_API_KEY:-}"
api_base="${OPENAI_API_BASE:-https://api.openai.com/v1}"

if [ -z "$api_key" ] && [ -n "${LINKAI_API_KEY:-}" ]; then
    api_key="$LINKAI_API_KEY"
    api_base="${LINKAI_API_BASE:-https://api.link-ai.tech}/v1"
    >&2 echo "[vision.sh] Using LinkAI API (OPENAI_API_KEY not set)"
fi

if [ -z "$api_key" ]; then
    echo '{"error": "No API key configured. Set OPENAI_API_KEY or LINKAI_API_KEY", "help": "Visit https://platform.openai.com/api-keys or https://link-ai.tech to get an API key"}'
    exit 1
fi

# Remove trailing slash if present
api_base="${api_base%/}"

# Determine if input is a URL or local file
if [[ "$image_input" =~ ^https?:// ]]; then
    # It's a URL - use it directly
    image_url="$image_input"

    # Build JSON request body with URL
    # NOTE(review): unlike the local-file branch below, "$question" is NOT
    # escaped here — a double quote in the question would break the JSON.
    # Confirm whether callers guarantee quote-free questions.
    request_body=$(cat <<EOF
{
  "model": "$model",
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "$question"
        },
        {
          "type": "image_url",
          "image_url": {
            "url": "$image_url"
          }
        }
      ]
    }
  ],
  "max_tokens": 1000
}
EOF
)
else
    # It's a local file - need to encode as base64
    if [ ! -f "$image_input" ]; then
        echo "{\"error\": \"Image file not found\", \"path\": \"$image_input\"}"
        exit 1
    fi

    # Check file size and compress if needed to avoid "Argument list too long" error
    # Files larger than 1MB should be compressed
    file_size=$(wc -c < "$image_input" | tr -d ' ')
    max_size=1048576 # 1MB

    image_to_encode="$image_input"
    temp_compressed=""

    if [ "$file_size" -gt "$max_size" ]; then
        # File is too large, compress it
        temp_compressed=$(mktemp "${TMPDIR:-/tmp}/vision_compressed_XXXXXX.jpg")

        # Use sips (macOS) or convert (ImageMagick) to compress
        # NOTE(review): with `set -e` active, a failing sips/convert command
        # aborts the whole script before the `$?` checks below run, so the
        # "keep original on failure" fallback looks unreachable — confirm.
        if command -v sips &> /dev/null; then
            # macOS: resize to max 800px on longest side
            $(command -v sips) -Z 800 "$image_input" --out "$temp_compressed" &> /dev/null
            if [ $? -eq 0 ]; then
                image_to_encode="$temp_compressed"
                >&2 echo "[vision.sh] Compressed large image ($(($file_size / 1024))KB) to avoid parameter limit"
            fi
        elif command -v convert &> /dev/null; then
            # Linux: use ImageMagick
            convert "$image_input" -resize 800x800\> "$temp_compressed" 2>/dev/null
            if [ $? -eq 0 ]; then
                image_to_encode="$temp_compressed"
                >&2 echo "[vision.sh] Compressed large image ($(($file_size / 1024))KB) to avoid parameter limit"
            fi
        fi
    fi

    # Detect image format from file extension
    # NOTE(review): when compression ran, the extension (and thus mime type)
    # comes from the .jpg temp file even if sips preserved the original
    # format's bytes — verify the mime/data pairing is acceptable upstream.
    extension="${image_to_encode##*.}"
    extension_lower=$(echo "$extension" | tr '[:upper:]' '[:lower:]')

    case "$extension_lower" in
        jpg|jpeg)
            mime_type="image/jpeg"
            ;;
        png)
            mime_type="image/png"
            ;;
        gif)
            mime_type="image/gif"
            ;;
        webp)
            mime_type="image/webp"
            ;;
        *)
            echo "{\"error\": \"Unsupported image format\", \"extension\": \"$extension\", \"supported\": [\"jpg\", \"jpeg\", \"png\", \"gif\", \"webp\"]}"
            # Clean up temp file if exists
            [ -n "$temp_compressed" ] && rm -f "$temp_compressed"
            exit 1
            ;;
    esac

    # Encode image to base64
    if command -v base64 &> /dev/null; then
        # macOS and most Linux systems
        # (tries the BSD `-i <file>` form first, then the GNU positional form)
        base64_cmd=$(command -v base64)
        base64_image=$($base64_cmd -i "$image_to_encode" 2>/dev/null || $base64_cmd "$image_to_encode" 2>/dev/null)
    else
        echo '{"error": "base64 command not found", "help": "Please install base64 utility"}'
        # Clean up temp file if exists
        [ -n "$temp_compressed" ] && rm -f "$temp_compressed"
        exit 1
    fi

    # Clean up temp compressed file
    [ -n "$temp_compressed" ] && rm -f "$temp_compressed"

    if [ -z "$base64_image" ]; then
        echo "{\"error\": \"Failed to encode image to base64\", \"path\": \"$image_input\"}"
        exit 1
    fi

    # Escape question for JSON (replace " with \")
    escaped_question=$(echo "$question" | sed 's/"/\\"/g')

    # Build JSON request body with base64 image
    # Note: Using printf to avoid issues with special characters
    request_body=$(cat <<EOF
{
  "model": "$model",
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "$escaped_question"
        },
        {
          "type": "image_url",
          "image_url": {
            "url": "data:$mime_type;base64,$base64_image"
          }
        }
      ]
    }
  ],
  "max_tokens": 1000
}
EOF
)
fi

# Call OpenAI API
curl_cmd=$(command -v curl)
response=$($curl_cmd -sS --max-time 60 \
    -X POST \
    -H "Authorization: Bearer $api_key" \
    -H "Content-Type: application/json" \
    -d "$request_body" \
    "$api_base/chat/completions" 2>&1)

curl_exit_code=$?

# NOTE(review): under `set -e`, a non-zero curl exit status makes the
# assignment above terminate the script, so this error branch appears to be
# dead code — confirm whether errexit was meant to be relaxed around curl.
if [ $curl_exit_code -ne 0 ]; then
    echo "{\"error\": \"Failed to call OpenAI API\", \"details\": \"$response\"}"
    exit 1
fi

# Simple JSON validation - check if response starts with { or [
if [[ ! "$response" =~ ^[[:space:]]*[\{\[] ]]; then
    echo "{\"error\": \"Invalid JSON response from API\", \"response\": \"$response\"}"
    exit 1
fi

# Check for API error (look for "error" field in response)
if echo "$response" | grep -q '"error"[[:space:]]*:[[:space:]]*{'; then
    # Extract error message if possible
    error_msg=$(echo "$response" | grep -o '"message"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/"message"[[:space:]]*:[[:space:]]*"\(.*\)"/\1/' | head -1)
    if [ -z "$error_msg" ]; then
        error_msg="Unknown API error"
    fi
    echo "{\"error\": \"OpenAI API error\", \"message\": \"$error_msg\", \"response\": $response}"
    exit 1
fi

# Extract the content from the response
# The response structure is: choices[0].message.content
# (grep-based extraction: stops at the first unescaped double quote, so long
# answers containing escaped quotes may be truncated at the first `"`)
content=$(echo "$response" | grep -o '"content"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/"content"[[:space:]]*:[[:space:]]*"\(.*\)"/\1/' | head -1)

# Extract usage information
prompt_tokens=$(echo "$response" | grep -o '"prompt_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1)
completion_tokens=$(echo "$response" | grep -o '"completion_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1)
total_tokens=$(echo "$response" | grep -o '"total_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1)

# Build simplified response
if [ -n "$content" ]; then
    # Unescape JSON content (basic unescaping)
    content=$(echo "$content" | sed 's/\\n/\n/g' | sed 's/\\"/"/g')

    cat <<EOF
{
  "model": "$model",
  "content": "$content",
  "usage": {
    "prompt_tokens": ${prompt_tokens:-0},
    "completion_tokens": ${completion_tokens:-0},
    "total_tokens": ${total_tokens:-0}
  }
}
EOF
else
    # If we can't extract content, return the full response
    echo "$response"
fi
|
||||
Reference in New Issue
Block a user