From 36403e374957821d38e9b27fa6045ebef731d3e8 Mon Sep 17 00:00:00 2001 From: Cole Stasney Date: Fri, 8 Nov 2024 19:00:33 -0700 Subject: [PATCH 1/9] Adding in support for Llama 3.2 --- .env.example | 9 ++ Dockerfile | 12 ++- docker-compose.yml | 35 ++++--- setup.sh | 40 +++++--- skyvern/__init__.py | 11 +++ skyvern/config.py | 36 ++++++- .../forge/sdk/api/llm/api_handler_factory.py | 25 ++++- skyvern/forge/sdk/api/llm/config_registry.py | 93 +++++++++++++++++-- skyvern/forge/sdk/api/llm/llama_handler.py | 33 +++++++ skyvern/forge/sdk/api/llm/utils.py | 67 ++++++++----- skyvern/forge/sdk/settings_manager.py | 51 +++++++++- 11 files changed, 335 insertions(+), 77 deletions(-) create mode 100644 skyvern/forge/sdk/api/llm/llama_handler.py diff --git a/.env.example b/.env.example index 426651a29b..84381f8483 100644 --- a/.env.example +++ b/.env.example @@ -33,6 +33,15 @@ AZURE_GPT4O_MINI_API_KEY="" AZURE_GPT4O_MINI_API_BASE="" AZURE_GPT4O_MINI_API_VERSION="" +# ENABLE_LLAMA: Set to true to enable Llama as a language model provider +ENABLE_LLAMA=false +# LLAMA_API_BASE: The base URL for Llama API (default: http://localhost:11434) +LLAMA_API_BASE="" +# LLAMA_MODEL_NAME: The model name to use (e.g., llama3.2-vision) +LLAMA_MODEL_NAME="" +# LLAMA_API_ROUTE: The API route for Llama (default: /api/chat) +LLAMA_API_ROUTE="" + # LLM_KEY: The chosen language model to use. This should be one of the models # provided by the enabled LLM providers (e.g., OPENAI_GPT4_TURBO, OPENAI_GPT4V, ANTHROPIC_CLAUDE3, AZURE_OPENAI_GPT4V). LLM_KEY="" diff --git a/Dockerfile b/Dockerfile index 1364616ba3..c5b24c8e6e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,15 +14,21 @@ RUN playwright install-deps RUN playwright install RUN apt-get install -y xauth x11-apps netpbm && apt-get clean +# Add these lines to install dos2unix and convert entrypoint scripts +RUN apt-get update && \ + apt-get install -y dos2unix && \ + apt-get clean + COPY . 
/app +# Convert line endings +RUN dos2unix /app/entrypoint-skyvern.sh && \ + chmod +x /app/entrypoint-skyvern.sh + ENV PYTHONPATH="/app:$PYTHONPATH" ENV VIDEO_PATH=/data/videos ENV HAR_PATH=/data/har ENV LOG_PATH=/data/log ENV ARTIFACT_STORAGE_PATH=/data/artifacts -COPY ./entrypoint-skyvern.sh /app/entrypoint-skyvern.sh -RUN chmod +x /app/entrypoint-skyvern.sh - CMD [ "/bin/bash", "/app/entrypoint-skyvern.sh" ] diff --git a/docker-compose.yml b/docker-compose.yml index 83844785d9..b6c22abaee 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -21,9 +21,12 @@ services: retries: 5 skyvern: - image: public.ecr.aws/skyvern/skyvern:latest + # Replace the public image with a local build + build: + context: . + dockerfile: Dockerfile + # Keep the rest of the configuration restart: on-failure - # comment out if you want to externally call skyvern API ports: - 8000:8000 volumes: @@ -35,18 +38,20 @@ services: environment: - DATABASE_STRING=postgresql+psycopg://skyvern:skyvern@postgres:5432/skyvern - BROWSER_TYPE=chromium-headful - - ENABLE_OPENAI=true - - OPENAI_API_KEY= - # If you want to use other LLM provider, like azure and anthropic: - # - ENABLE_ANTHROPIC=true - # - LLM_KEY=ANTHROPIC_CLAUDE3_OPUS - # - ANTHROPIC_API_KEY= - # - ENABLE_AZURE=true - # - LLM_KEY=AZURE_OPENAI - # - AZURE_DEPLOYMENT= - # - AZURE_API_KEY= - # - AZURE_API_BASE= - # - AZURE_API_VERSION= + - ENABLE_LLAMA=true + - LLM_KEY=LLAMA3 + - LLAMA_API_BASE=http://host.docker.internal:11434 + - LLAMA_MODEL_NAME=llama3.2-vision + - LLAMA_API_ROUTE=/api/chat + - ENABLE_OPENAI=false + - ENABLE_ANTHROPIC=false + - ENABLE_AZURE=false + - ENABLE_BEDROCK=false + - ENABLE_AZURE_GPT4O_MINI=false + - LLAMA_BASE_URL=http://host.docker.internal:11434 + - LLAMA_MODEL=llama3.2-vision + - ENV=local + - SECONDARY_LLM_KEY=LLAMA3 depends_on: postgres: condition: service_healthy @@ -55,6 +60,8 @@ services: interval: 5s timeout: 5s retries: 5 + extra_hosts: + - "host.docker.internal:host-gateway" skyvern-ui: image: 
public.ecr.aws/skyvern/skyvern-ui:latest diff --git a/setup.sh b/setup.sh index 8b7a145ec2..c4ef773317 100755 --- a/setup.sh +++ b/setup.sh @@ -9,7 +9,7 @@ log_event() { # Function to check if a command exists command_exists() { - command -v "$1" &> /dev/null + command -v "$1" &>/dev/null } ensure_required_commands() { @@ -31,7 +31,7 @@ update_or_add_env_var() { sed -i.bak "s/^$key=.*/$key=$value/" .env && rm -f .env.bak else # Add new variable - echo "$key=$value" >> .env + echo "$key=$value" >>.env fi } @@ -98,16 +98,25 @@ setup_llm_providers() { update_or_add_env_var "ENABLE_AZURE" "false" fi + echo "Do you want to enable Llama (y/n)?" + read enable_llama + if [[ "$enable_llama" == "y" ]]; then + read -p "Enter path to Llama model: " llama_model_path + update_or_add_env_var "ENABLE_LLAMA" "true" + update_or_add_env_var "LLAMA_MODEL_PATH" "$llama_model_path" + model_options+=("LLAMA_3_2_VISION") + fi + # Model Selection if [ ${#model_options[@]} -eq 0 ]; then echo "No LLM providers enabled. You won't be able to run Skyvern unless you enable at least one provider. You can re-run this script to enable providers or manually update the .env file." else echo "Available LLM models based on your selections:" for i in "${!model_options[@]}"; do - echo "$((i+1)). ${model_options[$i]}" + echo "$((i + 1)). ${model_options[$i]}" done read -p "Choose a model by number (e.g., 1 for ${model_options[0]}): " model_choice - chosen_model=${model_options[$((model_choice-1))]} + chosen_model=${model_options[$((model_choice - 1))]} echo "Chosen LLM Model: $chosen_model" update_or_add_env_var "LLM_KEY" "$chosen_model" fi @@ -115,7 +124,6 @@ setup_llm_providers() { echo "LLM provider configurations updated in .env." 
} - # Function to initialize .env file initialize_env_file() { if [ -f ".env" ]; then @@ -165,14 +173,16 @@ remove_poetry_env() { # Choose python version choose_python_version_or_fail() { - # https://github.com/python-poetry/poetry/issues/2117 - # Py --list-paths + # https://github.com/python-poetry/poetry/issues/2117 + # Py --list-paths # This will output which paths are being used for Python 3.11 - # Windows users need to poetry env use {{ Py --list-paths with 3.11}} - poetry env use python3.11 || { echo "Error: Python 3.11 is not installed. If you're on Windows, check out https://github.com/python-poetry/poetry/issues/2117 to unblock yourself"; exit 1; } + # Windows users need to poetry env use {{ Py --list-paths with 3.11}} + poetry env use python3.11 || { + echo "Error: Python 3.11 is not installed. If you're on Windows, check out https://github.com/python-poetry/poetry/issues/2117 to unblock yourself" + exit 1 + } } - # Function to install dependencies install_dependencies() { poetry install @@ -211,9 +221,9 @@ setup_postgresql() { return 0 fi fi - + # Check if Docker is installed and running - if ! command_exists docker || ! docker info > /dev/null 2>&1; then + if ! command_exists docker || ! docker info >/dev/null 2>&1; then echo "Docker is not running or not installed. Please install or start Docker and try again." exit 1 fi @@ -221,7 +231,7 @@ setup_postgresql() { # Check if PostgreSQL is already running in a Docker container if docker ps | grep -q postgresql-container; then echo "PostgreSQL is already running in a Docker container." - else + else # Attempt to install and start PostgreSQL using Docker echo "Attempting to install PostgreSQL via Docker..." docker run --name postgresql-container -e POSTGRES_HOST_AUTH_METHOD=trust -d -p 5432:5432 postgres:14 @@ -229,7 +239,7 @@ setup_postgresql() { # Wait for PostgreSQL to start echo "Waiting for PostgreSQL to start..." 
- sleep 20 # Adjust sleep time as necessary + sleep 20 # Adjust sleep time as necessary fi # Assuming docker exec works directly since we've checked Docker's status before @@ -272,7 +282,7 @@ create_organization() { fi # Update the secrets-open-source.toml file - echo -e "[skyvern]\nconfigs = [\n {\"env\" = \"local\", \"host\" = \"http://127.0.0.1:8000/api/v1\", \"orgs\" = [{name=\"Skyvern\", cred=\"$api_token\"}]}\n]" > .streamlit/secrets.toml + echo -e "[skyvern]\nconfigs = [\n {\"env\" = \"local\", \"host\" = \"http://127.0.0.1:8000/api/v1\", \"orgs\" = [{name=\"Skyvern\", cred=\"$api_token\"}]}\n]" >.streamlit/secrets.toml echo ".streamlit/secrets.toml file updated with organization details." # Check if skyvern-frontend/.env exists and back it up diff --git a/skyvern/__init__.py b/skyvern/__init__.py index 502cde743a..c6df0049cb 100644 --- a/skyvern/__init__.py +++ b/skyvern/__init__.py @@ -2,6 +2,8 @@ from ddtrace.filters import FilterRequestsOnUrl from skyvern.forge.sdk.forge_log import setup_logger +from typing import Any, List +from skyvern.forge.sdk.models import Step tracer.configure( settings={ @@ -11,3 +13,12 @@ }, ) setup_logger() + +async def llama_handler( + prompt: str, + step: Step | None = None, + screenshots: list[bytes] | None = None, + parameters: dict[str, Any] | None = None, +) -> dict[str, Any]: + # Implement Llama 3.2 vision API integration here + ... 
diff --git a/skyvern/config.py b/skyvern/config.py index aed4b70ba0..c94e82526b 100644 --- a/skyvern/config.py +++ b/skyvern/config.py @@ -5,7 +5,26 @@ class Settings(BaseSettings): - model_config = SettingsConfigDict(env_file=(".env", ".env.staging", ".env.prod"), extra="ignore") + # Use only model_config, not Config class + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + extra="ignore" + ) + + # Llama Configuration + ENABLE_LLAMA: bool = True + LLAMA_API_BASE: str = "http://host.docker.internal:11434" + LLAMA_MODEL_NAME: str = "llama3.2-vision" + LLAMA_API_ROUTE: str = "/api/chat" + LLM_KEY: str = "LLAMA3" + SECONDARY_LLM_KEY: str = "LLAMA3" + + # Disable other providers + ENABLE_OPENAI: bool = False + ENABLE_ANTHROPIC: bool = False + ENABLE_AZURE: bool = False + ENABLE_BEDROCK: bool = False ADDITIONAL_MODULES: list[str] = [] @@ -18,6 +37,14 @@ class Settings(BaseSettings): BROWSER_SCREENSHOT_TIMEOUT_MS: int = 20000 BROWSER_LOADING_TIMEOUT_MS: int = 120000 OPTION_LOADING_TIMEOUT_MS: int = 600000 + MAX_SCRAPING_RETRIES: int = 0 + VIDEO_PATH: str | None = None + HAR_PATH: str | None = "./har" + LOG_PATH: str = "./log" + BROWSER_ACTION_TIMEOUT_MS: int = 5000 + BROWSER_SCREENSHOT_TIMEOUT_MS: int = 20000 + BROWSER_LOADING_TIMEOUT_MS: int = 120000 + OPTION_LOADING_TIMEOUT_MS: int = 600000 MAX_STEPS_PER_RUN: int = 75 MAX_NUM_SCREENSHOTS: int = 10 # Ratio should be between 0 and 1. 
@@ -91,8 +118,8 @@ class Settings(BaseSettings): # LLM Configuration # ##################### # ACTIVE LLM PROVIDER - LLM_KEY: str = "OPENAI_GPT4O" - SECONDARY_LLM_KEY: str | None = None + LLM_KEY: str = "LLAMA3" # Change default from OPENAI_GPT4O + SECONDARY_LLM_KEY: str = "LLAMA3" # Also set this to LLAMA3 # COMMON LLM_CONFIG_TIMEOUT: int = 300 LLM_CONFIG_MAX_TOKENS: int = 4096 @@ -126,6 +153,9 @@ class Settings(BaseSettings): SVG_MAX_LENGTH: int = 100000 + # Add debug property + DEBUG: bool = True + def is_cloud_environment(self) -> bool: """ :return: True if env is not local, else False diff --git a/skyvern/forge/sdk/api/llm/api_handler_factory.py b/skyvern/forge/sdk/api/llm/api_handler_factory.py index 5258ac21b5..2a493b92be 100644 --- a/skyvern/forge/sdk/api/llm/api_handler_factory.py +++ b/skyvern/forge/sdk/api/llm/api_handler_factory.py @@ -89,7 +89,12 @@ async def llm_api_handler_with_router_and_fallback( data=screenshot, ) - messages = await llm_messages_builder(prompt, screenshots, llm_config.add_assistant_prefix) + messages = await llm_messages_builder( + prompt=prompt, + screenshots=screenshots, + add_assistant_prefix=llm_config.add_assistant_prefix, + is_llama="llama" in llm_config.model_name.lower() + ) if step: await app.ARTIFACT_MANAGER.create_artifact( step=step, @@ -190,7 +195,12 @@ async def llm_api_handler( if not llm_config.supports_vision: screenshots = None - messages = await llm_messages_builder(prompt, screenshots, llm_config.add_assistant_prefix) + messages = await llm_messages_builder( + prompt=prompt, + screenshots=screenshots, + add_assistant_prefix=llm_config.add_assistant_prefix, + is_llama="llama" in llm_config.model_name.lower() + ) if step: await app.ARTIFACT_MANAGER.create_artifact( step=step, @@ -214,6 +224,7 @@ async def llm_api_handler( model=llm_config.model_name, messages=messages, timeout=SettingsManager.get_settings().LLM_CONFIG_TIMEOUT, + response_format={"type": "json_object"}, # Add this to force JSON response 
**active_parameters, ) LOG.info("LLM API call successful", llm_key=llm_key, model=llm_config.model_name) @@ -237,7 +248,11 @@ async def llm_api_handler( artifact_type=ArtifactType.LLM_RESPONSE, data=response.model_dump_json(indent=2).encode("utf-8"), ) - llm_cost = litellm.completion_cost(completion_response=response) + # Skip cost calculation for local Ollama models + if not llm_config.model_name.startswith("ollama/"): + llm_cost = litellm.completion_cost(completion_response=response) + else: + llm_cost = 0.0 # Local models are free prompt_tokens = response.get("usage", {}).get("prompt_tokens", 0) completion_tokens = response.get("usage", {}).get("completion_tokens", 0) await app.DATABASE.update_step( @@ -271,3 +286,7 @@ def register_custom_handler(cls, llm_key: str, handler: LLMAPIHandler) -> None: if llm_key in cls._custom_handlers: raise DuplicateCustomLLMProviderError(llm_key) cls._custom_handlers[llm_key] = handler + +if SettingsManager.get_settings().ENABLE_LLAMA: + from .llama_handler import llama_handler + LLMAPIHandlerFactory.register_custom_handler("LLAMA3", llama_handler) diff --git a/skyvern/forge/sdk/api/llm/config_registry.py b/skyvern/forge/sdk/api/llm/config_registry.py index 1f4dc6af49..cdbd5c2abe 100644 --- a/skyvern/forge/sdk/api/llm/config_registry.py +++ b/skyvern/forge/sdk/api/llm/config_registry.py @@ -1,4 +1,5 @@ import structlog +import logging from skyvern.forge.sdk.api.llm.exceptions import ( DuplicateLLMConfigError, @@ -10,7 +11,34 @@ from skyvern.forge.sdk.settings_manager import SettingsManager LOG = structlog.get_logger() +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) +# Add debug logging at the top of the file +print("Initializing config registry...") + +settings = SettingsManager.get_settings() +print("Config Registry Settings:", { + "ENABLE_LLAMA": settings.ENABLE_LLAMA, + "LLAMA_API_BASE": settings.LLAMA_API_BASE, + "LLAMA_MODEL_NAME": settings.LLAMA_MODEL_NAME, + "LLM_KEY": settings.LLM_KEY, + "ENV_FILE": 
settings.model_config.get('env_file', '.env') # Use model_config instead of Config +}) + +# First check if any providers are enabled +provider_check = any([ + settings.ENABLE_OPENAI, + settings.ENABLE_ANTHROPIC, + settings.ENABLE_AZURE, + settings.ENABLE_BEDROCK, + settings.ENABLE_LLAMA, +]) +print("Provider check result:", provider_check) + +if not provider_check: + print("No providers enabled, raising NoProviderEnabledError") + raise NoProviderEnabledError() class LLMConfigRegistry: _configs: dict[str, LLMRouterConfig | LLMConfig] = {} @@ -43,19 +71,64 @@ def get_config(cls, llm_key: str) -> LLMRouterConfig | LLMConfig: return cls._configs[llm_key] -# if none of the LLM providers are enabled, raise an error -if not any( - [ - SettingsManager.get_settings().ENABLE_OPENAI, - SettingsManager.get_settings().ENABLE_ANTHROPIC, - SettingsManager.get_settings().ENABLE_AZURE, - SettingsManager.get_settings().ENABLE_AZURE_GPT4O_MINI, - SettingsManager.get_settings().ENABLE_BEDROCK, - ] -): +# Before the provider check, add debug logging +logger.debug("Current settings: %s", { + "ENABLE_LLAMA": SettingsManager.get_settings().ENABLE_LLAMA, + "LLAMA_API_BASE": SettingsManager.get_settings().LLAMA_API_BASE, + "LLAMA_MODEL_NAME": SettingsManager.get_settings().LLAMA_MODEL_NAME, + "LLM_KEY": SettingsManager.get_settings().LLM_KEY +}) + +# Add this before the provider check +logger.debug("Checking environment settings:") +settings = SettingsManager.get_settings() +logger.debug("Environment variables: %s", { + "ENABLE_LLAMA": settings.ENABLE_LLAMA, + "LLAMA_API_BASE": settings.LLAMA_API_BASE, + "LLAMA_MODEL_NAME": settings.LLAMA_MODEL_NAME, + "LLAMA_API_ROUTE": settings.LLAMA_API_ROUTE, + "LLM_KEY": settings.LLM_KEY, + "ENV_FILE": settings.model_config.get('env_file', '.env') +}) + +# First check if any providers are enabled +if not any([ + SettingsManager.get_settings().ENABLE_OPENAI, + SettingsManager.get_settings().ENABLE_ANTHROPIC, + 
SettingsManager.get_settings().ENABLE_AZURE, + SettingsManager.get_settings().ENABLE_BEDROCK, + SettingsManager.get_settings().ENABLE_LLAMA, # Make sure Llama is included +]): raise NoProviderEnabledError() +# First register Llama configuration +if SettingsManager.get_settings().ENABLE_LLAMA: + print("Registering Llama configuration...") + LLMConfigRegistry.register_config( + "LLAMA3", + LLMConfig( + model_name="ollama/llama3.2-vision", # Move model name here with ollama/ prefix + required_env_vars=[], + supports_vision=True, + add_assistant_prefix=False, + max_output_tokens=16384, + litellm_params=LiteLLMParams( + api_base=settings.LLAMA_API_BASE, + api_key="", + model_info={ + "completion_route": "/api/chat" + } + ) + ) + ) + +# Add after LLMConfigRegistry.register_config +logger.debug("Registered configs after Llama registration: %s", LLMConfigRegistry._configs) + +# After registration, check registered configs +logger.debug("Registered configs: %s", LLMConfigRegistry._configs) +# Then register other provider configurations if SettingsManager.get_settings().ENABLE_OPENAI: LLMConfigRegistry.register_config( "OPENAI_GPT4_TURBO", diff --git a/skyvern/forge/sdk/api/llm/llama_handler.py b/skyvern/forge/sdk/api/llm/llama_handler.py new file mode 100644 index 0000000000..0d703de53e --- /dev/null +++ b/skyvern/forge/sdk/api/llm/llama_handler.py @@ -0,0 +1,33 @@ +from typing import Any, Optional +import aiohttp +import base64 +import json +from skyvern.forge.sdk.models import Step + +async def llama_handler( + prompt: str, + step: Step | None = None, + screenshots: list[bytes] | None = None, + parameters: dict[str, Any] | None = None, +) -> dict[str, Any]: + """Handler for local Llama 3.2 model running on Ollama""" + async with aiohttp.ClientSession() as session: + payload = { + "model": "llama3", # Using llama3 model name + "messages": [{"role": "user", "content": prompt}], + "stream": False + } + + if screenshots: + # Convert screenshots to base64 for vision tasks + 
payload["images"] = [base64.b64encode(img).decode('utf-8') for img in screenshots] + + async with session.post("http://localhost:11434/api/chat", json=payload) as response: + result = await response.json() + return { + "choices": [{ + "message": { + "content": result["message"]["content"] + } + }] + } \ No newline at end of file diff --git a/skyvern/forge/sdk/api/llm/utils.py b/skyvern/forge/sdk/api/llm/utils.py index 1f7ba93d42..6a55f19c66 100644 --- a/skyvern/forge/sdk/api/llm/utils.py +++ b/skyvern/forge/sdk/api/llm/utils.py @@ -16,32 +16,51 @@ async def llm_messages_builder( prompt: str, screenshots: list[bytes] | None = None, add_assistant_prefix: bool = False, + is_llama: bool = False, ) -> list[dict[str, Any]]: - messages: list[dict[str, Any]] = [ - { - "type": "text", - "text": prompt, - } - ] - - if screenshots: - for screenshot in screenshots: - encoded_image = base64.b64encode(screenshot).decode("utf-8") - messages.append( - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{encoded_image}", - }, - } - ) - # Anthropic models seems to struggle to always output a valid json object so we need to prefill the response to force it: - if add_assistant_prefix: - return [ - {"role": "user", "content": messages}, - {"role": "assistant", "content": "{"}, + if is_llama: + # Llama 3.2 vision format + content = [{"type": "text", "text": prompt}] + + if screenshots: + for screenshot in screenshots: + encoded_image = base64.b64encode(screenshot).decode("utf-8") + content.append({ + "type": "image", + "image_url": f"data:image/png;base64,{encoded_image}" + }) + + return [{ + "role": "user", + "content": content + }] + else: + # Original format for other models + messages: list[dict[str, Any]] = [ + { + "type": "text", + "text": prompt, + } ] - return [{"role": "user", "content": messages}] + + if screenshots: + for screenshot in screenshots: + encoded_image = base64.b64encode(screenshot).decode("utf-8") + messages.append( + { + "type": 
"image_url", + "image_url": { + "url": f"data:image/png;base64,{encoded_image}", + }, + } + ) + + if add_assistant_prefix: + return [ + {"role": "user", "content": messages}, + {"role": "assistant", "content": "{"}, + ] + return [{"role": "user", "content": messages}] def parse_api_response(response: litellm.ModelResponse, add_assistant_prefix: bool = False) -> dict[str, Any]: diff --git a/skyvern/forge/sdk/settings_manager.py b/skyvern/forge/sdk/settings_manager.py index cf3c3cf342..78a11f25e9 100644 --- a/skyvern/forge/sdk/settings_manager.py +++ b/skyvern/forge/sdk/settings_manager.py @@ -1,13 +1,54 @@ from skyvern.config import Settings from skyvern.config import settings as base_settings +from pydantic import Field # Import Field from pydantic +from pydantic_settings import BaseSettings # Import BaseSettings from pydantic_settings -class SettingsManager: - __instance: Settings = base_settings +class Settings(BaseSettings): + # Base configuration + ENV: str = Field(default="local") + + # Llama Configuration + ENABLE_LLAMA: bool = Field(default=False, env="ENABLE_LLAMA") + LLAMA_API_BASE: str = Field(default="http://localhost:11434", env="LLAMA_API_BASE") + LLAMA_MODEL_NAME: str = Field(default="llama3.2-vision", env="LLAMA_MODEL_NAME") + LLAMA_API_ROUTE: str = Field(default="/api/chat", env="LLAMA_API_ROUTE") + + # Disable other providers + ENABLE_OPENAI: bool = Field(default=False, env="ENABLE_OPENAI") + ENABLE_ANTHROPIC: bool = Field(default=False, env="ENABLE_ANTHROPIC") + ENABLE_AZURE: bool = Field(default=False, env="ENABLE_AZURE") + ENABLE_AZURE_GPT4O_MINI: bool = Field(default=False, env="ENABLE_AZURE_GPT4O_MINI") + ENABLE_BEDROCK: bool = Field(default=False, env="ENABLE_BEDROCK") - @staticmethod - def get_settings() -> Settings: - return SettingsManager.__instance + # LLM Configuration + LLM_KEY: str = Field(default="LLAMA3") + LLM_CONFIG_TIMEOUT: int = Field(default=300) + LLM_CONFIG_MAX_TOKENS: int = Field(default=16384) + LLM_CONFIG_TEMPERATURE: 
float = Field(default=0) + + class Config: + env_file = ".env" + env_file_encoding = "utf-8" + + +class SettingsManager: + _instance = None + + @staticmethod + def get_settings(): + if SettingsManager._instance is None: + print("\n=== Initializing Settings ===") + from skyvern.config import Settings + SettingsManager._instance = Settings(_env_file=".env") + print("Settings values:", { + "ENABLE_LLAMA": SettingsManager._instance.ENABLE_LLAMA, + "LLM_KEY": SettingsManager._instance.LLM_KEY, + "LLAMA_API_BASE": SettingsManager._instance.LLAMA_API_BASE, + "LLAMA_MODEL_NAME": SettingsManager._instance.LLAMA_MODEL_NAME, + "env_file": ".env" + }) + return SettingsManager._instance @staticmethod def set_settings(settings: Settings) -> None: From aecaa23bc3e05d95b9ec0260d30e17a96d37f497 Mon Sep 17 00:00:00 2001 From: Cole Stasney Date: Sat, 9 Nov 2024 06:29:08 -0700 Subject: [PATCH 2/9] Data return instruction updates for Ollama. Added function for markdown format fix for Ollama. --- .../forge/sdk/api/llm/api_handler_factory.py | 109 +++++---------- skyvern/forge/sdk/api/llm/utils.py | 126 ++++++++++++------ 2 files changed, 114 insertions(+), 121 deletions(-) diff --git a/skyvern/forge/sdk/api/llm/api_handler_factory.py b/skyvern/forge/sdk/api/llm/api_handler_factory.py index 2a493b92be..c2840624a2 100644 --- a/skyvern/forge/sdk/api/llm/api_handler_factory.py +++ b/skyvern/forge/sdk/api/llm/api_handler_factory.py @@ -55,24 +55,30 @@ def get_llm_api_handler_with_router(llm_key: str) -> LLMAPIHandler: ) main_model_group = llm_config.main_model_group + async def llm_api_handler( + prompt: str, + llm_key: str, + model: str, + messages: list[dict[str, Any]], + **parameters: dict[str, Any], + ) -> dict[str, Any]: + try: + response = await router.completion( + model=model, + messages=messages, + **parameters, + ) + return response + except Exception as e: + LOG.exception("LLM request failed unexpectedly", llm_key=llm_key) + raise LLMProviderError(llm_key) from e + async def 
llm_api_handler_with_router_and_fallback( prompt: str, step: Step | None = None, screenshots: list[bytes] | None = None, parameters: dict[str, Any] | None = None, ) -> dict[str, Any]: - """ - Custom LLM API handler that utilizes the LiteLLM router and fallbacks to OpenAI GPT-4 Vision. - - Args: - prompt: The prompt to generate completions for. - step: The step object associated with the prompt. - screenshots: The screenshots associated with the prompt. - parameters: Additional parameters to be passed to the LLM router. - - Returns: - The response from the LLM router. - """ if parameters is None: parameters = LLMAPIHandlerFactory.get_api_parameters(llm_config) @@ -85,7 +91,7 @@ async def llm_api_handler_with_router_and_fallback( for screenshot in screenshots or []: await app.ARTIFACT_MANAGER.create_artifact( step=step, - artifact_type=ArtifactType.SCREENSHOT_LLM, + artifact_type=ArtifactType.LLM_SCREENSHOT, data=screenshot, ) @@ -93,65 +99,18 @@ async def llm_api_handler_with_router_and_fallback( prompt=prompt, screenshots=screenshots, add_assistant_prefix=llm_config.add_assistant_prefix, - is_llama="llama" in llm_config.model_name.lower() + is_llama=llm_config.model_name.startswith("ollama/"), ) - if step: - await app.ARTIFACT_MANAGER.create_artifact( - step=step, - artifact_type=ArtifactType.LLM_REQUEST, - data=json.dumps( - { - "model": llm_key, - "messages": messages, - **parameters, - } - ).encode("utf-8"), - ) - try: - response = await router.acompletion(model=main_model_group, messages=messages, **parameters) - LOG.info("LLM API call successful", llm_key=llm_key, model=llm_config.model_name) - except litellm.exceptions.APIError as e: - raise LLMProviderErrorRetryableTask(llm_key) from e - except ValueError as e: - LOG.exception( - "LLM token limit exceeded", - llm_key=llm_key, - model=main_model_group, - ) - raise LLMProviderErrorRetryableTask(llm_key) from e - except Exception as e: - LOG.exception( - "LLM request failed unexpectedly", - llm_key=llm_key, - 
model=main_model_group, - ) - raise LLMProviderError(llm_key) from e - if step: - await app.ARTIFACT_MANAGER.create_artifact( - step=step, - artifact_type=ArtifactType.LLM_RESPONSE, - data=response.model_dump_json(indent=2).encode("utf-8"), - ) - llm_cost = litellm.completion_cost(completion_response=response) - prompt_tokens = response.get("usage", {}).get("prompt_tokens", 0) - completion_tokens = response.get("usage", {}).get("completion_tokens", 0) - await app.DATABASE.update_step( - task_id=step.task_id, - step_id=step.step_id, - organization_id=step.organization_id, - incremental_cost=llm_cost, - incremental_input_tokens=prompt_tokens if prompt_tokens > 0 else None, - incremental_output_tokens=completion_tokens if completion_tokens > 0 else None, - ) - parsed_response = parse_api_response(response, llm_config.add_assistant_prefix) - if step: - await app.ARTIFACT_MANAGER.create_artifact( - step=step, - artifact_type=ArtifactType.LLM_RESPONSE_PARSED, - data=json.dumps(parsed_response, indent=2).encode("utf-8"), - ) - return parsed_response + response = await llm_api_handler( + prompt=prompt, + llm_key=llm_config.model_name, + model=llm_config.model_name, + messages=messages, + **parameters, + ) + + return response return llm_api_handler_with_router_and_fallback @@ -195,12 +154,7 @@ async def llm_api_handler( if not llm_config.supports_vision: screenshots = None - messages = await llm_messages_builder( - prompt=prompt, - screenshots=screenshots, - add_assistant_prefix=llm_config.add_assistant_prefix, - is_llama="llama" in llm_config.model_name.lower() - ) + messages = await llm_messages_builder(prompt, screenshots, llm_config.add_assistant_prefix) if step: await app.ARTIFACT_MANAGER.create_artifact( step=step, @@ -224,7 +178,6 @@ async def llm_api_handler( model=llm_config.model_name, messages=messages, timeout=SettingsManager.get_settings().LLM_CONFIG_TIMEOUT, - response_format={"type": "json_object"}, # Add this to force JSON response **active_parameters, ) 
LOG.info("LLM API call successful", llm_key=llm_key, model=llm_config.model_name) @@ -263,7 +216,7 @@ async def llm_api_handler( incremental_input_tokens=prompt_tokens if prompt_tokens > 0 else None, incremental_output_tokens=completion_tokens if completion_tokens > 0 else None, ) - parsed_response = parse_api_response(response, llm_config.add_assistant_prefix) + parsed_response = parse_api_response(response, llm_config.add_assistant_prefix, llm_config.model_name.startswith("ollama/")) if step: await app.ARTIFACT_MANAGER.create_artifact( step=step, diff --git a/skyvern/forge/sdk/api/llm/utils.py b/skyvern/forge/sdk/api/llm/utils.py index 6a55f19c66..961d1b3ddd 100644 --- a/skyvern/forge/sdk/api/llm/utils.py +++ b/skyvern/forge/sdk/api/llm/utils.py @@ -20,6 +20,13 @@ async def llm_messages_builder( ) -> list[dict[str, Any]]: if is_llama: # Llama 3.2 vision format + system_message = { + "role": "system", + "content": "You are a helpful AI assistant. Respond with pure JSON only, without markdown formatting or explanations. "\ + "Your response should be a valid JSON object that can be parsed directly. "\ + "When analyzing images, provide structured responses in pure JSON format." + } + content = [{"type": "text", "text": prompt}] if screenshots: @@ -30,65 +37,76 @@ async def llm_messages_builder( "image_url": f"data:image/png;base64,{encoded_image}" }) - return [{ - "role": "user", - "content": content - }] + return [ + system_message, + { + "role": "user", + "content": content + } + ] else: # Original format for other models messages: list[dict[str, Any]] = [ { - "type": "text", - "text": prompt, + "role": "system", + "content": "You are a helpful assistant." 
+ }, + { + "role": "user", + "content": prompt } ] - + if screenshots: for screenshot in screenshots: encoded_image = base64.b64encode(screenshot).decode("utf-8") - messages.append( - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{encoded_image}", - }, + messages.append({ + "role": "user", + "content": { + "type": "image", + "image_url": f"data:image/png;base64,{encoded_image}" } - ) - - if add_assistant_prefix: - return [ - {"role": "user", "content": messages}, - {"role": "assistant", "content": "{"}, - ] - return [{"role": "user", "content": messages}] + }) + + return messages -def parse_api_response(response: litellm.ModelResponse, add_assistant_prefix: bool = False) -> dict[str, Any]: +def parse_api_response(response: litellm.ModelResponse, add_assistant_prefix: bool = False, is_llama: bool = False) -> dict[str, Any]: content = None try: content = response.choices[0].message.content - # Since we prefilled Anthropic response with "{" we need to add it back to the response to have a valid json object: if add_assistant_prefix: content = "{" + content - content = try_to_extract_json_from_markdown_format(content) - if not content: - raise EmptyLLMResponseError(str(response)) - return commentjson.loads(content) - except Exception as e: - if content: - LOG.warning( - "Failed to parse LLM response. Will retry auto-fixing the response for unescaped quotes.", - exc_info=True, - content=content, - ) + + # First try to extract JSON from markdown code blocks if present + if content.strip().startswith("```"): + if is_llama: + content = try_to_extract_json_from_markdown_format_llama(content) + else: + content = try_to_extract_json_from_markdown_format(content) + + # Attempt to parse the content as JSON + try: + return commentjson.loads(content) + except ValueError as e: + LOG.warning("Failed to parse LLM response as JSON. 
Attempting to auto-fix.", content=content) + # Attempt to fix unescaped quotes in the JSON string + fixed_content = fix_unescaped_quotes_in_json(content) try: - return fix_and_parse_json_string(content) + return commentjson.loads(fixed_content) except Exception as e2: - LOG.exception("Failed to auto-fix LLM response.", error=str(e2)) - raise InvalidLLMResponseFormat(str(response)) from e2 - - raise InvalidLLMResponseFormat(str(response)) from e - + LOG.error("Failed to auto-fix JSON string.", content=fixed_content) + # Try one last time with the JSON extractor + clean_content = try_to_extract_json_from_markdown_format(content) + if clean_content != content: + try: + return commentjson.loads(clean_content) + except: + pass + raise InvalidLLMResponseFormat(content) from e2 + except Exception as e: + LOG.error("Unexpected error while parsing LLM response.", content=content) + raise InvalidLLMResponseFormat(content) from e def fix_cutoff_json(json_string: str, error_position: int) -> dict[str, Any]: """ @@ -118,7 +136,6 @@ def fix_cutoff_json(json_string: str, error_position: int) -> dict[str, Any]: except Exception as e: raise InvalidLLMResponseFormat(json_string) from e - def fix_unescaped_quotes_in_json(json_string: str) -> str: """ Extracts the positions of quotation marks that define the JSON structure @@ -173,7 +190,6 @@ def fix_unescaped_quotes_in_json(json_string: str) -> str: return json_string - def fix_and_parse_json_string(json_string: str) -> dict[str, Any]: """ Auto-fixes a JSON string by escaping unescaped quotes and ignoring the last action if the JSON is cutoff. 
@@ -201,7 +217,6 @@ def fix_and_parse_json_string(json_string: str) -> dict[str, Any]: # Try to fix the cutoff JSON string and see if it can be parsed return fix_cutoff_json(json_string, error_position) - def try_to_extract_json_from_markdown_format(text: str) -> str: pattern = r"```json\s*(.*?)\s*```" match = re.search(pattern, text, re.DOTALL) @@ -209,3 +224,28 @@ def try_to_extract_json_from_markdown_format(text: str) -> str: return match.group(1) else: return text + +def try_to_extract_json_from_markdown_format_llama(text: str) -> str: + """Extract JSON content from markdown code blocks. + This is particularly useful for models like Llama that may wrap their JSON responses. + + Args: + text (str): The text to process, which may contain JSON in markdown blocks + + Returns: + str: The extracted JSON string, or the original text if no JSON found + """ + # First try to extract from ```json blocks + json_pattern = r"```(?:json)?\s*([\s\S]*?)\s*```" + match = re.search(json_pattern, text, re.MULTILINE) + if match: + return match.group(1).strip() + + # If no code blocks found, try to extract anything that looks like a JSON object + json_object_pattern = r"\{[\s\S]*?\}" # Non-greedy match for nested objects + match = re.search(json_object_pattern, text) + if match: + return match.group(0) + + # If no JSON-like content found, return original text + return text From 6c646b5f2c2a59ad2f16f83396aee8e957ef2d2a Mon Sep 17 00:00:00 2001 From: Cole Stasney Date: Sat, 9 Nov 2024 09:26:00 -0700 Subject: [PATCH 3/9] Provided more explicit instructions to Llama on its expected output. 
--- .../forge/sdk/api/llm/api_handler_factory.py | 15 +- skyvern/forge/sdk/api/llm/utils.py | 247 +++++------------- 2 files changed, 78 insertions(+), 184 deletions(-) diff --git a/skyvern/forge/sdk/api/llm/api_handler_factory.py b/skyvern/forge/sdk/api/llm/api_handler_factory.py index c2840624a2..ac6323f9af 100644 --- a/skyvern/forge/sdk/api/llm/api_handler_factory.py +++ b/skyvern/forge/sdk/api/llm/api_handler_factory.py @@ -150,11 +150,16 @@ async def llm_api_handler( data=screenshot, ) - # TODO (kerem): instead of overriding the screenshots, should we just not take them in the first place? if not llm_config.supports_vision: screenshots = None - messages = await llm_messages_builder(prompt, screenshots, llm_config.add_assistant_prefix) + messages = await llm_messages_builder( + prompt=prompt, + screenshots=screenshots, + add_assistant_prefix=llm_config.add_assistant_prefix, + is_llama=llm_config.model_name.startswith("ollama/"), + ) + if step: await app.ARTIFACT_MANAGER.create_artifact( step=step, @@ -163,16 +168,12 @@ async def llm_api_handler( { "model": llm_config.model_name, "messages": messages, - # we're not using active_parameters here because it may contain sensitive information **parameters, } ).encode("utf-8"), ) t_llm_request = time.perf_counter() try: - # TODO (kerem): add a timeout to this call - # TODO (kerem): add a retry mechanism to this call (acompletion_with_retries) - # TODO (kerem): use litellm fallbacks? 
https://litellm.vercel.app/docs/tutorials/fallbacks#how-does-completion_with_fallbacks-work LOG.info("Calling LLM API", llm_key=llm_key, model=llm_config.model_name) response = await litellm.acompletion( model=llm_config.model_name, @@ -242,4 +243,4 @@ def register_custom_handler(cls, llm_key: str, handler: LLMAPIHandler) -> None: if SettingsManager.get_settings().ENABLE_LLAMA: from .llama_handler import llama_handler - LLMAPIHandlerFactory.register_custom_handler("LLAMA3", llama_handler) + LLMAPIHandlerFactory.register_custom_handler("LLAMA3", llama_handler) \ No newline at end of file diff --git a/skyvern/forge/sdk/api/llm/utils.py b/skyvern/forge/sdk/api/llm/utils.py index 961d1b3ddd..3780c09b5b 100644 --- a/skyvern/forge/sdk/api/llm/utils.py +++ b/skyvern/forge/sdk/api/llm/utils.py @@ -11,7 +11,6 @@ LOG = structlog.get_logger() - async def llm_messages_builder( prompt: str, screenshots: list[bytes] | None = None, @@ -19,24 +18,48 @@ async def llm_messages_builder( is_llama: bool = False, ) -> list[dict[str, Any]]: if is_llama: - # Llama 3.2 vision format + # Much stricter system message system_message = { "role": "system", - "content": "You are a helpful AI assistant. Respond with pure JSON only, without markdown formatting or explanations. "\ - "Your response should be a valid JSON object that can be parsed directly. "\ - "When analyzing images, provide structured responses in pure JSON format." + "content": ( + "CRITICAL INSTRUCTION: You are a PURE JSON bot. You must NEVER write prose or explanations.\n\n" + "NO MATTER WHAT IS ASKED:\n" + "1. ALWAYS respond with actions array\n" + "2. NEVER write explanations or text\n" + "3. 
ONLY valid responses are:\n" + "{\"actions\": [{\"type\": \"analyze\", \"element\": \"...\"}, ...]}\n" + "{\"actions\": [{\"type\": \"click\", \"element\": \"...\"}, ...]}\n" + "{\"actions\": [{\"type\": \"input\", \"element\": \"...\", \"value\": \"...\"}, ...]}\n\n" + "Even if asked for analysis, description, or explanation, ONLY respond with actions JSON.\n" + "Even if the question seems general, ONLY respond with actions JSON.\n" + "NEVER use markdown. NEVER explain. NEVER add notes.\n\n" + "CORRECT:\n" + "{\"actions\":[{\"type\":\"analyze\",\"element\":\"search box for part lookup\"}]}\n\n" + "INCORRECT:\n" + "Here's what I found...\n" + "Let me explain...\n" + "The webpage shows...\n" + "```json\n{...}```" + ) } - content = [{"type": "text", "text": prompt}] - + # Build content array + content = [] if screenshots: for screenshot in screenshots: encoded_image = base64.b64encode(screenshot).decode("utf-8") content.append({ "type": "image", - "image_url": f"data:image/png;base64,{encoded_image}" + "data": encoded_image, + "format": "png" }) + # Force action-based response in prompt + content.append({ + "type": "text", + "text": f"{prompt} RESPOND ONLY WITH ACTIONS JSON." + }) + return [ system_message, { @@ -45,7 +68,7 @@ async def llm_messages_builder( } ] else: - # Original format for other models + # Original format for other models (unchanged) messages: list[dict[str, Any]] = [ { "role": "system", @@ -70,182 +93,52 @@ async def llm_messages_builder( return messages - def parse_api_response(response: litellm.ModelResponse, add_assistant_prefix: bool = False, is_llama: bool = False) -> dict[str, Any]: + """Parse the response from the LLM API into a dictionary. 
+ + Args: + response: The response from the LLM API + add_assistant_prefix: Whether to add a prefix to the response + is_llama: Whether the response is from a Llama/Ollama model + + Returns: + The parsed response as a dictionary + """ content = None try: - content = response.choices[0].message.content + content = response.choices[0].message.content.strip() if add_assistant_prefix: content = "{" + content - # First try to extract JSON from markdown code blocks if present - if content.strip().startswith("```"): - if is_llama: - content = try_to_extract_json_from_markdown_format_llama(content) - else: - content = try_to_extract_json_from_markdown_format(content) - - # Attempt to parse the content as JSON - try: - return commentjson.loads(content) - except ValueError as e: - LOG.warning("Failed to parse LLM response as JSON. Attempting to auto-fix.", content=content) - # Attempt to fix unescaped quotes in the JSON string - fixed_content = fix_unescaped_quotes_in_json(content) - try: - return commentjson.loads(fixed_content) - except Exception as e2: - LOG.error("Failed to auto-fix JSON string.", content=fixed_content) - # Try one last time with the JSON extractor - clean_content = try_to_extract_json_from_markdown_format(content) - if clean_content != content: - try: - return commentjson.loads(clean_content) - except: - pass - raise InvalidLLMResponseFormat(content) from e2 + # For Llama responses, try to extract just the JSON + if is_llama: + # Find anything that looks like a JSON object + json_pattern = r"\{[^{}]*\}" + matches = re.finditer(json_pattern, content) + # Try each match until we find valid JSON + for match in matches: + try: + return commentjson.loads(match.group(0)) + except: + continue + + # If no valid JSON found in matches, try the stripped content + if content.startswith("{") and content.endswith("}"): + try: + return commentjson.loads(content) + except: + pass + + raise ValueError("No valid JSON found in response") + + # For non-Llama models, use 
original parsing + return commentjson.loads(content) + except Exception as e: - LOG.error("Unexpected error while parsing LLM response.", content=content) + LOG.error("Failed to parse LLM response.", content=content) raise InvalidLLMResponseFormat(content) from e -def fix_cutoff_json(json_string: str, error_position: int) -> dict[str, Any]: - """ - Fixes a cutoff JSON string by ignoring the last incomplete action and making it a valid JSON. - - Args: - json_string (str): The cutoff JSON string to process. - error_position (int): The position of the error in the JSON string. - - Returns: - str: The fixed JSON string. - """ - LOG.info("Fixing cutoff JSON string.") - try: - # Truncate the string to the error position - truncated_string = json_string[:error_position] - # Find the last valid action - last_valid_action_pos = truncated_string.rfind("},") - if last_valid_action_pos != -1: - # Remove the incomplete action - fixed_string = truncated_string[: last_valid_action_pos + 1] + "\n ]\n}" - return commentjson.loads(fixed_string) - else: - # If no valid action found, return an empty actions list - LOG.warning("No valid action found in the cutoff JSON string.") - return {"actions": []} - except Exception as e: - raise InvalidLLMResponseFormat(json_string) from e def fix_unescaped_quotes_in_json(json_string: str) -> str: - """ - Extracts the positions of quotation marks that define the JSON structure - and the strings between them, handling unescaped quotation marks within strings. - - Args: - json_string (str): The JSON-like string to process. - - Returns: - str: The JSON-like string with unescaped quotation marks within strings. - """ - escape_char = "\\" - # Indices to add the escape character to. Since we're processing the string from left to right, we need to sort - # the indices in descending order to avoid index shifting. 
- indices_to_add_escape_char = [] - in_string = False - escape = False - json_structure_chars = {",", ":", "}", "]", "{", "["} - - i = 0 - while i < len(json_string): - char = json_string[i] - if char == escape_char: - escape = not escape - elif char == '"' and not escape: - if in_string: - # Check if the next non-whitespace character is a JSON structure character - j = i + 1 - # Skip whitespace characters - while j < len(json_string) and json_string[j].isspace(): - j += 1 - if j < len(json_string) and json_string[j] in json_structure_chars: - # If the next character is a JSON structure character, the quote is the end of the JSON string - in_string = False - else: - # If the next character is not a JSON structure character, the quote is part of the string - # Update the indices to add the escape character with the current index - indices_to_add_escape_char.append(i) - else: - # Start of the JSON string - in_string = True - else: - escape = False - i += 1 - - # Sort the indices in descending order to avoid index shifting then add the escape character to the string - if indices_to_add_escape_char: - LOG.warning("Unescaped quotes found in JSON string. Adding escape character to fix the issue.") - indices_to_add_escape_char.sort(reverse=True) - for index in indices_to_add_escape_char: - json_string = json_string[:index] + escape_char + json_string[index:] - - return json_string - -def fix_and_parse_json_string(json_string: str) -> dict[str, Any]: - """ - Auto-fixes a JSON string by escaping unescaped quotes and ignoring the last action if the JSON is cutoff. - - Args: - json_string (str): The JSON string to process. - - Returns: - dict[str, Any]: The parsed JSON object. - """ - - LOG.info("Auto-fixing JSON string.") - # Escape unescaped quotes in the JSON string - json_string = fix_unescaped_quotes_in_json(json_string) - try: - # Attempt to parse the JSON string - return commentjson.loads(json_string) - except Exception: - LOG.warning("Failed to parse JSON string. 
Attempting to fix the JSON string.") - try: - # This seems redundant but we're doing this to get error position. Comment json doesn't return that - return json.loads(json_string) - except json.JSONDecodeError as e: - error_position = e.pos - # Try to fix the cutoff JSON string and see if it can be parsed - return fix_cutoff_json(json_string, error_position) - -def try_to_extract_json_from_markdown_format(text: str) -> str: - pattern = r"```json\s*(.*?)\s*```" - match = re.search(pattern, text, re.DOTALL) - if match: - return match.group(1) - else: - return text - -def try_to_extract_json_from_markdown_format_llama(text: str) -> str: - """Extract JSON content from markdown code blocks. - This is particularly useful for models like Llama that may wrap their JSON responses. - - Args: - text (str): The text to process, which may contain JSON in markdown blocks - - Returns: - str: The extracted JSON string, or the original text if no JSON found - """ - # First try to extract from ```json blocks - json_pattern = r"```(?:json)?\s*([\s\S]*?)\s*```" - match = re.search(json_pattern, text, re.MULTILINE) - if match: - return match.group(1).strip() - - # If no code blocks found, try to extract anything that looks like a JSON object - json_object_pattern = r"\{[\s\S]*?\}" # Non-greedy match for nested objects - match = re.search(json_object_pattern, text) - if match: - return match.group(0) - - # If no JSON-like content found, return original text - return text + """Fix unescaped quotes in JSON string.""" + escape_ \ No newline at end of file From 35d94e0626eb514dd59416b0cdf3bbbabda5974b Mon Sep 17 00:00:00 2001 From: Cole Stasney Date: Sat, 9 Nov 2024 11:56:45 -0700 Subject: [PATCH 4/9] Testing llama specific instructions. 
--- .../ollama/answer-user-detail-questions.j2 | 43 ++++ .../ollama/auto-completion-choose-option.j2 | 65 ++++++ .../auto-completion-potential-answers.j2 | 43 ++++ .../ollama/auto-completion-tweak-value.j2 | 56 +++++ .../forge/prompts/ollama/check-user-goal.j2 | 34 +++ .../forge/prompts/ollama/css-shape-convert.j2 | 21 ++ skyvern/forge/prompts/ollama/custom-select.j2 | 51 +++++ .../forge/prompts/ollama/extract-action.j2 | 32 +++ .../prompts/ollama/extract-information.j2 | 26 +++ skyvern/forge/prompts/ollama/generate-task.j2 | 24 +++ .../prompts/ollama/opened-dropdown-confirm.j2 | 12 ++ .../ollama/parse-input-or-select-context.j2 | 19 ++ .../ollama/summarize-max-steps-reason.j2 | 17 ++ skyvern/forge/prompts/ollama/svg-convert.j2 | 16 ++ skyvern/forge/sdk/api/llm/utils.py | 203 +++++++++++++----- 15 files changed, 613 insertions(+), 49 deletions(-) create mode 100644 skyvern/forge/prompts/ollama/answer-user-detail-questions.j2 create mode 100644 skyvern/forge/prompts/ollama/auto-completion-choose-option.j2 create mode 100644 skyvern/forge/prompts/ollama/auto-completion-potential-answers.j2 create mode 100644 skyvern/forge/prompts/ollama/auto-completion-tweak-value.j2 create mode 100644 skyvern/forge/prompts/ollama/check-user-goal.j2 create mode 100644 skyvern/forge/prompts/ollama/css-shape-convert.j2 create mode 100644 skyvern/forge/prompts/ollama/custom-select.j2 create mode 100644 skyvern/forge/prompts/ollama/extract-action.j2 create mode 100644 skyvern/forge/prompts/ollama/extract-information.j2 create mode 100644 skyvern/forge/prompts/ollama/generate-task.j2 create mode 100644 skyvern/forge/prompts/ollama/opened-dropdown-confirm.j2 create mode 100644 skyvern/forge/prompts/ollama/parse-input-or-select-context.j2 create mode 100644 skyvern/forge/prompts/ollama/summarize-max-steps-reason.j2 create mode 100644 skyvern/forge/prompts/ollama/svg-convert.j2 diff --git a/skyvern/forge/prompts/ollama/answer-user-detail-questions.j2 
b/skyvern/forge/prompts/ollama/answer-user-detail-questions.j2 new file mode 100644 index 0000000000..fcce1ebcf8 --- /dev/null +++ b/skyvern/forge/prompts/ollama/answer-user-detail-questions.j2 @@ -0,0 +1,43 @@ +You are a precise question-answering assistant. Let's work through these questions systematically. + +For each question: +1. Read the question carefully +2. Look for relevant information in: + - User's goal + - User's details +3. Provide only the exact information needed +4. Format as key-value pairs in JSON + +Critical rules: +- Answer directly - no explanations +- Include only requested information +- Use exact values from provided details +- Maintain strict JSON format + +You will be given information about a user's goal and details. + +Your job is to answer the user's questions based on the information provided. + +The user's questions will be provided in JSON format. + +Your answers should be direct and to the point. No need to explain the answer. + +Your response should be in JSON format. Basically fill in the answer part and return the JSON. + +User's goal: {{ navigation_goal }} + +User's details: {{ navigation_payload }} + +User's questions: {{ queries_and_answers }} + +YOUR RESPONSE HAS TO BE IN JSON FORMAT. DO NOT RETURN ANYTHING ELSE. +THESE ANSWERS WILL BE USED TO FILL OUT INFORMATION ON A WEBPAGE. DO NOT INCLUDE ANY UNRELATED INFORMATION OR UNNECESSARY DETAILS IN YOUR ANSWERS. + +EXAMPLE RESPONSE FORMAT: +{ + "question_1": "answer_1", + "question_2": "answer_2", + "question_3": "answer_3" +} + + diff --git a/skyvern/forge/prompts/ollama/auto-completion-choose-option.j2 b/skyvern/forge/prompts/ollama/auto-completion-choose-option.j2 new file mode 100644 index 0000000000..f917aebd1c --- /dev/null +++ b/skyvern/forge/prompts/ollama/auto-completion-choose-option.j2 @@ -0,0 +1,65 @@ +You are an auto-completion selection expert. Let's analyze the available options carefully. + +Follow these steps: +1. 
First, confirm if auto-completion is active by checking for: + - Multiple suggestions appearing + - Even "No results" messages count as attempts +2. Then, if suggestions exist: + - Check each option against user's goal + - Verify the option has a valid element ID + - Evaluate how well it matches the context + +Remember: +- Only use existing element IDs +- Ignore non-meaningful messages +- Consider the user's specific goal + +There is an input element on an HTML page. Based on the context and information provided, you have two goals: + - Confirm if an auto-completion attempt appears after the user inputs the current value. + - If auto-completion suggestions appear, assist the user in selecting the most appropriate element based on the user's goal, details, and the context. + +You can confirm an auto-completion attempt based on the following rules: + - Several auto-completion suggestions appear for the input value. + - Although messages like "No results" and "No match" mean no option was matched, they still indicate an attempt to generate auto-completion suggestions. + +You must identify a potential auto-completion suggestion based on the following rules: + - The option must be an element with an ID from the provided "HTML elements". Do not create or assume options outside of these elements. + - The content of the option must be meaningful. Do not consider non-message indicators like "No results" or "No match" as valid options. + +MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. +Each interactable element is tagged with an ID. + +Reply in JSON format with the following keys: +{ + "auto_completion_attempt": bool, // True if there's any auto completion attempt based on the rules. Otherwise, it should be False. + "reasoning": str, // The reasoning behind the decision. Be specific, referencing the value and the element id in your reasoning. Mention why you chose the element id. 
Keep the reasoning short and to the point. + "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence. + "relevance_float": float, // The relevance between the selected element and the provided information. You should consider how much the selected option is related to the user goal, the user details and the context. Pick a number between 0.00 and 1.00. 0.00 means no relevance, 1.00 means full relevance, the precision is 0.01. + "value": str, // The value to select. + "id": str, // The id of the most relevant and interactable element to take the action. The id must be from "HTML elements". It should be null if no element is relevant or there's no auto completion suggestion. +} + +Context: +``` +Choose an auto-completion suggestion for "{{ field_information }}" +``` + +Input value: +``` +{{ filled_value }} +``` + +User goal: +``` +{{ navigation_goal }} +``` + +User details: +``` +{{ navigation_payload_str }} +``` + +HTML elements: +``` +{{ elements }} +``` \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/auto-completion-potential-answers.j2 b/skyvern/forge/prompts/ollama/auto-completion-potential-answers.j2 new file mode 100644 index 0000000000..a472193ba3 --- /dev/null +++ b/skyvern/forge/prompts/ollama/auto-completion-potential-answers.j2 @@ -0,0 +1,43 @@ +You're doing an auto completion input action on an HTML page. The current filled value doesn't match any option. +Based on the context and current value, give ten most potential values with the same meaning as the current value. +You can provide values like: + - Subset or superset meaning from the current value + - Summarized from the current value + - Remove too detailed information, making more general and concise +But don't add any extra information to the value. + +You are a creative suggestion generator. Let's find alternative ways to express the current value. + +For each suggestion, carefully: +1. 
Consider if it's more general or specific +2. Evaluate if it maintains the core meaning +3. Remove unnecessary details +4. Rate its relevance to the original + +Remember: +- Keep suggestions concise +- Don't add new information +- Focus on clarity and simplicity +- Order by relevance + +MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. +Reply in JSON format with the following keys: +{ + "potential_values": [ + { + "reasoning": str, // the reasoning why you recommend this value, including the relationship between the value you recommend and the current value. Keep the reasoning short and to the point. + "relevance_float": float, // The relevance between the target value and the element. Pick a number between 0.00 and 1.00. 0.00 means no relevance, 1.00 means full relevance, the precision is 0.01. + "value": str, // the value you recommend + } + ], // The list of potential values. Sorted by the descending order of relevance_float +} + +Context: +``` +Choose an auto-completion suggestion for "{{ field_information }}" +``` + +Current Value: +``` +{{ current_value }} +``` \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/auto-completion-tweak-value.j2 b/skyvern/forge/prompts/ollama/auto-completion-tweak-value.j2 new file mode 100644 index 0000000000..cb879117e0 --- /dev/null +++ b/skyvern/forge/prompts/ollama/auto-completion-tweak-value.j2 @@ -0,0 +1,56 @@ +You are an auto-completion optimization assistant. Let's work together to find the best input value. + +I will provide: +1. The current value that didn't work +2. Previously tried values +3. Any popup suggestions that appeared + +Your task is to carefully: +1. Analyze any popup elements for patterns +2. Consider how to modify the current value +3. 
Suggest a refined version that might work better + +Important rules: +- Never copy exact popup suggestions +- Make minimal, logical changes +- Keep the same core meaning +- Explain your reasoning clearly + +You're doing an auto completion input action on HTML page. User has tried several values, but none of them could find a match. +Based on the context, current value, tried values, option elements popped up while typing, tweak the value into a reasonable one based on the information. +You can try to change the value under the following rules: + 1. the value must be reasonably changed from the current value, like superset, subset of the current value + 2. If there're popped up elements, find the common concept among all elements, and then tweak the current value into a reasonable value based on the same concept. + +Don't add any extra information to the value. +Don't use any value from the popped up elements. + +MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. +Reply in JSON format with the following keys: +{ + "is_any_popped_up_elements": bool, // if there's any popped up elements to extract the concept + "common_concept": str, // Simple words to describe the common concept among all elements. null if there's no popped up elements. + "reasoning": str, // The reasoning behind the change. Be specific, referencing tweaked value in your reasoning. Mention why you make this decision. Keep the reasoning short and to the point. + "confidence_float": float, // The confidence of the decision. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence + "tweaked_value": str, // the value tweaked from current value. 
If common_concept is not null, the value should also be under the same concept +} + +Context: +``` +Choose an auto-completion suggestion for "{{ field_information }}" +``` + +Current Value: +``` +{{ current_value }} +``` + +Tried Values: +``` +{{ tried_values }} +``` + +Popped up elements: +``` +{{ popped_up_elements }} +``` \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/check-user-goal.j2 b/skyvern/forge/prompts/ollama/check-user-goal.j2 new file mode 100644 index 0000000000..e08181c10f --- /dev/null +++ b/skyvern/forge/prompts/ollama/check-user-goal.j2 @@ -0,0 +1,34 @@ +You are a web navigation assistant. Your task is to analyze if a user has achieved their goal on a webpage. + +Follow these precise steps: +1. First, carefully read the user's goal and details +2. Then, examine all visible elements on the page +3. Look for specific evidence of goal completion +4. Document your reasoning step-by-step + +Remember: +- Focus only on visible elements and text +- Look for concrete evidence, not assumptions +- Be specific in your reasoning + +Note: Validate your response carefully. It must be a single JSON object with exactly these three fields: +- page_info: Clear description of relevant page elements +- thoughts: Step-by-step analysis of goal completion +- user_goal_achieved: Boolean conclusion based on evidence + +Make sure to ONLY return the JSON object in this format with no additional text before or after it: +```json +{ + "page_info": str, // Think step by step. Describe all the useful information in the page related to the user goal. + "thoughts": str, // Think step by step. What information makes you believe whether the user goal has been completed or not. Use information you see on the site to explain. + "user_goal_achieved": bool // True if the user goal has been completed, false otherwise. 
+} +``` +Elements on the page: +{{ elements }} + +User Goal: +{{ navigation_goal }} + +User Details: +{{ navigation_payload }} diff --git a/skyvern/forge/prompts/ollama/css-shape-convert.j2 b/skyvern/forge/prompts/ollama/css-shape-convert.j2 new file mode 100644 index 0000000000..43750d7a35 --- /dev/null +++ b/skyvern/forge/prompts/ollama/css-shape-convert.j2 @@ -0,0 +1,21 @@ +You are a visual analysis assistant. I will show you a screenshot of an HTML element, and you need to analyze its shape carefully. + +Follow these steps: +1. Look at the element's overall geometric form +2. Identify any distinctive features (rounded corners, sharp edges, etc.) +3. Determine what this shape typically represents in UI design +4. Rate your confidence in this assessment + +Important: Your response must be in valid JSON format with no extra text. +Example of valid response: +{ + "confidence_float": 0.95, + "shape": "rounded rectangle button typical of call-to-action elements" +} + +MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. +Reply in JSON format with the following keys: +{ + "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence + "shape": string, // A short description of the shape of the element and its meaning +} \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/custom-select.j2 b/skyvern/forge/prompts/ollama/custom-select.j2 new file mode 100644 index 0000000000..bc0ab98f32 --- /dev/null +++ b/skyvern/forge/prompts/ollama/custom-select.j2 @@ -0,0 +1,51 @@ +Help select an option from a {{ "multi-level" if select_history else "single" }} selection menu. + +IMPORTANT: +1. Return ONLY a JSON object +2. Use only valid options from the elements list +3. Don't select placeholders or loading indicators +4. 
Required fields must be filled + +Selection Context: +Field: "{{ field_information }}" +Required: {{ "yes" if required_field else "no" }} +{% if target_value %}Target Value: {{ target_value }}{% endif %} + +Required JSON fields: +- reasoning: Why this option was chosen +- confidence_float: 0.0 to 1.0 +- id: Element ID from the list +- action_type: "CLICK" or "INPUT_TEXT" +- value: Selected value +{% if target_value %}- relevant: Whether selection matches target{% endif %} + +Context: +``` +Select an option for "{{ field_information }}". It's {{ "a required" if required_field else "an optional" }} field. +``` +{% if target_value %} +Target value: +``` +{{ target_value }} +``` +{% endif %} +User goal: +``` +{{ navigation_goal }} +``` + +User details: +``` +{{ navigation_payload_str }} +``` + +HTML elements: +``` +{{ elements }} +``` +{% if select_history %} +Select History: +``` +{{ select_history }} +``` +{% endif %} \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/extract-action.j2 b/skyvern/forge/prompts/ollama/extract-action.j2 new file mode 100644 index 0000000000..1c80561509 --- /dev/null +++ b/skyvern/forge/prompts/ollama/extract-action.j2 @@ -0,0 +1,32 @@ +Determine browser actions to achieve a user goal. + +IMPORTANT: +1. Output ONLY a JSON object +2. Use only existing elements +3. Prioritize red warnings and popups +4. Return COMPLETE when goal achieved +5. Return TERMINATE only if goal impossible + +Required JSON Structure: +{ + "user_goal_stage": string description of progress, + "user_goal_achieved": boolean, + "action_plan": string description or "COMPLETE"/"TERMINATE", + "actions": array of action objects +} + +// ...existing code for action object structure... + +Current State: +1. URL: {{ current_url }} +2. Goal: {{ navigation_goal }} +{% if data_extraction_goal %} +3. Data Goal: {{ data_extraction_goal }} +{% endif %} +4. User Details: {{ navigation_payload_str }} +5. 
Time (UTC): {{ utc_datetime }} + +{% if action_history %} +Previous Actions: +{{ action_history }} +{% endif %} diff --git a/skyvern/forge/prompts/ollama/extract-information.j2 b/skyvern/forge/prompts/ollama/extract-information.j2 new file mode 100644 index 0000000000..dee3e29142 --- /dev/null +++ b/skyvern/forge/prompts/ollama/extract-information.j2 @@ -0,0 +1,26 @@ +Extract information from a webpage screenshot. + +IMPORTANT: +1. Return ONLY a JSON object +2. Follow the exact schema format +3. Use null for missing information +4. No additional text or explanations + +{% if extracted_information_schema %} +Required Schema: +{{ extracted_information_schema }} +{% endif %} + +Data Extraction Goal: {{ data_extraction_goal }} + +Available Information: +1. URL: {{ current_url }} +2. DOM Elements: {{ elements }} +3. Page Text: {{ extracted_text }} +4. User Data: {{ navigation_payload }} +5. Current UTC Time: {{ utc_datetime }} + +{% if error_code_mapping_str %} +Error Codes: +{{ error_code_mapping_str }} +{% endif %} \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/generate-task.j2 b/skyvern/forge/prompts/ollama/generate-task.j2 new file mode 100644 index 0000000000..1cfd4d0afc --- /dev/null +++ b/skyvern/forge/prompts/ollama/generate-task.j2 @@ -0,0 +1,24 @@ +Your task is to create a browser automation task definition. + +IMPORTANT: +1. Output ONLY a JSON object +2. No explanations or additional text +3. All URLs must use HTTPS +4. At least one goal (navigation or data extraction) must be provided + +Required Fields: +1. url (string): Starting webpage URL +2. suggested_title (string): Brief task description +3. navigation_goal_reasoning (string): Why navigation is needed +4. is_navigation_goal_required (boolean): Whether navigation is needed +5. data_extraction_goal_reasoning (string): Why data extraction is needed +6. is_data_extraction_goal_required (boolean): Whether data extraction is needed + +Optional Fields (use null if not needed): +1. 
navigation_goal (string): Include "COMPLETE when..." criteria +2. data_extraction_goal (string): Data to extract +3. navigation_payload (json): Form values or parameters + +Create a task for this prompt: +``` +{{ user_prompt }} diff --git a/skyvern/forge/prompts/ollama/opened-dropdown-confirm.j2 b/skyvern/forge/prompts/ollama/opened-dropdown-confirm.j2 new file mode 100644 index 0000000000..9331f5e4c2 --- /dev/null +++ b/skyvern/forge/prompts/ollama/opened-dropdown-confirm.j2 @@ -0,0 +1,12 @@ +Check if a screenshot shows an open dropdown menu. + +IMPORTANT: Return ONLY a JSON object. No other text. + +A dropdown menu is considered open if: +1. At least one valid option is visible +2. "No results" or "No match" messages don't count +3. Placeholders like "Please select" don't count + +Required JSON fields: +- reasoning: Why you determined it is/isn't a dropdown +- is_opened_dropdown_menu: true/false \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/parse-input-or-select-context.j2 b/skyvern/forge/prompts/ollama/parse-input-or-select-context.j2 new file mode 100644 index 0000000000..cb3f381d72 --- /dev/null +++ b/skyvern/forge/prompts/ollama/parse-input-or-select-context.j2 @@ -0,0 +1,19 @@ +You are analyzing an INPUT or SELECT element with ID "{{ element_id }}". + +IMPORTANT: +1. Respond ONLY with a JSON object +2. No additional text or explanations +3. 
No markdown formatting + +Required JSON fields: +- thought: How you verified the information +- field: Which field this action will fill +- is_required: true/false for required field +- is_search_bar: true/false for search functionality + +Context Information: +{{ action_reasoning }} + +Available Elements: +{{ elements }} + diff --git a/skyvern/forge/prompts/ollama/summarize-max-steps-reason.j2 b/skyvern/forge/prompts/ollama/summarize-max-steps-reason.j2 new file mode 100644 index 0000000000..f108f3c48b --- /dev/null +++ b/skyvern/forge/prompts/ollama/summarize-max-steps-reason.j2 @@ -0,0 +1,17 @@ +Your task is to explain why a user goal was not achieved within {{ step_count }} steps. + +IMPORTANT: Respond ONLY with a JSON object. No other text. + +Required JSON fields: +- page_info: Describe all page information related to the user goal +- reasoning: Short explanation of why the goal wasn't achieved + +Here are the details to consider: + +Goal: {{ navigation_goal }} + +User Details: {{ navigation_payload }} + +Steps Taken: +{% for step in steps %}Step {{ step.order }} -- {{ step.actions_result }} +{% endfor %} \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/svg-convert.j2 b/skyvern/forge/prompts/ollama/svg-convert.j2 new file mode 100644 index 0000000000..3baa40a80e --- /dev/null +++ b/skyvern/forge/prompts/ollama/svg-convert.j2 @@ -0,0 +1,16 @@ +You will analyze an SVG element and return information about its shape. + +IMPORTANT: You must respond ONLY with a JSON object. Do not add any other text. +Do not add explanations. Do not use markdown. + +Input SVG Element: +``` +{{svg_element}} +``` + +MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. +Reply in JSON format with the following keys: +{ + "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 
0.0 means no confidence, 1.0 means full confidence + "shape": string, // A short description of the shape of SVG and its meaning +} \ No newline at end of file diff --git a/skyvern/forge/sdk/api/llm/utils.py b/skyvern/forge/sdk/api/llm/utils.py index 3780c09b5b..e05582afb0 100644 --- a/skyvern/forge/sdk/api/llm/utils.py +++ b/skyvern/forge/sdk/api/llm/utils.py @@ -11,6 +11,7 @@ LOG = structlog.get_logger() + async def llm_messages_builder( prompt: str, screenshots: list[bytes] | None = None, @@ -18,46 +19,29 @@ async def llm_messages_builder( is_llama: bool = False, ) -> list[dict[str, Any]]: if is_llama: - # Much stricter system message system_message = { "role": "system", "content": ( - "CRITICAL INSTRUCTION: You are a PURE JSON bot. You must NEVER write prose or explanations.\n\n" - "NO MATTER WHAT IS ASKED:\n" - "1. ALWAYS respond with actions array\n" - "2. NEVER write explanations or text\n" - "3. ONLY valid responses are:\n" - "{\"actions\": [{\"type\": \"analyze\", \"element\": \"...\"}, ...]}\n" - "{\"actions\": [{\"type\": \"click\", \"element\": \"...\"}, ...]}\n" - "{\"actions\": [{\"type\": \"input\", \"element\": \"...\", \"value\": \"...\"}, ...]}\n\n" - "Even if asked for analysis, description, or explanation, ONLY respond with actions JSON.\n" - "Even if the question seems general, ONLY respond with actions JSON.\n" - "NEVER use markdown. NEVER explain. NEVER add notes.\n\n" - "CORRECT:\n" - "{\"actions\":[{\"type\":\"analyze\",\"element\":\"search box for part lookup\"}]}\n\n" - "INCORRECT:\n" - "Here's what I found...\n" - "Let me explain...\n" - "The webpage shows...\n" - "```json\n{...}```" + "CRITICAL INSTRUCTION: You are a JSON-only assistant. 
DO NOT provide explanations or help.\n" ) } - # Build content array + # Build content array with images first content = [] if screenshots: for screenshot in screenshots: encoded_image = base64.b64encode(screenshot).decode("utf-8") + # Use ollama's native format without data URI prefix content.append({ "type": "image", "data": encoded_image, "format": "png" }) - # Force action-based response in prompt + # Add text prompt last content.append({ - "type": "text", - "text": f"{prompt} RESPOND ONLY WITH ACTIONS JSON." + "type": "text", + "text": f"{prompt} OUTPUT JSON ONLY." }) return [ @@ -92,7 +76,6 @@ async def llm_messages_builder( }) return messages - def parse_api_response(response: litellm.ModelResponse, add_assistant_prefix: bool = False, is_llama: bool = False) -> dict[str, Any]: """Parse the response from the LLM API into a dictionary. @@ -106,39 +89,161 @@ def parse_api_response(response: litellm.ModelResponse, add_assistant_prefix: bo """ content = None try: - content = response.choices[0].message.content.strip() + content = response.choices[0].message.content if add_assistant_prefix: content = "{" + content - # For Llama responses, try to extract just the JSON - if is_llama: - # Find anything that looks like a JSON object - json_pattern = r"\{[^{}]*\}" - matches = re.finditer(json_pattern, content) - # Try each match until we find valid JSON - for match in matches: - try: - return commentjson.loads(match.group(0)) - except: - continue - - # If no valid JSON found in matches, try the stripped content - if content.startswith("{") and content.endswith("}"): - try: - return commentjson.loads(content) - except: - pass - - raise ValueError("No valid JSON found in response") - - # For non-Llama models, use original parsing + # Extract JSON if wrapped in markdown (for Llama) + if is_llama and content.strip().startswith("```"): + json_pattern = r"```(?:json)?\s*([\s\S]*?)\s*```" + match = re.search(json_pattern, content, re.MULTILINE) + if match: + content = 
match.group(1).strip() + return commentjson.loads(content) except Exception as e: LOG.error("Failed to parse LLM response.", content=content) raise InvalidLLMResponseFormat(content) from e +def fix_cutoff_json(json_string: str, error_position: int) -> dict[str, Any]: + """ + Fixes a cutoff JSON string by ignoring the last incomplete action and making it a valid JSON. + + Args: + json_string (str): The cutoff JSON string to process. + error_position (int): The position of the error in the JSON string. + + Returns: + str: The fixed JSON string. + """ + LOG.info("Fixing cutoff JSON string.") + try: + # Truncate the string to the error position + truncated_string = json_string[:error_position] + # Find the last valid action + last_valid_action_pos = truncated_string.rfind("},") + if last_valid_action_pos != -1: + # Remove the incomplete action + fixed_string = truncated_string[: last_valid_action_pos + 1] + "\n ]\n}" + return commentjson.loads(fixed_string) + else: + # If no valid action found, return an empty actions list + LOG.warning("No valid action found in the cutoff JSON string.") + return {"actions": []} + except Exception as e: + raise InvalidLLMResponseFormat(json_string) from e def fix_unescaped_quotes_in_json(json_string: str) -> str: - """Fix unescaped quotes in JSON string.""" - escape_ \ No newline at end of file + """ + Extracts the positions of quotation marks that define the JSON structure + and the strings between them, handling unescaped quotation marks within strings. + + Args: + json_string (str): The JSON-like string to process. + + Returns: + str: The JSON-like string with unescaped quotation marks within strings. + """ + escape_char = "\\" + # Indices to add the escape character to. Since we're processing the string from left to right, we need to sort + # the indices in descending order to avoid index shifting. 
+ indices_to_add_escape_char = [] + in_string = False + escape = False + json_structure_chars = {",", ":", "}", "]", "{", "["} + + i = 0 + while i < len(json_string): + char = json_string[i] + if char == escape_char: + escape = not escape + elif char == '"' and not escape: + if in_string: + # Check if the next non-whitespace character is a JSON structure character + j = i + 1 + # Skip whitespace characters + while j < len(json_string) and json_string[j].isspace(): + j += 1 + if j < len(json_string) and json_string[j] in json_structure_chars: + # If the next character is a JSON structure character, the quote is the end of the JSON string + in_string = False + else: + # If the next character is not a JSON structure character, the quote is part of the string + # Update the indices to add the escape character with the current index + indices_to_add_escape_char.append(i) + else: + # Start of the JSON string + in_string = True + else: + escape = False + i += 1 + + # Sort the indices in descending order to avoid index shifting then add the escape character to the string + if indices_to_add_escape_char: + LOG.warning("Unescaped quotes found in JSON string. Adding escape character to fix the issue.") + indices_to_add_escape_char.sort(reverse=True) + for index in indices_to_add_escape_char: + json_string = json_string[:index] + escape_char + json_string[index:] + + return json_string + +def fix_and_parse_json_string(json_string: str) -> dict[str, Any]: + """ + Auto-fixes a JSON string by escaping unescaped quotes and ignoring the last action if the JSON is cutoff. + + Args: + json_string (str): The JSON string to process. + + Returns: + dict[str, Any]: The parsed JSON object. + """ + + LOG.info("Auto-fixing JSON string.") + # Escape unescaped quotes in the JSON string + json_string = fix_unescaped_quotes_in_json(json_string) + try: + # Attempt to parse the JSON string + return commentjson.loads(json_string) + except Exception: + LOG.warning("Failed to parse JSON string. 
Attempting to fix the JSON string.") + try: + # This seems redundant but we're doing this to get error position. Comment json doesn't return that + return json.loads(json_string) + except json.JSONDecodeError as e: + error_position = e.pos + # Try to fix the cutoff JSON string and see if it can be parsed + return fix_cutoff_json(json_string, error_position) + +def try_to_extract_json_from_markdown_format(text: str) -> str: + pattern = r"```json\s*(.*?)\s*```" + match = re.search(pattern, text, re.DOTALL) + if match: + return match.group(1) + else: + return text + +def try_to_extract_json_from_markdown_format_llama(text: str) -> str: + """Extract JSON content from markdown code blocks. + This is particularly useful for models like Llama that may wrap their JSON responses. + + Args: + text (str): The text to process, which may contain JSON in markdown blocks + + Returns: + str: The extracted JSON string, or the original text if no JSON found + """ + # First try to extract from ```json blocks + json_pattern = r"```(?:json)?\s*([\s\S]*?)\s*```" + match = re.search(json_pattern, text, re.MULTILINE) + if match: + return match.group(1).strip() + + # If no code blocks found, try to extract anything that looks like a JSON object + json_object_pattern = r"\{[\s\S]*?\}" # Non-greedy match for nested objects + match = re.search(json_object_pattern, text) + if match: + return match.group(0) + + # If no JSON-like content found, return original text + return text From 44a04cd5069fb95fcd828f7fcb6772c2017b850d Mon Sep 17 00:00:00 2001 From: Cole Stasney Date: Sat, 9 Nov 2024 13:18:57 -0700 Subject: [PATCH 5/9] Refactor of prompts. Temporary disabling of skyvern prompts for Llama testing.
--- skyvern/forge/prompts.py | 3 +- .../ollama/answer-user-detail-questions.j2 | 68 +++++----- .../ollama/auto-completion-choose-option.j2 | 120 +++++++++--------- .../auto-completion-potential-answers.j2 | 78 +++++++----- .../ollama/auto-completion-tweak-value.j2 | 105 ++++++++------- .../forge/prompts/ollama/check-user-goal.j2 | 75 +++++++---- .../forge/prompts/ollama/css-shape-convert.j2 | 57 ++++++--- skyvern/forge/prompts/ollama/custom-select.j2 | 105 ++++++++------- .../forge/prompts/ollama/extract-action.j2 | 113 +++++++++++++---- .../prompts/ollama/extract-information.j2 | 72 ++++++++--- skyvern/forge/prompts/ollama/generate-task.j2 | 89 +++++++++---- .../prompts/ollama/opened-dropdown-confirm.j2 | 54 ++++++-- .../ollama/parse-input-or-select-context.j2 | 63 +++++++-- .../ollama/summarize-max-steps-reason.j2 | 60 +++++++-- skyvern/forge/prompts/ollama/svg-convert.j2 | 21 ++- skyvern/forge/sdk/api/llm/utils.py | 2 +- 16 files changed, 695 insertions(+), 390 deletions(-) diff --git a/skyvern/forge/prompts.py b/skyvern/forge/prompts.py index ce836d93ec..cb7d42a9d4 100644 --- a/skyvern/forge/prompts.py +++ b/skyvern/forge/prompts.py @@ -1,4 +1,5 @@ from skyvern.forge.sdk.prompting import PromptEngine # Initialize the prompt engine -prompt_engine = PromptEngine("skyvern") +prompt_engine = PromptEngine("ollama") +prompt_engine_llama = PromptEngine("ollama") diff --git a/skyvern/forge/prompts/ollama/answer-user-detail-questions.j2 b/skyvern/forge/prompts/ollama/answer-user-detail-questions.j2 index fcce1ebcf8..2d99c81c95 100644 --- a/skyvern/forge/prompts/ollama/answer-user-detail-questions.j2 +++ b/skyvern/forge/prompts/ollama/answer-user-detail-questions.j2 @@ -1,43 +1,45 @@ -You are a precise question-answering assistant. Let's work through these questions systematically. +You are a JSON API endpoint that answers questions based on user details and goals. API endpoints ONLY return data - no explanations allowed. -For each question: -1. 
Read the question carefully -2. Look for relevant information in: - - User's goal - - User's details -3. Provide only the exact information needed -4. Format as key-value pairs in JSON - -Critical rules: -- Answer directly - no explanations -- Include only requested information -- Use exact values from provided details -- Maintain strict JSON format - -You will be given information about a user's goal and details. - -Your job is to answer the user's questions based on the information provided. - -The user's questions will be provided in JSON format. - -Your answers should be direct and to the point. No need to explain the answer. - -Your response should be in JSON format. Basically fill in the answer part and return the JSON. +Purpose: +- Answer user questions based on provided information +- Use exact information from user details +- Keep answers direct and concise +- Fill in answers as JSON key-value pairs +Input data: User's goal: {{ navigation_goal }} - User's details: {{ navigation_payload }} - User's questions: {{ queries_and_answers }} -YOUR RESPONSE HAS TO BE IN JSON FORMAT. DO NOT RETURN ANYTHING ELSE. -THESE ANSWERS WILL BE USED TO FILL OUT INFORMATION ON A WEBPAGE. DO NOT INCLUDE ANY UNRELATED INFORMATION OR UNNECESSARY DETAILS IN YOUR ANSWERS. - -EXAMPLE RESPONSE FORMAT: +Instructions for answering: +1. Read each question carefully +2. Find relevant information in user's goal and details +3. Provide only the exact information needed +4. Include answers in the JSON response +5. Keep answers direct - no explanations +6. Use precise values from provided details + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. NO additional formatting or whitespace +6. 
Response must be pure JSON only + +Response format (replace with actual answers): { - "question_1": "answer_1", - "question_2": "answer_2", - "question_3": "answer_3" + "question_1": "", + "question_2": "", + "question_3": "" } +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +These answers will be used to fill out information on a webpage automatically. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/auto-completion-choose-option.j2 b/skyvern/forge/prompts/ollama/auto-completion-choose-option.j2 index f917aebd1c..86d595e142 100644 --- a/skyvern/forge/prompts/ollama/auto-completion-choose-option.j2 +++ b/skyvern/forge/prompts/ollama/auto-completion-choose-option.j2 @@ -1,65 +1,61 @@ -You are an auto-completion selection expert. Let's analyze the available options carefully. - -Follow these steps: -1. First, confirm if auto-completion is active by checking for: - - Multiple suggestions appearing - - Even "No results" messages count as attempts -2. Then, if suggestions exist: - - Check each option against user's goal - - Verify the option has a valid element ID - - Evaluate how well it matches the context - -Remember: -- Only use existing element IDs -- Ignore non-meaningful messages -- Consider the user's specific goal - -There is an input element on an HTML page. Based on the context and information provided, you have two goals: - - Confirm if an auto-completion attempt appears after the user inputs the current value. - - If auto-completion suggestions appear, assist the user in selecting the most appropriate element based on the user's goal, details, and the context. - -You can confirm an auto-completion attempt based on the following rules: - - Several auto-completion suggestions appear for the input value. 
- - Although messages like "No results" and "No match" mean no option was matched, they still indicate an attempt to generate auto-completion suggestions. - -You must identify a potential auto-completion suggestion based on the following rules: - - The option must be an element with an ID from the provided "HTML elements". Do not create or assume options outside of these elements. - - The content of the option must be meaningful. Do not consider non-message indicators like "No results" or "No match" as valid options. - -MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. -Each interactable element is tagged with an ID. - -Reply in JSON format with the following keys: +You are a JSON API endpoint for auto-completion analysis. API endpoints ONLY return data - no explanations allowed. + +Purpose: +- Analyze auto-completion attempts for input fields +- Evaluate suggested options against user goals +- Select the most appropriate option +- Return analysis in strict JSON format + +Auto-completion Detection Rules: +1. Count as attempt if: + - Multiple suggestions appear + - Even "No results" messages indicate an attempt +2. Valid suggestions must: + - Have an ID from provided HTML elements + - Contain meaningful content (not just "No results") + - Match user goals and context + +Analysis Requirements: +1. Check for auto-completion presence +2. Evaluate suggestion relevance +3. Consider user goals and details +4. Select best matching element +5. Provide confidence ratings +6. Use only existing element IDs + +Input Data: +Context: Choose an auto-completion suggestion for "{{ field_information }}" +Input value: {{ filled_value }} +User goal: {{ navigation_goal }} +User details: {{ navigation_payload_str }} +HTML elements: {{ elements }} + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. 
NO explanations, notes, or comments +5. Response must be pure JSON only +6. Use exact format specified below + +Required Response Format: { - "auto_completion_attempt": bool, // True if there's any auto completion attempt based on the rules. Otherwise, it should be False. - "reasoning": str, // The reasoning behind the decision. Be specific, referencing the value and the element id in your reasoning. Mention why you chose the element id. Keep the reasoning short and to the point. - "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence. - "relevance_float": float, // The relative between the selected element and the provided information. You should consider how much the selected option is related to the user goal, the user details and the context. Pick a number between 0.00 and 1.00. 0.00 means no relevance, 1.00 means full relevance, the precision is 0.01. - "value": str, // The value to select. - "id": str, // The id of the most relevant and interactable element to take the action. The id must be from "HTML elements". It should be null if no element is relative or there's no auto completion suggestion. 
+ "auto_completion_attempt": false, // true if attempt detected + "reasoning": "", // brief reason for decision + "confidence_float": 0.0, // 0.0 to 1.0 + "relevance_float": 0.0, // 0.00 to 1.00 + "value": "", // selected value + "id": null // element ID or null } -Context: -``` -Choose an auto-completion suggestion for "{{ field_information }}" -``` - -Input value: -``` -{{ filled_value }} -``` - -User goal: -``` -{{ navigation_goal }} -``` - -User details: -``` -{{ navigation_payload_str }} -``` - -HTML elements: -``` -{{ elements }} -``` \ No newline at end of file +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing or extra fields +- Invalid value types + +This response will be used for automated webpage interaction. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/auto-completion-potential-answers.j2 b/skyvern/forge/prompts/ollama/auto-completion-potential-answers.j2 index a472193ba3..f51af90a6d 100644 --- a/skyvern/forge/prompts/ollama/auto-completion-potential-answers.j2 +++ b/skyvern/forge/prompts/ollama/auto-completion-potential-answers.j2 @@ -1,43 +1,57 @@ -You're doing an auto completion input action on HTML page. The current filled value doesn't match any option. -Based on the context and current value, give ten most potential values with the same meaning as the current value. -You can provide values like: - - Subset or superset meaning from the current value - - Summarized from the current value - - Remove too detailed information, making more general and concise -But don't add any extra information to the value. +You are a JSON API endpoint for generating alternative input values. API endpoints ONLY return data - no explanations allowed. -You are a creative suggestion generator. Let's find alternative ways to express the current value. 
+Purpose: +- Generate 10 alternative values for failed auto-completion +- Maintain same core meaning as original value +- Provide variations that might match system expectations +- Return strictly formatted JSON array of options -For each suggestion, carefully: -1. Consider if it's more general or specific -2. Evaluate if it maintains the core meaning -3. Remove unnecessary details -4. Rate its relevance to the original +Value Generation Rules: +1. Create variations by: + - Using subset of original value + - Using superset of original value + - Summarizing original value + - Removing unnecessary details +2. Each variation must: + - Keep core meaning intact + - Not add new information + - Be more concise when possible +3. Order by relevance (highest to lowest) -Remember: -- Keep suggestions concise -- Don't add new information -- Focus on clarity and simplicity -- Order by relevance +Input Data: +Context: Choose an auto-completion suggestion for "{{ field_information }}" +Current Value: {{ current_value }} -MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. -Reply in JSON format with the following keys: +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. Exactly 10 variations required + +Required Response Format: { "potential_values": [ { - "reasoning": str, // the reasoning why you recommend this value, including the relationship between the value you recommend and the current value. Keep the reasoning short and to the point. - "relevance_float": float, // The relative between the target value and the element. Pick a number between 0.00 and 1.00. 0.00 means no relevance, 1.00 means full relevance, the precision is 0.01. 
- "value": str, // the value you recommend + "reasoning": "", // brief explanation of relationship to original + "relevance_float": 0.00, // 0.00 to 1.00, two decimal places + "value": "" // alternative value } - ], // The list of potential values. Sorted by the descending order of relevance_float + // Repeat for total of 10 values + ] } -Context: -``` -Choose an auto-completion suggestion for "{{ field_information }}" -``` +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Fewer or more than 10 values +- Missing or extra fields +- Invalid value types +- Invalid relevance range -Current Value: -``` -{{ current_value }} -``` \ No newline at end of file +This response will be used for automated value suggestion. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/auto-completion-tweak-value.j2 b/skyvern/forge/prompts/ollama/auto-completion-tweak-value.j2 index cb879117e0..4d93afb1b6 100644 --- a/skyvern/forge/prompts/ollama/auto-completion-tweak-value.j2 +++ b/skyvern/forge/prompts/ollama/auto-completion-tweak-value.j2 @@ -1,56 +1,55 @@ -You are an auto-completion optimization assistant. Let's work together to find the best input value. - -I will provide: -1. The current value that didn't work -2. Previously tried values -3. Any popup suggestions that appeared - -Your task is to carefully: -1. Analyze any popup elements for patterns -2. Consider how to modify the current value -3. Suggest a refined version that might work better - -Important rules: -- Never copy exact popup suggestions -- Make minimal, logical changes -- Keep the same core meaning -- Explain your reasoning clearly - -You're doing an auto completion input action on HTML page. User has tried several values, but none of them could find a match. 
-Based on the context, current value, tried values, option elements popped up while typing, tweak the value into a reasonable one based on the information. -You can try to change the value under the following rules: - 1. the value must be reasonably changed from the current value, like superset, subset of the current value - 2. If there're popped up elements, find the common concept among all elements, and then tweak the current value into a reasonable value based on the same concept. - -Don't add any extra information to the value. -Don't use any value from the popped up elements. - -MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. -Reply in JSON format with the following keys: +You are a JSON API endpoint for value refinement after failed auto-completions. API endpoints ONLY return data - no explanations allowed. + +Purpose: +- Analyze failed auto-completion attempts +- Identify patterns in popup suggestions +- Extract common concepts if present +- Generate refined input value +- Return analysis in strict JSON format + +Value Refinement Rules: +1. Current value modifications: + - Must relate to original value + - Can be subset or superset + - Must maintain core meaning +2. Popup element handling: + - Identify common patterns + - Extract shared concepts + - Don't copy exact values + - Use concept for guidance only + +Input Data: +Context: Choose an auto-completion suggestion for "{{ field_information }}" +Current Value: {{ current_value }} +Tried Values: {{ tried_values }} +Popped Elements: {{ popped_up_elements }} + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. 
Use exact format specified below + +Required Response Format: { - "is_any_popped_up_elements": bool, // if there's any popped up elements to extract the concept - "common_concept": str, // Simple words to describe the common concept among all elements. null if there's no popped up elements. - "reasoning": str, // The reasoning behind the change. Be specific, referencing tweaked value in your reasoning. Mention why you make this decision. Keep the reasoning short and to the point. - "confidence_float": float, // The confidence of the decision. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence - "tweaked_value": str, // the value tweaked from current value. If common_concept is not null, the value should also under the same concept + "is_any_popped_up_elements": false, // true if popups detected + "common_concept": null, // concept or null + "reasoning": "", // brief reason for changes + "confidence_float": 0.0, // 0.0 to 1.0 + "tweaked_value": "" // modified value } -Context: -``` -Choose an auto-completion suggestion for "{{ field_information }}" -``` - -Current Value: -``` -{{ current_value }} -``` - -Tried Values: -``` -{{ tried_values }} -``` - -Popped up elements: -``` -{{ popped_up_elements }} -``` \ No newline at end of file +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing or extra fields +- Invalid value types +- Direct copying of popup values + +This response will be used for automated value refinement. Invalid format will cause system errors. 
\ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/check-user-goal.j2 b/skyvern/forge/prompts/ollama/check-user-goal.j2 index e08181c10f..7545b486dc 100644 --- a/skyvern/forge/prompts/ollama/check-user-goal.j2 +++ b/skyvern/forge/prompts/ollama/check-user-goal.j2 @@ -1,34 +1,57 @@ -You are a web navigation assistant. Your task is to analyze if a user has achieved their goal on a webpage. +You are a JSON API endpoint for analyzing goal completion status. API endpoints ONLY return data - no explanations allowed. -Follow these precise steps: -1. First, carefully read the user's goal and details -2. Then, examine all visible elements on the page -3. Look for specific evidence of goal completion -4. Document your reasoning step-by-step +Purpose: +- Analyze webpage content against user goals +- Check if user objective is complete +- Evaluate page elements and content +- Provide structured analysis in JSON +- Return clear completion status -Remember: -- Focus only on visible elements and text -- Look for concrete evidence, not assumptions -- Be specific in your reasoning +Analysis Requirements: +1. Page Information: + - Identify relevant page elements + - Extract useful content + - Match elements to user goal + - Document key findings +2. Analysis Process: + - Compare page state to goal + - Evaluate completion criteria + - Check required elements + - Verify user details match +3. Goal Status: + - Determine if goal is met + - Provide evidence-based decision + - Use strict true/false evaluation -Note: Validate your response carefully. 
It must be a single JSON object with exactly these three fields: -- page_info: Clear description of relevant page elements -- thoughts: Step-by-step analysis of goal completion -- user_goal_achieved: Boolean conclusion based on evidence +Input Data: +Elements on page: {{ elements }} +User Goal: {{ navigation_goal }} +User Details: {{ navigation_payload }} -Make sure to ONLY return the JSON object in this format with no additional text before or after it: -```json +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. Use exact format specified below + +Required Response Format: { - "page_info": str, // Think step by step. Describe all the useful information in the page related to the user goal. - "thoughts": str, // Think step by step. What information makes you believe whether user goal has completed or not. Use information you see on the site to explain. - "user_goal_achieved": bool // True if the user goal has been completed, false otherwise. + "page_info": "", // relevant page information and findings + "thoughts": "", // analysis of goal completion evidence + "user_goal_achieved": false // true if goal completed, false if not } -Elements on the page: -{{ elements }} - -User Goal: -{{ navigation_goal }} +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing or extra fields +- Invalid value types +- Incorrect boolean format -User Details: -{{ navigation_payload }} +This response will be used for automated goal verification. Invalid format will cause system errors. 
\ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/css-shape-convert.j2 b/skyvern/forge/prompts/ollama/css-shape-convert.j2 index 43750d7a35..6f5ec65ea9 100644 --- a/skyvern/forge/prompts/ollama/css-shape-convert.j2 +++ b/skyvern/forge/prompts/ollama/css-shape-convert.j2 @@ -1,21 +1,46 @@ -You are a visual analysis assistant. I will show you a screenshot of an HTML element, and you need to analyze its shape carefully. +You are a JSON API endpoint for visual element analysis. API endpoints ONLY return data - no explanations allowed. -Follow these steps: -1. Look at the element's overall geometric form -2. Identify any distinctive features (rounded corners, sharp edges, etc.) -3. Determine what this shape typically represents in UI design -4. Rate your confidence in this assessment +Purpose: +- Analyze HTML element appearance +- Identify visual shape and meaning +- Provide confidence rating +- Return analysis in strict JSON format -Important: Your response must be in valid JSON format with no extra text. -Example of valid response: +Analysis Requirements: +1. Shape Description: + - Brief, clear description + - Include visual appearance + - Include implied meaning + - Keep description concise +2. Confidence Rating: + - Rate certainty of analysis + - Use 0.0 to 1.0 scale + - Consider clarity of shape + - Consider common usage + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. Use exact format specified below + +Required Response Format: { - "confidence_float": 0.95, - "shape": "rounded rectangle button typical of call-to-action elements" + "confidence_float": 0.0, // 0.0 to 1.0 + "shape": "" // brief description } -MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. 
-Reply in JSON format with the following keys: -{ - "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence - "shape": string, // A short description of the shape of element and its meaning -} \ No newline at end of file +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing or extra fields +- Invalid value types +- Invalid confidence range + +This response will be used for automated element classification. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/custom-select.j2 b/skyvern/forge/prompts/ollama/custom-select.j2 index bc0ab98f32..550f5429cb 100644 --- a/skyvern/forge/prompts/ollama/custom-select.j2 +++ b/skyvern/forge/prompts/ollama/custom-select.j2 @@ -1,51 +1,62 @@ -Help select an option from a {{ "multi-level" if select_history else "single" }} selection menu. +You are a JSON API endpoint for HTML element selection and input. API endpoints ONLY return data - no explanations allowed. -IMPORTANT: -1. Return ONLY a JSON object -2. Use only valid options from the elements list -3. Don't select placeholders or loading indicators -4. Required fields must be filled +Purpose: +- Perform {{ "multi-level selection" if select_history else "selection" }} on webpage +- Choose best matching element or input value +- Consider user goals and context +- Return decision in strict JSON format -Selection Context: -Field: "{{ field_information }}" -Required: {{ "yes" if required_field else "no" }} +Selection Rules: +1. 
Element Matching: + - Match to user goal and details + - Consider fallback options if needed + - Never select placeholders + - Skip loading indicators + - Required fields must have value{% if select_history %} + - Consider selection history + - Complete multi-level process{% endif %} + +2. Action Types: + - CLICK: Select existing option + - INPUT_TEXT: Search only if no valid options + +Input Data: +Context: Select an option for "{{ field_information }}" ({{ "required" if required_field else "optional" }}) {% if target_value %}Target Value: {{ target_value }}{% endif %} +User Goal: {{ navigation_goal }} +User Details: {{ navigation_payload_str }} +Elements: {{ elements }} +{% if select_history %}Selection History: {{ select_history }}{% endif %} + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. Use exact format specified below + +Required Response Format: +{ + "reasoning": "", // brief reason for selection + "confidence_float": 0.0, // 0.0 to 1.0 + "id": "", // element ID from list + "action_type": "", // "CLICK" or "INPUT_TEXT" + "value": ""{% if target_value %}, + "relevant": false // true if matches target{% endif %} +} + +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing or extra fields +- Invalid value types +- Invalid action_type values +- Empty required fields +- Placeholder selections -Required JSON fields: -- reasoning: Why this option was chosen -- confidence_float: 0.0 to 1.0 -- id: Element ID from the list -- action_type: "CLICK" or "INPUT_TEXT" -- value: Selected value -{% if target_value %}- relevant: Whether selection matches target{% endif %} - -Context: -``` -Select an option for "{{ field_information }}". 
It's {{ "a required" if required_field else "an optional" }} field. -``` -{% if target_value %} -Target value: -``` -{{ target_value }} -``` -{% endif %} -User goal: -``` -{{ navigation_goal }} -``` - -User details: -``` -{{ navigation_payload_str }} -``` - -HTML elements: -``` -{{ elements }} -``` -{% if select_history %} -Select History: -``` -{{ select_history }} -``` -{% endif %} \ No newline at end of file +This response will be used for automated form interaction. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/extract-action.j2 b/skyvern/forge/prompts/ollama/extract-action.j2 index 1c80561509..4ae9b22603 100644 --- a/skyvern/forge/prompts/ollama/extract-action.j2 +++ b/skyvern/forge/prompts/ollama/extract-action.j2 @@ -1,32 +1,91 @@ -Determine browser actions to achieve a user goal. +You are a JSON API endpoint for automated web navigation. API endpoints ONLY return data - no explanations allowed. -IMPORTANT: -1. Output ONLY a JSON object -2. Use only existing elements -3. Prioritize red warnings and popups -4. Return COMPLETE when goal achieved -5. Return TERMINATE only if goal impossible +Purpose: +- Analyze webpage state +- Plan goal-oriented actions +- Consider element context +- Handle required fields +- Process error conditions +- Return structured action plan -Required JSON Structure: +Analysis Requirements: +1. Page Elements: + - Use only existing elements + - Check for red error text + - Prioritize popup actions + - Verify required fields + - Consider SVG meanings + +2. 
Action Planning: + - Match user goal state + - Use provided details + - Avoid duplicate actions + - Handle errors appropriately + - Consider action history + +Input Data: +Current URL: {{ current_url }} +Elements: {{ elements }} +User Goal: {{ navigation_goal }} +{% if data_extraction_goal %}Data Extraction: {{ data_extraction_goal }}{% endif %} +User Details: {{ navigation_payload_str }} +{% if action_history %}Action History: {{ action_history }}{% endif %} +UTC DateTime: {{ utc_datetime }} +{% if error_code_mapping_str %}Error Codes: {{ error_code_mapping_str }}{% endif %} + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only + +Required Response Format: { - "user_goal_stage": string description of progress, - "user_goal_achieved": boolean, - "action_plan": string description or "COMPLETE"/"TERMINATE", - "actions": array of action objects + "user_goal_stage": "", // current progress description + "user_goal_achieved": false, // true if complete + "action_plan": "", // summary of planned actions + "actions": [{ + "reasoning": "", // why this action + "user_detail_query": null, // question for needed details + "user_detail_answer": null, // answer from provided info + "confidence_float": 0.0, // 0.0 to 1.0 + "action_type": "", // CLICK/INPUT_TEXT/etc. + "id": "", // element ID from list + "text": null, // for INPUT_TEXT + "file_url": null, // for UPLOAD_FILE + "download": false, // for CLICK downloads + "option": null{% if error_code_mapping_str %}, + "errors": [{ + "error_code": "", + "reasoning": "", + "confidence_float": 0.0 + }]{% endif %} + }]{% if verification_code_check %}, + "verification_code_reasoning": "", + "need_verification_code": false{% endif %} } -// ...existing code for action object structure... - -Current State: -1. URL: {{ current_url }} -2. 
Goal: {{ navigation_goal }} -{% if data_extraction_goal %} -3. Data Goal: {{ data_extraction_goal }} -{% endif %} -4. User Details: {{ navigation_payload_str }} -5. Time (UTC): {{ utc_datetime }} - -{% if action_history %} -Previous Actions: -{{ action_history }} -{% endif %} +Valid Action Types: +- CLICK: Click element +- INPUT_TEXT: Enter text +- UPLOAD_FILE: Upload file +- SELECT_OPTION: Choose option +- WAIT: Wait for changes +- SOLVE_CAPTCHA: Handle captcha +- COMPLETE: Goal achieved +- TERMINATE: Stop process + +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing or extra fields +- Invalid value types +- Invalid action types +- Nonexistent element IDs + +This response will be used for automated web interaction. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/extract-information.j2 b/skyvern/forge/prompts/ollama/extract-information.j2 index dee3e29142..88cc2f7232 100644 --- a/skyvern/forge/prompts/ollama/extract-information.j2 +++ b/skyvern/forge/prompts/ollama/extract-information.j2 @@ -1,26 +1,56 @@ -Extract information from a webpage screenshot. +You are a JSON API endpoint for webpage data extraction. API endpoints ONLY return data - no explanations allowed. -IMPORTANT: -1. Return ONLY a JSON object -2. Follow the exact schema format -3. Use null for missing information -4. No additional text or explanations +Purpose: +- Extract specified data from webpage +- Match exact schema requirements +- Return null for unavailable data +- Maintain strict JSON format -{% if extracted_information_schema %} -Required Schema: -{{ extracted_information_schema }} -{% endif %} +Extraction Requirements: +1. 
Data Sources: + - Screenshot content + - Page elements + - URL information + - Extracted text + - User payload data -Data Extraction Goal: {{ data_extraction_goal }} +2. Output Rules: + - Follow schema exactly + - Use null for missing data + - Include all required fields + - Exclude extra fields{% if error_code_mapping_str %} + - Use only defined error codes{% endif %} -Available Information: -1. URL: {{ current_url }} -2. DOM Elements: {{ elements }} -3. Page Text: {{ extracted_text }} -4. User Data: {{ navigation_payload }} -5. Current UTC Time: {{ utc_datetime }} +Input Data: +Current URL: {{ current_url }} +Elements: {{ elements }} +Extraction Goal: {{ data_extraction_goal }} +Text Content: {{ extracted_text }} +Navigation Payload: {{ navigation_payload }} +UTC DateTime: {{ utc_datetime }} +{% if error_code_mapping_str %}Error Codes: {{ error_code_mapping_str }}{% endif %} -{% if error_code_mapping_str %} -Error Codes: -{{ error_code_mapping_str }} -{% endif %} \ No newline at end of file +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. Follow schema exactly: +{% if extracted_information_schema %}{{ extracted_information_schema }}{% else %}{ + // Schema not provided - use minimal structure +}{% endif %} + +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Fields not in schema +- Missing required fields +- Invalid value types{% if error_code_mapping_str %} +- Undefined error codes{% endif %} + +This response will be used for automated data processing. Invalid format will cause system errors. 
\ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/generate-task.j2 b/skyvern/forge/prompts/ollama/generate-task.j2 index 1cfd4d0afc..336d1a0f93 100644 --- a/skyvern/forge/prompts/ollama/generate-task.j2 +++ b/skyvern/forge/prompts/ollama/generate-task.j2 @@ -1,24 +1,65 @@ -Your task is to create a browser automation task definition. - -IMPORTANT: -1. Output ONLY a JSON object -2. No explanations or additional text -3. All URLs must use HTTPS -4. At least one goal (navigation or data extraction) must be provided - -Required Fields: -1. url (string): Starting webpage URL -2. suggested_title (string): Brief task description -3. navigation_goal_reasoning (string): Why navigation is needed -4. is_navigation_goal_required (boolean): Whether navigation is needed -5. data_extraction_goal_reasoning (string): Why data extraction is needed -6. is_data_extraction_goal_required (boolean): Whether data extraction is needed - -Optional Fields (use null if not needed): -1. navigation_goal (string): Include "COMPLETE when..." criteria -2. data_extraction_goal (string): Data to extract -3. navigation_payload (json): Form values or parameters - -Create a task for this prompt: -``` -{{ user_prompt }} +You are a JSON API endpoint for browser task creation. API endpoints ONLY return data - no explanations allowed. + +Purpose: +- Parse user task requirements +- Generate automation schema +- Define task goals and payloads +- Return strict JSON format + +Task Requirements: +1. Required Fields: + - url: HTTPS starting point + - suggested_title: Brief task description + - navigation_goal_reasoning: Why navigation needed + - is_navigation_goal_required: Navigation requirement flag + - data_extraction_goal_reasoning: Why extraction needed + - is_data_extraction_goal_required: Extraction requirement flag + +2. Optional Fields: + - navigation_goal: Action steps (if required) + - data_extraction_goal: Data targets (if required) + - navigation_payload: Required input data + +3. 
Validation Rules: + - At least one goal required + - Navigation goal needs completion criteria + - URLs must use HTTPS + - Use null for unused fields + +Input Data: +User Prompt: {{ user_prompt }} + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only + +Required Response Format: +{ + "url": "", // required, HTTPS only + "suggested_title": "", // required, brief description + "navigation_goal_reasoning": "", // required, why navigation needed + "is_navigation_goal_required": false, // required boolean + "navigation_goal": null, // optional, include COMPLETE criteria + "data_extraction_goal_reasoning": "", // required, why extraction needed + "is_data_extraction_goal_required": false, // required boolean + "data_extraction_goal": null, // optional, data requirements + "navigation_payload": null // optional, input data +} + +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing required fields +- Invalid value types +- Non-HTTPS URLs +- No goals defined +- Missing completion criteria + +This response will be used for automated task creation. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/opened-dropdown-confirm.j2 b/skyvern/forge/prompts/ollama/opened-dropdown-confirm.j2 index 9331f5e4c2..b4a21862f9 100644 --- a/skyvern/forge/prompts/ollama/opened-dropdown-confirm.j2 +++ b/skyvern/forge/prompts/ollama/opened-dropdown-confirm.j2 @@ -1,12 +1,48 @@ -Check if a screenshot shows an open dropdown menu. +You are a JSON API endpoint for dropdown menu detection. API endpoints ONLY return data - no explanations allowed. -IMPORTANT: Return ONLY a JSON object. No other text. 
+Purpose: +- Analyze screenshot for dropdown menu +- Check for visible options +- Exclude placeholder messages +- Return analysis in strict JSON format -A dropdown menu is considered open if: -1. At least one valid option is visible -2. "No results" or "No match" messages don't count -3. Placeholders like "Please select" don't count +Detection Rules: +1. Valid Dropdown: + - Has visible options + - Options are selectable + - Menu is expanded -Required JSON fields: -- reasoning: Why you determined it is/isn't a dropdown -- is_opened_dropdown_menu: true/false \ No newline at end of file +2. Invalid Cases: + - "No results" messages + - "No match" indicators + - Placeholder text only + - "Please select" options + - Single dash (-) options + - "Select..." text + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. Use exact format specified below + +Required Response Format: +{ + "reasoning": "", // brief detection explanation + "is_opened_dropdown_menu": false // true if valid dropdown detected +} + +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing or extra fields +- Invalid value types +- Invalid boolean format + +This response will be used for automated menu detection. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/parse-input-or-select-context.j2 b/skyvern/forge/prompts/ollama/parse-input-or-select-context.j2 index cb3f381d72..6a966b6072 100644 --- a/skyvern/forge/prompts/ollama/parse-input-or-select-context.j2 +++ b/skyvern/forge/prompts/ollama/parse-input-or-select-context.j2 @@ -1,19 +1,54 @@ -You are analyzing an INPUT or SELECT element with ID "{{ element_id }}". 
+You are a JSON API endpoint for web field analysis. API endpoints ONLY return data - no explanations allowed. -IMPORTANT: -1. Respond ONLY with a JSON object -2. No additional text or explanations -3. No markdown formatting +Purpose: +- Analyze INPUT/SELECT element +- Verify field properties +- Cross-check with context +- Return structured analysis -Required JSON fields: -- thought: How you verified the information -- field: Which field this action will fill -- is_required: true/false for required field -- is_search_bar: true/false for search functionality +Analysis Requirements: +1. Field Checking: + - Element type verification + - Required status check + - Search functionality detection + - Field purpose identification -Context Information: -{{ action_reasoning }} +2. Verification Process: + - Compare with context + - Check element attributes + - Validate field purpose + - Confirm requirements -Available Elements: -{{ elements }} +Input Data: +Element ID: {{ element_id }} +Action Reasoning: {{ action_reasoning }} +Elements: {{ elements }} +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. Use exact format specified below + +Required Response Format: +{ + "thought": "", // verification process description + "field": "", // field purpose/name + "is_required": false, // required field status + "is_search_bar": false // search functionality status +} + +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing or extra fields +- Invalid value types +- Invalid boolean format + +This response will be used for automated field interaction. Invalid format will cause system errors. 
\ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/summarize-max-steps-reason.j2 b/skyvern/forge/prompts/ollama/summarize-max-steps-reason.j2 index f108f3c48b..e93656d1aa 100644 --- a/skyvern/forge/prompts/ollama/summarize-max-steps-reason.j2 +++ b/skyvern/forge/prompts/ollama/summarize-max-steps-reason.j2 @@ -1,17 +1,55 @@ -Your task is to explain why a user goal was not achieved within {{ step_count }} steps. +You are a JSON API endpoint for task failure analysis. API endpoints ONLY return data - no explanations allowed. -IMPORTANT: Respond ONLY with a JSON object. No other text. +Purpose: +- Analyze failed task completion +- Review step history +- Evaluate page state +- Return structured analysis -Required JSON fields: -- page_info: Describe all page information related to the user goal -- reasoning: Short explanation of why the goal wasn't achieved +Analysis Requirements: +1. Page Information: + - Current page state + - Relevant UI elements + - Error messages + - Progress indicators -Here are the details to consider: - -Goal: {{ navigation_goal }} +2. Step Analysis: + - Review {{ step_count }} steps taken + - Identify failure points + - Consider user goals + - Evaluate action results +Input Data: +User Goal: {{ navigation_goal }} User Details: {{ navigation_payload }} - Steps Taken: -{% for step in steps %}Step {{ step.order }} -- {{ step.actions_result }} -{% endfor %} \ No newline at end of file +{% for step in steps %}Step {{ step.order }}: {{ step.actions_result }} +{% endfor %} + +CRITICAL FORMATTING RULES: +1. Start response with { and end with } +2. NO text before or after JSON +3. NO markdown formatting or code blocks +4. NO explanations, notes, or comments +5. Response must be pure JSON only +6. 
Use exact format specified below + +Required Response Format: +{ + "page_info": "", // current page state analysis + "reasoning": "" // failure cause analysis +} + +AUTOMATIC FAILURE TRIGGERS: +- Text before the opening { +- Text after the closing } +- Explanations or markdown +- Notes or comments +- Code blocks or ``` +- Any content outside JSON structure +- Missing or extra fields +- Invalid value types +- Generic explanations +- Missing step references + +This response will be used for automated failure analysis. Invalid format will cause system errors. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/svg-convert.j2 b/skyvern/forge/prompts/ollama/svg-convert.j2 index 3baa40a80e..ae2b1073ea 100644 --- a/skyvern/forge/prompts/ollama/svg-convert.j2 +++ b/skyvern/forge/prompts/ollama/svg-convert.j2 @@ -1,16 +1,11 @@ -You will analyze an SVG element and return information about its shape. +You are a JSON API endpoint that identifies icons and shapes. Return ONLY JSON with no other text. -IMPORTANT: You must respond ONLY with a JSON object. Do not add any other text. -Do not add explanations. Do not use markdown. +SVG Element: {{svg_element}} -Input SVG Element: -``` -{{svg_element}} -``` - -MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. -Reply in JSON format with the following keys: +Required format - no other text allowed: { - "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence - "shape": string, // A short description of the shape of SVG and its meaning -} \ No newline at end of file + "confidence_float": 0.0, // number between 0.0 and 1.0 + "shape": "" // what the shape shows +} + +Invalid formats will cause errors. 
\ No newline at end of file diff --git a/skyvern/forge/sdk/api/llm/utils.py b/skyvern/forge/sdk/api/llm/utils.py index e05582afb0..25977f2a60 100644 --- a/skyvern/forge/sdk/api/llm/utils.py +++ b/skyvern/forge/sdk/api/llm/utils.py @@ -22,7 +22,7 @@ async def llm_messages_builder( system_message = { "role": "system", "content": ( - "CRITICAL INSTRUCTION: You are a JSON-only assistant. DO NOT provide explanations or help.\n" + "CRITICAL INSTRUCTION: You are a JSON-only assistant. Do not respond in anything other than valid JSON and always follow the script you are given preisely.\n" ) } From 4219ecff5f87e639d7156cf955f12b5c24c3825f Mon Sep 17 00:00:00 2001 From: Cole Stasney Date: Sat, 9 Nov 2024 18:52:18 -0700 Subject: [PATCH 6/9] Further testing of custom prompts for llama. --- .../forge/prompts/ollama/extract-action.j2 | 158 ++++++++---------- .../prompts/ollama/extract-information.j2 | 76 ++++----- skyvern/forge/prompts/ollama/svg-convert.j2 | 17 +- skyvern/forge/sdk/api/llm/utils.py | 4 +- 4 files changed, 113 insertions(+), 142 deletions(-) diff --git a/skyvern/forge/prompts/ollama/extract-action.j2 b/skyvern/forge/prompts/ollama/extract-action.j2 index 4ae9b22603..156b14a9b9 100644 --- a/skyvern/forge/prompts/ollama/extract-action.j2 +++ b/skyvern/forge/prompts/ollama/extract-action.j2 @@ -1,91 +1,79 @@ -You are a JSON API endpoint for automated web navigation. API endpoints ONLY return data - no explanations allowed. +Identify actions to help user progress towards the user goal using the DOM elements given in the list and the screenshot of the website. +Include only the elements that are relevant to the user goal, without altering or imagining new elements. +Accurately interpret and understand the functional significance of SVG elements based on their shapes and context within the webpage. +Use the user details to fill in necessary values. Always satisfy required fields if the field isn't already filled in. 
Don't return any action for the same field, if this field is already filled in and the value is the same as the one you would have filled in. +MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. +Each interactable element is tagged with an ID. +If you see any information in red in the page screenshot, this means a condition wasn't satisfied. Prioritize actions with the red information. +If you see a popup in the page screenshot, prioritize actions on the popup. -Purpose: -- Analyze webpage state -- Plan goal-oriented actions -- Consider element context -- Handle required fields -- Process error conditions -- Return structured action plan - -Analysis Requirements: -1. Page Elements: - - Use only existing elements - - Check for red error text - - Prioritize popup actions - - Verify required fields - - Consider SVG meanings - -2. Action Planning: - - Match user goal state - - Use provided details - - Avoid duplicate actions - - Handle errors appropriately - - Consider action history - -Input Data: -Current URL: {{ current_url }} -Elements: {{ elements }} -User Goal: {{ navigation_goal }} -{% if data_extraction_goal %}Data Extraction: {{ data_extraction_goal }}{% endif %} -User Details: {{ navigation_payload_str }} -{% if action_history %}Action History: {{ action_history }}{% endif %} -UTC DateTime: {{ utc_datetime }} -{% if error_code_mapping_str %}Error Codes: {{ error_code_mapping_str }}{% endif %} - -CRITICAL FORMATTING RULES: -1. Start response with { and end with } -2. NO text before or after JSON -3. NO markdown formatting or code blocks -4. NO explanations, notes, or comments -5.
Response must be pure JSON only - -Required Response Format: +Reply in JSON format with the following keys: { - "user_goal_stage": "", // current progress description - "user_goal_achieved": false, // true if complete - "action_plan": "", // summary of planned actions - "actions": [{ - "reasoning": "", // why this action - "user_detail_query": null, // question for needed details - "user_detail_answer": null, // answer from provided info - "confidence_float": 0.0, // 0.0 to 1.0 - "action_type": "", // CLICK/INPUT_TEXT/etc. - "id": "", // element ID from list - "text": null, // for INPUT_TEXT - "file_url": null, // for UPLOAD_FILE - "download": false, // for CLICK downloads - "option": null{% if error_code_mapping_str %}, - "errors": [{ - "error_code": "", - "reasoning": "", - "confidence_float": 0.0 - }]{% endif %} - }]{% if verification_code_check %}, - "verification_code_reasoning": "", - "need_verification_code": false{% endif %} + "user_goal_stage": str, // A string to describe the reasoning whether user goal has been achieved or not. + "user_goal_achieved": bool, // True if the user goal has been completed, otherwise False. + "action_plan": str, // A string that describes the plan of actions you're going to take. Be specific and to the point. Use this as a quick summary of the actions you're going to take, and what order you're going to take them in, and how that moves you towards your overall goal. Output "COMPLETE" action in the "actions" if user_goal_achieved is True. Output "TERMINATE" action in the "actions" if your plan is to terminate the process. + "actions": array // An array of actions. Here's the format of each action: + [{ + "reasoning": str, // The reasoning behind the action. This reasoning must be user information agnostic. Mention why you chose the action type, and why you chose the element id. Keep the reasoning short and to the point. + "user_detail_query": str, // Think of this value as a Jeopardy question. 
Ask the user for the details you need for executing this action. Ask the question even if the details are disclosed in user goal or user details. If it's a text field, ask for the text. If it's a file upload, ask for the file. If it's a dropdown, ask for the relevant information. If you are clicking on something specific, ask about what to click on. If you're downloading a file and you have multiple options, ask the user which one to download. Otherwise, use null. Examples are: "What product ID should I input into the search bar?", "What file should I upload?", "What is the previous insurance provider of the user?", "Which invoice should I download?", "Does the user have any pets?". If the action doesn't require any user details, use null. + "user_detail_answer": str, // The answer to the `user_detail_query`. The source of this answer can be user goal or user details. + "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence + "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved.
"TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned. + "id": str, // The id of the element to take action on. The id has to be one from the elements list + "text": str, // Text for INPUT_TEXT action only + "file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise. + "download": bool, // Can only be true for CLICK actions. If true, the browser will trigger a download by clicking the element. If false, the browser will click the element without triggering a download. + "option": { // The option to select for SELECT_OPTION action only. null if not SELECT_OPTION action + "label": str, // the label of the option if any. MAKE SURE YOU USE THIS LABEL TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION LABEL HERE + "index": int, // the index corresponding to the option index under the select element. + "value": str // the value of the option. MAKE SURE YOU USE THIS VALUE TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION VALUE HERE + }, +{% if error_code_mapping_str %} + "errors": array // A list of errors. This is used to surface any errors that match the current situation for COMPLETE and TERMINATE actions. For other actions or if no error description suits the current situation on the screenshots, return an empty list. You are allowed to return multiple errors if there are multiple errors on the page. + [{ + "error_code": str, // The error code from the user's error code list + "reasoning": str, // The reasoning behind the error.
Be specific, referencing any user information and their fields in your reasoning. Keep the reasoning short and to the point. + "confidence_float": float // The confidence of the error. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence + }] +{% endif %} + }],{% if verification_code_check %} + "verification_code_reasoning": str, // Let's think step by step. Describe what you see and think if a verification code is needed for login or any verification step. Explain why you believe a verification code is needed or not. Has the code been sent and is code available somewhere (email, phone or 2FA device)? + "need_verification_code": bool, // Whether a verification code is needed to proceed. True only if the code is available to user. If the code is not sent, return false {% endif %} } +{% if action_history %} +Consider the action history from the last step and the screenshot together, if actions from the last step don't yield positive impact, try other actions or other action combinations. +{% endif %} +Clickable elements from `{{ current_url }}`: +``` +{{ elements }} +``` -Valid Action Types: -- CLICK: Click element -- INPUT_TEXT: Enter text -- UPLOAD_FILE: Upload file -- SELECT_OPTION: Choose option -- WAIT: Wait for changes -- SOLVE_CAPTCHA: Handle captcha -- COMPLETE: Goal achieved -- TERMINATE: Stop process +The URL of the page you're on right now is `{{ current_url }}`. -AUTOMATIC FAILURE TRIGGERS: -- Text before the opening { -- Text after the closing } -- Explanations or markdown -- Notes or comments -- Code blocks or ``` -- Any content outside JSON structure -- Missing or extra fields -- Invalid value types -- Invalid action types -- Nonexistent element IDs +User goal: +``` +{{ navigation_goal }} +``` +{% if error_code_mapping_str %} +Use the error codes and their descriptions to surface user-defined errors. Do not return any error that's not defined by the user. 
User defined errors: +{{ error_code_mapping_str }} +{% endif %} +{% if data_extraction_goal %} +User Data Extraction Goal: +``` +{{ data_extraction_goal }} +``` +{% endif %} -This response will be used for automated web interaction. Invalid format will cause system errors. \ No newline at end of file +User details: +``` +{{ navigation_payload_str }} +``` +{% if action_history %} +Action results from previous steps: (note: even if the action history suggests goal is achieved, check the screenshot and the DOM elements to make sure the goal is achieved) +{{ action_history }} +{% endif %} +Current datetime in UTC, YYYY-MM-DD HH:MM format: +``` +{{ utc_datetime }} +``` diff --git a/skyvern/forge/prompts/ollama/extract-information.j2 b/skyvern/forge/prompts/ollama/extract-information.j2 index 88cc2f7232..46de997b5e 100644 --- a/skyvern/forge/prompts/ollama/extract-information.j2 +++ b/skyvern/forge/prompts/ollama/extract-information.j2 @@ -1,56 +1,38 @@ -You are a JSON API endpoint for webpage data extraction. API endpoints ONLY return data - no explanations allowed. +You are given a screenshot, user data extraction goal, the JSON schema for the output data format, and the current URL. -Purpose: -- Extract specified data from webpage -- Match exact schema requirements -- Return null for unavailable data -- Maintain strict JSON format +Your task is to: -Extraction Requirements: -1. Data Sources: - - Screenshot content - - Page elements - - URL information - - Extracted text - - User payload data +Extract the requested information from the screenshot and output it in the specified JSON schema format: -2. 
Output Rules: - - Follow schema exactly - - Use null for missing data - - Include all required fields - - Exclude extra fields{% if error_code_mapping_str %} - - Use only defined error codes{% endif %} +DO NOT USE: +❌ Markdown formatting +❌ Code blocks +❌ Explanations +❌ HTML analysis +❌ Notes or comments -Input Data: -Current URL: {{ current_url }} -Elements: {{ elements }} -Extraction Goal: {{ data_extraction_goal }} -Text Content: {{ extracted_text }} -Navigation Payload: {{ navigation_payload }} -UTC DateTime: {{ utc_datetime }} -{% if error_code_mapping_str %}Error Codes: {{ error_code_mapping_str }}{% endif %} - -CRITICAL FORMATTING RULES: -1. Start response with { and end with } -2. NO text before or after JSON -3. NO markdown formatting or code blocks -4. NO explanations, notes, or comments -5. Response must be pure JSON only -6. Follow schema exactly: +REQUIRED FORMAT: {% if extracted_information_schema %}{{ extracted_information_schema }}{% else %}{ // Schema not provided - use minimal structure + "extracted_data": null }{% endif %} -AUTOMATIC FAILURE TRIGGERS: -- Text before the opening { -- Text after the closing } -- Explanations or markdown -- Notes or comments -- Code blocks or ``` -- Any content outside JSON structure -- Fields not in schema -- Missing required fields -- Invalid value types{% if error_code_mapping_str %} -- Undefined error codes{% endif %} +SYSTEM RULES: +1. Start with { +2. End with } +3. Only pure JSON allowed +4. No explanations or analysis +5. Use null for missing data +6. Follow schema exactly +7. No markdown or formatting + +Input Data: +GOAL={{ data_extraction_goal }} +URL={{ current_url }} +ELEMENTS={{ elements }} +TEXT={{ extracted_text }} +DETAILS={{ navigation_payload }} +TIME={{ utc_datetime }} +{% if error_code_mapping_str %}ERRORS={{ error_code_mapping_str }}{% endif %} -This response will be used for automated data processing. Invalid format will cause system errors. 
\ No newline at end of file +SYSTEM WARNING: Response format violations will cause task termination. \ No newline at end of file diff --git a/skyvern/forge/prompts/ollama/svg-convert.j2 b/skyvern/forge/prompts/ollama/svg-convert.j2 index ae2b1073ea..3aa7964d0d 100644 --- a/skyvern/forge/prompts/ollama/svg-convert.j2 +++ b/skyvern/forge/prompts/ollama/svg-convert.j2 @@ -1,11 +1,14 @@ -You are a JSON API endpoint that identifies icons and shapes. Return ONLY JSON with no other text. +You are given a svg element. You need to figure out what its shape means. +SVG Element: +``` +{{svg_element}} +``` -SVG Element: {{svg_element}} - -Required format - no other text allowed: +MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. +Reply in JSON format with the following keys: { - "confidence_float": 0.0, // number between 0.0 and 1.0 - "shape": "" // what the shape shows + "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence + "shape": string, // A short description of the shape of SVG and its meaning } -Invalid formats will cause errors. \ No newline at end of file +Leaving shape empty will cause errors. Do not do that. Please provide a valid shape. diff --git a/skyvern/forge/sdk/api/llm/utils.py b/skyvern/forge/sdk/api/llm/utils.py index 25977f2a60..eaf3cb9ae6 100644 --- a/skyvern/forge/sdk/api/llm/utils.py +++ b/skyvern/forge/sdk/api/llm/utils.py @@ -21,9 +21,7 @@ async def llm_messages_builder( if is_llama: system_message = { "role": "system", - "content": ( - "CRITICAL INSTRUCTION: You are a JSON-only assistant. Do not respond in anything other than valid JSON and always follow the script you are given preisely.\n" - ) + "content": "You are a helpful assistant. You keep to the strict formatting rules. You are loved. You are appreciated. You are a good assistant." 
} # Build content array with images first From 4b3ee6bb87bea83c73e7b1610c2d7e4f7450c1e8 Mon Sep 17 00:00:00 2001 From: Cole Stasney Date: Sun, 10 Nov 2024 09:32:44 -0700 Subject: [PATCH 7/9] Switched LLM server. Fine tuned extract-actions for llama. --- docker-compose.yml | 4 +- skyvern/config.py | 2 +- .../forge/prompts/ollama/extract-action.j2 | 134 ++++++++---------- skyvern/forge/sdk/api/llm/llama_handler.py | 2 +- skyvern/forge/sdk/settings_manager.py | 2 +- 5 files changed, 64 insertions(+), 80 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index b6c22abaee..03c1a666b7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -40,7 +40,7 @@ services: - BROWSER_TYPE=chromium-headful - ENABLE_LLAMA=true - LLM_KEY=LLAMA3 - - LLAMA_API_BASE=http://host.docker.internal:11434 + - LLAMA_API_BASE=http://192.168.1.65:11434 - LLAMA_MODEL_NAME=llama3.2-vision - LLAMA_API_ROUTE=/api/chat - ENABLE_OPENAI=false @@ -48,7 +48,7 @@ services: - ENABLE_AZURE=false - ENABLE_BEDROCK=false - ENABLE_AZURE_GPT4O_MINI=false - - LLAMA_BASE_URL=http://host.docker.internal:11434 + - LLAMA_BASE_URL=http://192.168.1.65:11434 - LLAMA_MODEL=llama3.2-vision - ENV=local - SECONDARY_LLM_KEY=LLAMA3 diff --git a/skyvern/config.py b/skyvern/config.py index c94e82526b..54d2e0e77d 100644 --- a/skyvern/config.py +++ b/skyvern/config.py @@ -14,7 +14,7 @@ class Settings(BaseSettings): # Llama Configuration ENABLE_LLAMA: bool = True - LLAMA_API_BASE: str = "http://host.docker.internal:11434" + LLAMA_API_BASE: str = "http://192.168.1.65:11434" LLAMA_MODEL_NAME: str = "llama3.2-vision" LLAMA_API_ROUTE: str = "/api/chat" LLM_KEY: str = "LLAMA3" diff --git a/skyvern/forge/prompts/ollama/extract-action.j2 b/skyvern/forge/prompts/ollama/extract-action.j2 index 156b14a9b9..b688fe53cb 100644 --- a/skyvern/forge/prompts/ollama/extract-action.j2 +++ b/skyvern/forge/prompts/ollama/extract-action.j2 @@ -1,79 +1,63 @@ -Identify actions to help user progress towards the user goal using the DOM 
elements given in the list and the screenshot of the website. -Include only the elements that are relevant to the user goal, without altering or imagining new elements. -Accurately interpret and understand the functional significance of SVG elements based on their shapes and context within the webpage. -Use the user details to fill in necessary values. Always satisfy required fields if the field isn't already filled in. Don't return any action for the same field, if this field is already filled in and the value is the same as the one you would have filled in. -MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. -Each interactable element is tagged with an ID. -If you see any information in red in the page screenshot, this means a condition wasn't satisfied. prioritize actions with the red information. -If you see a popup in the page screenshot, prioritize actions on the popup. +You are executing an automated browser task. Here is your context: +1. Webpage URL: {{ current_url }} +2. Task Goal: {{ navigation_goal }} +3. Valid Element IDs: {{ elements }} +4. User Input Data: {{ navigation_payload_str }} -Reply in JSON format with the following keys: -{ - "user_goal_stage": str, // A string to describe the reasoning whether user goal has been achieved or not. - "user_goal_achieved": bool, // True if the user goal has been completed, otherwise False. - "action_plan": str, // A string that describes the plan of actions you're going to take. Be specific and to the point. Use this as a quick summary of the actions you're going to take, and what order you're going to take them in, and how that moves you towards your overall goal. Output "COMPLETE" action in the "actions" if user_goal_achieved is True. Output "TERMINATE" action in the "actions" if your plan is to terminate the process. - "actions": array // An array of actions. 
Here's the format of each action: - [{ - "reasoning": str, // The reasoning behind the action. This reasoning must be user information agnostic. Mention why you chose the action type, and why you chose the element id. Keep the reasoning short and to the point. - "user_detail_query": str, // Think of this value as a Jeopardy question. Ask the user for the details you need for executing this action. Ask the question even if the details are disclosed in user goal or user details. If it's a text field, ask for the text. If it's a file upload, ask for the file. If it's a dropdown, ask for the relevant information. If you are clicking on something specific, ask about what to click on. If you're downloading a file and you have multiple options, ask the user which one to download. Otherwise, use null. Examples are: "What product ID should I input into the search bar?", "What file should I upload?", "What is the previous insurance provider of the user?", "Which invoice should I download?", "Does the user have any pets?". If the action doesn't require any user details, use null. - "user_detail_answer": str, // The answer to the `user_detail_query`. The source of this answer can be user goal or user details. - "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence - "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. 
"SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned. - "id": str, // The id of the element to take action on. The id has to be one from the elements list - "text": str, // Text for INPUT_TEXT action only - "file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise. - "download": bool, // Can only be true for CLICK actions. If true, the browser will trigger a download by clicking the element. If false, the browser will click the element without triggering a download. - "option": { // The option to select for SELECT_OPTION action only. null if not SELECT_OPTION action - "label": str, // the label of the option if any. MAKE SURE YOU USE THIS LABEL TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION LABEL HERE - "index": int, // the index corresponding to the option index under the select element. - "value": str // the value of the option. MAKE SURE YOU USE THIS VALUE TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION VALUE HERE - }, -{% if error_code_mapping_str %} - "errors": array // A list of errors. This is used to surface any errors that matches the current situation for COMPLETE and TERMINATE actions. For other actions or if no error description suits the current situation on the screenshots, return an empty list. 
You are allowed to return multiple errors if there are multiple errors on the page. - [{ - "error_code": str, // The error code from the user's error code list - "reasoning": str, // The reasoning behind the error. Be specific, referencing any user information and their fields in your reasoning. Keep the reasoning short and to the point. - "confidence_float": float // The confidence of the error. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence - }] -{% endif %} - }],{% if verification_code_check %} - "verification_code_reasoning": str, // Let's think step by step. Describe what you see and think if a verification code is needed for login or any verification step. Explain why you believe a verification code is needed or not. Has the code been sent and is code available somewhere (email, phone or 2FA device)? - "need_verification_code": bool, // Whether a verification code is needed to proceed. True only if the code is available to user. If the code is not sent, return false {% endif %} -} -{% if action_history %} -Consider the action history from the last step and the screenshot together, if actions from the last step don't yield positive impact, try other actions or other action combinations. -{% endif %} -Clickable elements from `{{ current_url }}`: -``` -{{ elements }} -``` +You must complete this task using the provided elements and context. Prioritize actions based on their relevance to the task goal, focusing on elements marked in red or any visible popups in screenshots. Do not deviate from or invent elements. Use SVG shapes contextually. + +Generate a structured JSON response that adheres to the provided format, with the following requirements: +- No text outside the JSON +- Valid, formatted JSON only +- Strict adherence to field values and specified constraints -The URL of the page you're on right now is `{{ current_url }}`. 
+ALLOWED ACTION TYPES for the variable action_type: +- "CLICK" - For buttons, links, and navigation +- "INPUT_TEXT" - For typing in text fields +- "UPLOAD_FILE" - For file upload fields +- "SELECT_OPTION" - For dropdown menus +- "CHECKBOX" - For checkboxes (prefer CLICK instead) +- "WAIT" - When waiting is needed +- "SOLVE_CAPTCHA" - For captcha solving +- "TERMINATE" - When goal cannot be achieved +- "COMPLETE" - When goal is achieved +- "NULL_ACTION" - When no action needed -User goal: -``` -{{ navigation_goal }} -``` -{% if error_code_mapping_str %} -Use the error codes and their descriptions to surface user-defined errors. Do not return any error that's not defined by the user. User defined errors: -{{ error_code_mapping_str }} -{% endif %} -{% if data_extraction_goal %} -User Data Extraction Goal: -``` -{{ data_extraction_goal }} -``` -{% endif %} +Output format (do not add or change fields): + +{ + "user_goal_stage": str, + "user_goal_achieved": bool, + "action_plan": str, + "actions": [ + { + "reasoning": str, + "user_detail_query": str, + "user_detail_answer": str, + "confidence_float": float, + "action_type": str, + "id": str, + "text": str, + "file_url": str, + "download": bool, + "option": { + "label": str, + "index": int, + "value": str + } + } + ]{% if error_code_mapping_str %}, + "errors": [ + { + "error_code": str, + "reasoning": str, + "confidence_float": float + } + ]{% endif %}{% if verification_code_check %}, + "verification_code_reasoning": str, + "need_verification_code": bool{% endif %} +} -User details: -``` -{{ navigation_payload_str }} -``` -{% if action_history %} -Action results from previous steps: (note: even if the action history suggests goal is achieved, check the screenshot and the DOM elements to make sure the goal is achieved) -{{ action_history }} -{% endif %} -Current datetime in UTC, YYYY-MM-DD HH:MM format: -``` -{{ utc_datetime }} -``` +- Focus only on relevant elements and task completion. 
+- No questions, clarifications, or explanations. +- Provide a valid JSON response matching the task requirements immediately. \ No newline at end of file diff --git a/skyvern/forge/sdk/api/llm/llama_handler.py b/skyvern/forge/sdk/api/llm/llama_handler.py index 0d703de53e..1b6190d193 100644 --- a/skyvern/forge/sdk/api/llm/llama_handler.py +++ b/skyvern/forge/sdk/api/llm/llama_handler.py @@ -22,7 +22,7 @@ async def llama_handler( # Convert screenshots to base64 for vision tasks payload["images"] = [base64.b64encode(img).decode('utf-8') for img in screenshots] - async with session.post("http://localhost:11434/api/chat", json=payload) as response: + async with session.post("http://192.168.1.65:11434/api/chat", json=payload) as response: result = await response.json() return { "choices": [{ diff --git a/skyvern/forge/sdk/settings_manager.py b/skyvern/forge/sdk/settings_manager.py index 78a11f25e9..e9bb5d7439 100644 --- a/skyvern/forge/sdk/settings_manager.py +++ b/skyvern/forge/sdk/settings_manager.py @@ -10,7 +10,7 @@ class Settings(BaseSettings): # Llama Configuration ENABLE_LLAMA: bool = Field(default=False, env="ENABLE_LLAMA") - LLAMA_API_BASE: str = Field(default="http://localhost:11434", env="LLAMA_API_BASE") + LLAMA_API_BASE: str = Field(default="http://192.168.1.65:11434", env="LLAMA_API_BASE") LLAMA_MODEL_NAME: str = Field(default="llama3.2-vision", env="LLAMA_MODEL_NAME") LLAMA_API_ROUTE: str = Field(default="/api/chat", env="LLAMA_API_ROUTE") From d14dd2074347845a8d37a188f850801571021a70 Mon Sep 17 00:00:00 2001 From: Cole Stasney Date: Mon, 11 Nov 2024 06:39:20 -0700 Subject: [PATCH 8/9] Llama Extract-action outputting json response. 
--- .../forge/prompts/ollama/extract-action.j2 | 100 ++++++++---------- 1 file changed, 47 insertions(+), 53 deletions(-) diff --git a/skyvern/forge/prompts/ollama/extract-action.j2 b/skyvern/forge/prompts/ollama/extract-action.j2 index b688fe53cb..dc02c28c6a 100644 --- a/skyvern/forge/prompts/ollama/extract-action.j2 +++ b/skyvern/forge/prompts/ollama/extract-action.j2 @@ -1,63 +1,57 @@ -You are executing an automated browser task. Here is your context: -1. Webpage URL: {{ current_url }} -2. Task Goal: {{ navigation_goal }} -3. Valid Element IDs: {{ elements }} -4. User Input Data: {{ navigation_payload_str }} - -You must complete this task using the provided elements and context. Prioritize actions based on their relevance to the task goal, focusing on elements marked in red or any visible popups in screenshots. Do not deviate from or invent elements. Use SVG shapes contextually. - -Generate a structured JSON response that adheres to the provided format, with the following requirements: -- No text outside the JSON -- Valid, formatted JSON only -- Strict adherence to field values and specified constraints - -ALLOWED ACTION TYPES for the variable action_type: -- "CLICK" - For buttons, links, and navigation -- "INPUT_TEXT" - For typing in text fields -- "UPLOAD_FILE" - For file upload fields -- "SELECT_OPTION" - For dropdown menus -- "CHECKBOX" - For checkboxes (prefer CLICK instead) -- "WAIT" - When waiting is needed -- "SOLVE_CAPTCHA" - For captcha solving -- "TERMINATE" - When goal cannot be achieved -- "COMPLETE" - When goal is achieved -- "NULL_ACTION" - When no action needed - -Output format (do not add or change fields): +EXECUTE IMMEDIATE TASK - DO NOT ANALYZE OR EXPLAIN + +You are currently running as an automated browser task. You have been provided: +1. A webpage to analyze (current URL: {{ current_url }}) +2. A specific goal to achieve: {{ navigation_goal }} +3. Valid element IDs to use: {{ elements }} +4. 
User details for forms: {{ navigation_payload_str }} + +This is an active automation task, not a request for help or clarification. +Execute this task now by providing a JSON response. +No questions or clarifications are needed - all required information is provided. + +Your task: Generate actions to help user progress towards the goal using the DOM elements in the list and any popup overlays present. +Include only the elements that exist in the provided list, without inventing new ones. +Use the user details to fill in necessary values in any forms. + +MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. + +Here's exactly what I need - this format, with these exact fields: { "user_goal_stage": str, "user_goal_achieved": bool, - "action_plan": str, - "actions": [ + "action_plan": str, + "actions": [ { - "reasoning": str, - "user_detail_query": str, + "reasoning": str, + "user_detail_query": str, "user_detail_answer": str, - "confidence_float": float, - "action_type": str, - "id": str, - "text": str, - "file_url": str, - "download": bool, - "option": { - "label": str, - "index": int, - "value": str + "confidence_float": float, + "action_type": str, + "id": str, + "text": str, + "file_url": str, + "download": bool, + "option": { + "label": str, + "index": int, + "value": str } } - ]{% if error_code_mapping_str %}, - "errors": [ - { - "error_code": str, - "reasoning": str, - "confidence_float": float - } - ]{% endif %}{% if verification_code_check %}, - "verification_code_reasoning": str, - "need_verification_code": bool{% endif %} + ] } -- Focus only on relevant elements and task completion. -- No questions, clarifications, or explanations. -- Provide a valid JSON response matching the task requirements immediately. \ No newline at end of file +FINAL REMINDER: This is an active task requiring immediate execution. 
+DO NOT: +- Ask questions +- Request clarification +- Offer alternatives +- Explain the format +DO: +- Return valid JSON now +- Include actions array +- Use provided element IDs +- Follow format exactly + +Now, give me your JSON response and remember, I do not want a question in return. Just the JSON response related to the forementioned requirements. \ No newline at end of file From ba837f34973c0360f58d66cc41bf3fd4484b631f Mon Sep 17 00:00:00 2001 From: Cole Stasney Date: Tue, 12 Nov 2024 16:15:05 -0700 Subject: [PATCH 9/9] Added utils function to compensate for LLama conversational responses. Closer to success with extract-action. --- .../forge/prompts/ollama/extract-action.j2 | 98 +++++++++---------- skyvern/forge/sdk/api/llm/utils.py | 71 +++++++++----- 2 files changed, 97 insertions(+), 72 deletions(-) diff --git a/skyvern/forge/prompts/ollama/extract-action.j2 b/skyvern/forge/prompts/ollama/extract-action.j2 index dc02c28c6a..ad1829091d 100644 --- a/skyvern/forge/prompts/ollama/extract-action.j2 +++ b/skyvern/forge/prompts/ollama/extract-action.j2 @@ -1,57 +1,55 @@ -EXECUTE IMMEDIATE TASK - DO NOT ANALYZE OR EXPLAIN - -You are currently running as an automated browser task. You have been provided: -1. A webpage to analyze (current URL: {{ current_url }}) -2. A specific goal to achieve: {{ navigation_goal }} -3. Valid element IDs to use: {{ elements }} -4. User details for forms: {{ navigation_payload_str }} - -This is an active automation task, not a request for help or clarification. -Execute this task now by providing a JSON response. -No questions or clarifications are needed - all required information is provided. - -Your task: Generate actions to help user progress towards the goal using the DOM elements in the list and any popup overlays present. -Include only the elements that exist in the provided list, without inventing new ones. -Use the user details to fill in necessary values in any forms. - -MAKE SURE YOU OUTPUT VALID JSON. 
No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. - -Here's exactly what I need - this format, with these exact fields: - +EXECUTE THIS BROWSER TASK NOW: + +URL: {{ current_url }} +GOAL: {{ navigation_goal }} +ELEMENTS: {{ elements }} +USER_INPUT: {{ navigation_payload_str }} + +RETURN JSON WITH THESE EXACT ACTIONS: +1. INPUT_TEXT - For typing in search/text fields +2. CLICK - For buttons and links +3. SELECT_OPTION - For dropdowns +4. CHECKBOX - For checkboxes +5. WAIT - When waiting needed +6. SOLVE_CAPTCHA - For captchas +7. TERMINATE - If goal impossible +8. COMPLETE - When goal achieved +9. UPLOAD_FILE - For file uploads +10. NULL_ACTION - When no action needed + +EXAMPLE RESPONSE: { - "user_goal_stage": str, - "user_goal_achieved": bool, - "action_plan": str, + "user_goal_stage": "Starting search", + "user_goal_achieved": false, + "action_plan": "Search for product", "actions": [ { - "reasoning": str, - "user_detail_query": str, - "user_detail_answer": str, - "confidence_float": float, - "action_type": str, - "id": str, - "text": str, - "file_url": str, - "download": bool, - "option": { - "label": str, - "index": int, - "value": str - } + "reasoning": "Enter search term", + "user_detail_query": "What to search?", + "user_detail_answer": "search term", + "confidence_float": 1.0, + "action_type": "INPUT_TEXT", + "id": "an html element id from 'ELEMENTS' provided above. Scan it and find the correct id for the action. An id in html looks like this "