-
Notifications
You must be signed in to change notification settings - Fork 1
/
tools.py
123 lines (95 loc) · 4.37 KB
/
tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import os
import time
from langchain_openai import ChatOpenAI
from langchain.tools import BaseTool
from langchain.pydantic_v1 import BaseModel, Field
from langchain.callbacks.manager import (
AsyncCallbackManagerForToolRun,
CallbackManagerForToolRun,
)
from typing import Optional
import json
from langchain.tools import SteamshipImageGenerationTool
from langchain.schema import HumanMessage, SystemMessage
import numpy as np
from PIL import Image
import base64
from typing import Optional, Type
from langchain.llms import Ollama
from langchain_google_vertexai import ChatVertexAI
from PIL import ImageGrab
from langchain_openai import ChatOpenAI
import cv2
from voice import VoiceService
# Module-level setup: every statement below runs once, when this module is imported.

# Voice service instance from the project's `voice` module. In the code visible
# here it is only referenced from commented-out `vs.piper(...)` calls.
vs = VoiceService()
# vs.piper('hello')

# Set the Google credentials path before the Vertex AI chat model is built below.
# This assignment unconditionally overwrites any value already present in the
# environment. The path is relative -- presumably resolved against the process's
# working directory (TODO confirm how the application is launched).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "google-credentials.json"

# Shared chat model used by analyze_image(); constructed with temperature=0.
llm = ChatVertexAI(model_name="gemini-1.5-pro-preview-0514", temperature=0)
def _as_image_url(image: str) -> str:
    """Return *image* in a form that can be used as an ``image_url`` value.

    Remote references (``http(s)://``, ``gs://``) and ``data:`` URIs are
    returned unchanged. An existing local file whose type is guessed as an
    image is read and inlined as a base64 ``data:`` URI. Anything else is
    passed through untouched so previous behaviour is preserved.
    """
    import mimetypes  # local import: not part of the file's top-level imports

    if image.startswith(("http://", "https://", "gs://", "data:")):
        return image
    if not os.path.isfile(image):
        return image

    mime_type, _ = mimetypes.guess_type(image)
    if not mime_type or not mime_type.startswith("image/"):
        # Unknown or non-image type: leave the decision to the model client.
        return image

    with open(image, "rb") as fh:
        encoded = base64.b64encode(fh.read()).decode("ascii")
    return f"data:{mime_type};base64,{encoded}"


def analyze_image(image: str, query: str):
    """Send one image plus a text prompt to the module-level ``llm``.

    Args:
        image: Image reference. May be a URL, a ``data:`` URI, or a path to a
            local image file. Local image files are inlined as a base64 data
            URI, because resolving a bare filesystem path is left to the
            model integration and is not something every version supports
            (NOTE(review): confirm against the installed
            langchain-google-vertexai version).
        query: Text instruction sent alongside the image.

    Returns:
        The ``content`` attribute of the model's response.
    """
    message = HumanMessage(
        content=[
            {"type": "text", "text": query},
            {"type": "image_url", "image_url": {"url": _as_image_url(image)}},
        ]
    )
    return llm.invoke([message]).content
class VisionInput(BaseModel):
    # Argument schema referenced by VisionTool.args_schema. Both fields are
    # required (Field(...)); the descriptions are instructions aimed at the
    # model that fills in the tool call, so they are runtime text.
    # (Kept as comments rather than a docstring so the generated schema is
    # not altered.)

    # Prompt that is forwarded together with the captured image.
    query: str = Field(
        ...,
        description="use this as the query: 'give a detailed description of this image, as detailed as possible.'",
    )

    # Message printed to the user before the capture starts.
    wait_message: str = Field(
        ...,
        description="address the user politely and ask the user to hold on while you access the webcam",
    )
class VisionTool(BaseTool):
    """Tool that grabs a single webcam frame and asks the vision model about it.

    The frame is written to ``images/snapshot.jpg`` and that path is handed to
    ``analyze_image`` together with the caller-supplied query.
    """

    # Explicit annotations keep the field overrides valid regardless of how
    # strictly the underlying pydantic version treats un-annotated overrides.
    name: str = "Vision Tool"
    description: str = "Useful tool to take a snapshot. Use this tool to access the user's webcam. USE this tool when asked to 'take a look' at something and no context was provided. Format input as: {{ \"query\": \"...\", \"wait_message\": \"...\" }}."
    args_schema: Type[BaseModel] = VisionInput
    # return_direct=True

    def _run(
        self,
        query: str,
        wait_message: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ):
        """Capture a webcam frame and return the model's answer to *query*.

        Args:
            query: Prompt forwarded to ``analyze_image``.
            wait_message: Text printed before the capture starts.
            run_manager: Optional callback manager; not used by this tool.

        Raises:
            IOError: If the camera cannot be opened, no frame can be read, or
                the frame cannot be written to disk.
        """
        print(wait_message)
        # vs.piper(str(wait_message))
        image_path = self.capture()
        return analyze_image(image_path, query)

    async def _arun(
        self,
        query: str,
        wait_message: str,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ):
        """Async entry point: run the blocking ``_run`` in a worker thread.

        ``_run`` is a plain function returning a value, so it cannot be
        awaited directly; it also sleeps and performs blocking camera/model
        I/O, so it is pushed to the default executor instead of being called
        inline on the event loop. The async callback manager is not forwarded
        because ``_run`` does not use its ``run_manager``.
        """
        import asyncio  # local import: not part of the file's top-level imports

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._run, query, wait_message)

    def capture(self) -> str:
        """Grab one frame from camera index 0 and save it as a JPEG.

        Returns:
            Path of the written image (``images/snapshot.jpg``).

        Raises:
            IOError: If the camera cannot be opened, the frame cannot be
                read, or the image cannot be written.
        """
        cap = cv2.VideoCapture(0)
        try:
            if not cap.isOpened():
                raise IOError("Camera failed to open")
            print("Camera opened successfully")

            # Short pause before reading -- presumably to let the sensor
            # settle (exposure / white balance) before the frame is taken.
            time.sleep(1)

            ret, frame = cap.read()
            if not ret:
                raise IOError("Failed to grab frame")

            img_path = os.path.join("images", "snapshot.jpg")
            # Make sure the target directory exists before writing into it.
            os.makedirs(os.path.dirname(img_path), exist_ok=True)
            # imwrite signals failure through its boolean return value.
            if not cv2.imwrite(img_path, frame):
                raise IOError(f"Failed to write image to: {img_path}")
            print(f"Image saved successfully to: {img_path}")
        finally:
            # Always release the device, including on the error paths above.
            cap.release()
        print("Camera closed successfully")
        return img_path
class ScreenshotInput(BaseModel):
    # Argument schema referenced by ScreenshotTool.args_schema. Both fields
    # are required (Field(...)); the descriptions are instructions aimed at
    # the model that fills in the tool call, so they are runtime text.
    # (Kept as comments rather than a docstring so the generated schema is
    # not altered.)

    # Prompt that is forwarded together with the screenshot.
    query: str = Field(
        ...,
        description="use this as the query: 'give a detailed description of this image, as detailed as possible.'",
    )

    # Message printed to the user before the screenshot is taken.
    wait_message: str = Field(
        ...,
        description="address the user politely and ask the user to hold on while you take a screenshot of the active window.",
    )
class ScreenshotTool(BaseTool):
    """Tool that takes a screenshot and asks the vision model about it.

    The screenshot is written to ``images/screenshot.png`` and that path is
    handed to ``analyze_image`` together with the caller-supplied query.
    """

    # Explicit annotations keep the field overrides valid regardless of how
    # strictly the underlying pydantic version treats un-annotated overrides.
    name: str = "Screenshot Tool"
    description: str = "Useful tool to take a screenshot. ONLY use this tool when asked to look at the SCREEN. Use this tool to screenshot the user's screen. Use this tool if asked to look at the user's screen. Format input as: {{ \"query\": \"...\", \"wait_message\": \"...\" }}."
    args_schema: Type[BaseModel] = ScreenshotInput
    # return_direct=True

    def _run(
        self,
        query: str,
        wait_message: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ):
        """Take a screenshot and return the model's answer to *query*.

        Args:
            query: Prompt forwarded to ``analyze_image``.
            wait_message: Text printed before the screenshot is taken.
            run_manager: Optional callback manager; not used by this tool.
        """
        print(wait_message)
        # vs.piper(str(wait_message))
        image_path = self.capture()
        return analyze_image(image_path, query)

    async def _arun(
        self,
        query: str,
        wait_message: str,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ):
        """Async entry point: run the blocking ``_run`` in a worker thread.

        ``_run`` is a plain function returning a value, so it cannot be
        awaited directly; it is pushed to the default executor so the screen
        grab and model call do not block the event loop. The async callback
        manager is not forwarded because ``_run`` does not use its
        ``run_manager``.
        """
        import asyncio  # local import: not part of the file's top-level imports

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._run, query, wait_message)

    def capture(self) -> str:
        """Grab the screen with ``ImageGrab.grab()`` and save it as a PNG.

        ``grab()`` is called without a bounding box, so no region is selected
        by this code.

        Returns:
            Path of the written image (``images/screenshot.png``).
        """
        img_path = os.path.join("images", "screenshot.png")
        # Make sure the target directory exists; save() does not create it.
        os.makedirs(os.path.dirname(img_path), exist_ok=True)
        ImageGrab.grab().save(img_path)
        return img_path