-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrun.py
99 lines (77 loc) · 2.46 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from whisper_mic import WhisperMic
import cv2
from openai import OpenAI
import base64
from Streamer import Streamer
import pyaudio
import numpy as np
import os
from datetime import datetime
from dotenv import load_dotenv
load_dotenv()
# Voice-driven camera assistant: listen for a spoken question, photograph the
# scene, ask a vision model about it, and speak the answer back.

CAMERA_INDEX = 0  # 0 is the default camera
IMAGE_PATH = 'captured_image.jpg'
VISION_MODEL = "gpt-4-vision-preview"
MAX_RESPONSE_TOKENS = 300
# Output stream is opened as 16-bit mono at this rate.
# NOTE(review): presumably this matches the audio format Streamer yields — confirm.
PLAYBACK_RATE = 22050


def _capture_image_base64():
    """Capture one camera frame and return it as a Base64-encoded JPEG string.

    The frame is rotated by 180 degrees and written to IMAGE_PATH before
    being read back and encoded.

    Returns:
        The Base64 text of the saved JPEG, or None when the frame could not
        be read or saved. Returning None (instead of falling through) keeps
        the caller from sending a stale or missing image file.
    """
    cap = cv2.VideoCapture(CAMERA_INDEX)
    try:
        ret, frame = cap.read()
    finally:
        # Release the camera even if the read raises.
        cap.release()

    if not ret:
        print("Error: Could not capture an image.")
        return None

    rotated_frame = cv2.rotate(frame, cv2.ROTATE_180)
    if not cv2.imwrite(IMAGE_PATH, rotated_frame):
        print("Error: Could not save the captured image.")
        return None

    with open(IMAGE_PATH, 'rb') as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def _ask_about_image(client, question, image_base64):
    """Send the question and the image to the vision model.

    Args:
        client: An OpenAI client instance.
        question: The transcribed user question; ". Be Concise." is appended.
        image_base64: Base64 text of a JPEG image, embedded as a data URL.

    Returns:
        The content of the first choice's message (may be None or empty).
    """
    response = client.chat.completions.create(
        model=VISION_MODEL,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question + ". Be Concise."},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_base64}",
                        },
                    },
                ],
            }
        ],
        max_tokens=MAX_RESPONSE_TOKENS,
    )
    return response.choices[0].message.content


def _speak(text: str) -> None:
    """Stream synthesized speech for *text* to the default audio output.

    Credentials and voice selection are read from the RESEMBLE_API_KEY,
    RESEMBLE_PROJECT and RESEMBLE_VOICE environment variables.
    """
    # NOTE(review): "&" and "!" are rewritten before synthesis; presumably
    # they cause problems in the TTS request — confirm against Streamer.
    text = text.replace("&", "and").replace("!", " ")

    streamer = Streamer(os.environ.get("RESEMBLE_API_KEY"))
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=PLAYBACK_RATE,
                        output=True)
        try:
            for audio_chunk in streamer.stream(
                os.environ.get("RESEMBLE_PROJECT"),
                os.environ.get("RESEMBLE_VOICE"),
                text,
            ):
                stream.write(bytes(audio_chunk))
            stream.stop_stream()
        finally:
            # Always close the stream so the audio device is not left open.
            stream.close()
    finally:
        p.terminate()


mic = WhisperMic(pause=0.7)
# One client is enough for the whole session; no need to rebuild it per turn.
client = OpenAI()

while True:
    result = mic.listen()
    print(result)
    # Nothing usable was transcribed: skip the camera and the API call.
    if not result or not result.strip():
        continue

    base64_string = _capture_image_base64()
    if base64_string is None:
        # Capture failed; wait for the next utterance rather than sending
        # whatever image file happens to be left on disk.
        continue

    ai_resp = _ask_about_image(client, result, base64_string)
    if not ai_resp:
        print("Error: The model returned no text.")
        continue

    print("\033[92m" + ai_resp + "\033[0m")
    _speak(ai_resp)