main.py
from gpiozero import Button
from picamera import PiCamera
from signal import pause
import wave
import pyaudio
from google.oauth2 import service_account
import vertexai
from vertexai.vision_models import ImageTextModel, Image
from pydub import AudioSegment
from pydub.playback import play
from google.cloud import speech
from google.cloud import texttospeech
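
# Raspberry Pi camera assistant: button 2 speaks a Hindi caption of the current
# scene; button 27 records a spoken question about the scene and speaks the answer.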
SCOPES = ['https://www.googleapis.com/auth/cloud-platform']
SERVICE_ACCOUNT_FILE = 'keys.json'
PROJECT_ID = 'groovy-height-411217'
LOCATION = 'us-central1'
credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=SCOPES)
vertexai.init(project=PROJECT_ID, location=LOCATION, credentials=credentials)
model = ImageTextModel.from_pretrained("imagetext@001")
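# imagetext@001 is Vertex AI's image captioning / visual question answering
# model; one ImageTextModel instance serves both get_captions() and ask_question().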
# Module-level TTS audio settings, shared by both button handlers below.
audio_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.LINEAR16,
    speaking_rate=1,
)
print("Initialized")
button2 = Button(2)    # BCM pin 2: describe the scene
button27 = Button(27)  # BCM pin 27: ask a question about the scene
camera = PiCamera()
camera.start_preview(alpha=192)  # semi-transparent preview overlay
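
# Caption flow: photograph the scene, caption it in Hindi with the imagetext
# model, then synthesize and play the caption as speech.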
def getDesc():
    print("Button 2 clicked")
    camera.capture('pic.jpg')
    source_image = Image.load_from_file(location='./pic.jpg')
    print("Request sent")
    captions = model.get_captions(
        image=source_image,
        number_of_results=1,
        language="hi",  # caption in Hindi
    )
    print(captions)
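    # get_captions() returns a list of caption strings; only the first is spoken.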
"""Synthesizes speech from the input string of text."""
client = texttospeech.TextToSpeechClient(credentials=credentials)
input_text = texttospeech.SynthesisInput(text=captions[0])
# Note: the voice can also be specified by name.
# Names of voices can be retrieved with client.list_voices().
voice = texttospeech.VoiceSelectionParams(
language_code="hi-in",
name="hi-IN-Neural2-A",
)
audio_config = texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.LINEAR16,
speaking_rate=1
)
    response = client.synthesize_speech(
        request={"input": input_text, "voice": voice, "audio_config": audio_config}
    )
    # The response's audio_content is binary LINEAR16 (WAV) data, so save it
    # with a .wav extension and play it with pydub's WAV loader.
    with open("output.wav", "wb") as out:
        out.write(response.audio_content)
    print('Audio content written to file "output.wav"')
    song = AudioSegment.from_wav("output.wav")
    play(song)
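
# Question flow: hold button 27 to record a spoken question about the scene,
# transcribe it with Speech-to-Text, answer it with visual question answering,
# then speak the answer back in Hindi.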
def getQnA():
    print("Button 27 clicked")
    camera.capture('pic.jpg')
    source_image = Image.load_from_file(location='./pic.jpg')
    # Record the question from the microphone while the button is held.
    filename = "recorded.wav"
    chunk = 1024              # frames per buffer
    FORMAT = pyaudio.paInt16  # 16-bit samples
    channels = 1              # mono; change to 2 for stereo
    sample_rate = 44100       # samples per second
    p = pyaudio.PyAudio()
    # Open the stream as input only; we are just recording.
    stream = p.open(format=FORMAT, channels=channels, rate=sample_rate,
                    input=True, frames_per_buffer=chunk)
    frames = []
    print("Recording...")
    while button27.is_pressed:
        data = stream.read(chunk)
        frames.append(data)
    print("Finished recording.")
    # Stop and close the stream, then terminate the PyAudio object.
    stream.stop_stream()
    stream.close()
    p.terminate()
    # Save the recording as a WAV file.
    wf = wave.open(filename, "wb")
    wf.setnchannels(channels)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(sample_rate)
    wf.writeframes(b"".join(frames))
    wf.close()
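    # The WAV just written is 16-bit mono at 44.1 kHz, matching the LINEAR16
    # RecognitionConfig below.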
    # Use the Speech-to-Text library to extract the question.
    client = speech.SpeechClient(credentials=credentials)
    with open(filename, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        language_code="hi-IN",
    )
    # Detect speech in the audio file.
    response = client.recognize(config=config, audio=audio)
    answers = model.ask_question(
        image=source_image,
        question=response.results[0].alternatives[0].transcript,
        number_of_results=1,
    )
    print(answers)
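    # ask_question() returns a list of short answer strings; speak the first
    # one, reusing the module-level audio_config.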
    client = texttospeech.TextToSpeechClient(credentials=credentials)
    input_text = texttospeech.SynthesisInput(text=answers[0])
    voice = texttospeech.VoiceSelectionParams(
        language_code="hi-IN",
        name="hi-IN-Neural2-A",
    )
    response = client.synthesize_speech(
        request={"input": input_text, "voice": voice, "audio_config": audio_config}
    )
    with open("output.wav", "wb") as out:
        out.write(response.audio_content)
    print('Audio content written to file "output.wav"')
    song = AudioSegment.from_wav("output.wav")
    play(song)

# Wire the button handlers and wait for events.
button2.when_pressed = getDesc
button27.when_pressed = getQnA
pause()