# Copyright 2016 Mycroft AI, Inc.
#
# This file is part of Mycroft Core.
#
# Mycroft Core is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Mycroft Core is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Mycroft Core. If not, see <http://www.gnu.org/licenses/>.
import collections
import audioop
from time import sleep
import pyaudio
from speech_recognition import (
Microphone,
AudioSource,
AudioData
)
import speech_recognition
# Multipliers used when adapting the energy threshold
# (see ResponsiveRecognizer.adjust_threshold)
threshold_multiplier = 1.0
dynamic_energy_ratio = 1.5
__author__ = 'seanfitz'
class MutableStream(object):
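    """
    Wraps a pyaudio stream and adds mute support: while muted, read()
    still drains the underlying stream but returns silence instead of
    the captured audio.
    """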
def __init__(self, wrapped_stream, format, muted=False):
assert wrapped_stream is not None
self.wrapped_stream = wrapped_stream
self.muted = muted
self.SAMPLE_WIDTH = pyaudio.get_sample_size(format)
        self.muted_buffer = b'\x00' * self.SAMPLE_WIDTH  # one silent sample
def mute(self):
self.muted = True
def unmute(self):
self.muted = False
def read(self, size):
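        """
        Read ``size`` bytes from the wrapped stream, sleeping briefly
        whenever no new data is available yet. Returns a silent sample
        instead of the captured audio while muted.
        """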
frames = collections.deque()
remaining = size
while remaining > 0:
to_read = min(self.wrapped_stream.get_read_available(), remaining)
if to_read == 0:
sleep(.01)
continue
result = self.wrapped_stream.read(to_read)
frames.append(result)
remaining -= to_read
if self.muted:
return self.muted_buffer
input_latency = self.wrapped_stream.get_input_latency()
if input_latency > 0.2:
print("High input latency: %f" % input_latency)
audio = b"".join(list(frames))
return audio
def close(self):
self.wrapped_stream.close()
self.wrapped_stream = None
def is_stopped(self):
return self.wrapped_stream.is_stopped()
def stop_stream(self):
return self.wrapped_stream.stop_stream()
class MutableMicrophone(Microphone):
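    """
    A speech_recognition.Microphone whose underlying stream can be
    muted and unmuted while the source is open.
    """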
def __init__(self, device_index=None, sample_rate=16000, chunk_size=1024):
Microphone.__init__(
self, device_index=device_index, sample_rate=sample_rate,
chunk_size=chunk_size)
self.muted = False
def __enter__(self):
assert self.stream is None, \
"This audio source is already inside a context manager"
self.audio = pyaudio.PyAudio()
self.stream = MutableStream(self.audio.open(
input_device_index=self.device_index, channels=1,
format=self.format, rate=self.SAMPLE_RATE,
frames_per_buffer=self.CHUNK,
input=True, # stream is an input stream
), self.format, self.muted)
return self
def __exit__(self, exc_type, exc_value, traceback):
if not self.stream.is_stopped():
self.stream.stop_stream()
self.stream.close()
self.stream = None
self.audio.terminate()
def mute(self):
self.muted = True
if self.stream:
self.stream.mute()
def unmute(self):
self.muted = False
if self.stream:
self.stream.unmute()
class ResponsiveRecognizer(speech_recognition.Recognizer):
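    """
    A speech_recognition.Recognizer that waits for a wake word before
    recording a phrase, adapting its energy threshold to the ambient
    noise level while it listens.
    """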
# The maximum audio in seconds to keep for transcribing a phrase
# The wake word must fit in this time
SAVED_WW_SEC = 1.0
# Padding of silence when feeding to pocketsphinx
SILENCE_SEC = 0.01
# The minimum seconds of noise before a
# phrase can be considered complete
MIN_LOUD_SEC_PER_PHRASE = 0.1
    # The maximum seconds a phrase can be recorded,
    # provided there is noise the entire time
RECORDING_TIMEOUT = 30.0
# The maximum time it will continue to record silence
# when not enough noise has been detected
RECORDING_TIMEOUT_WITH_SILENCE = 3.0
# Time between pocketsphinx checks for the wake word
SEC_BETWEEN_WW_CHECKS = 0.2
def __init__(self, wake_word_recognizer):
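        """
        :param wake_word_recognizer: recognizer exposing transcribe()
            and found_wake_word(), as used by wake_word_in_audio()
        """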
speech_recognition.Recognizer.__init__(self)
self.wake_word_recognizer = wake_word_recognizer
self.audio = pyaudio.PyAudio()
        self.threshold_multiplier = threshold_multiplier
        self.dynamic_energy_ratio = dynamic_energy_ratio
@staticmethod
def record_sound_chunk(source):
return source.stream.read(source.CHUNK)
@staticmethod
def calc_energy(sound_chunk, sample_width):
return audioop.rms(sound_chunk, sample_width)
def wake_word_in_audio(self, frame_data):
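        """
        Check whether the wake word occurs in the given audio.

        :param frame_data: raw audio bytes to transcribe
        :return: True if the wake word was found in the hypothesis
        """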
hyp = self.wake_word_recognizer.transcribe(frame_data)
return self.wake_word_recognizer.found_wake_word(hyp)
def record_phrase(self, source, sec_per_buffer):
"""
This attempts to record an entire spoken phrase. Essentially,
this waits for a period of silence and then returns the audio
:rtype: bytearray
:param source: AudioSource
:param sec_per_buffer: Based on source.SAMPLE_RATE
:return: bytearray representing the frame_data of the recorded phrase
"""
num_loud_chunks = 0
noise = 0
max_noise = 25
min_noise = 0
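        # 'noise' is a leaky accumulator: it rises while chunks are loud
        # and decays during silence, so the phrase is considered complete
        # once it drains back to min_noise after enough loud chunks.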
def increase_noise(level):
if level < max_noise:
return level + 200 * sec_per_buffer
return level
def decrease_noise(level):
if level > min_noise:
return level - 100 * sec_per_buffer
return level
# Smallest number of loud chunks required to return
min_loud_chunks = int(self.MIN_LOUD_SEC_PER_PHRASE / sec_per_buffer)
# Maximum number of chunks to record before timing out
max_chunks = int(self.RECORDING_TIMEOUT / sec_per_buffer)
num_chunks = 0
        # Will return if this is exceeded, even if there aren't enough
        # loud chunks
max_chunks_of_silence = int(self.RECORDING_TIMEOUT_WITH_SILENCE /
sec_per_buffer)
        # Buffer to store the recorded audio in (starts with one
        # silent sample)
        byte_data = b'\0' * source.SAMPLE_WIDTH
phrase_complete = False
while num_chunks < max_chunks and not phrase_complete:
chunk = self.record_sound_chunk(source)
byte_data += chunk
num_chunks += 1
energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
test_threshold = self.energy_threshold * self.threshold_multiplier
is_loud = energy > test_threshold
if is_loud:
noise = increase_noise(noise)
num_loud_chunks += 1
else:
noise = decrease_noise(noise)
self.adjust_threshold(energy, sec_per_buffer)
was_loud_enough = num_loud_chunks > min_loud_chunks
quiet_enough = noise <= min_noise
recorded_too_much_silence = num_chunks > max_chunks_of_silence
if quiet_enough and (was_loud_enough or recorded_too_much_silence):
phrase_complete = True
return byte_data
@staticmethod
def sec_to_bytes(sec, source):
return sec * source.SAMPLE_RATE * source.SAMPLE_WIDTH
def wait_until_wake_word(self, source, sec_per_buffer):
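        """
        Block until the wake word is heard.

        Keeps a rolling buffer of the last SAVED_WW_SEC seconds of
        audio and, roughly every SEC_BETWEEN_WW_CHECKS seconds, feeds
        it (padded with silence) to the wake word recognizer.
        """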
num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
source.SAMPLE_WIDTH)
        silence = b'\0' * num_silent_bytes
        # Buffer to store the rolling audio in
        byte_data = silence
buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer
buffers_since_check = 0.0
# Max bytes for byte_data before audio is removed from the front
max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)
said_wake_word = False
while not said_wake_word:
chunk = self.record_sound_chunk(source)
energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
if energy < self.energy_threshold * self.threshold_multiplier:
self.adjust_threshold(energy, sec_per_buffer)
needs_to_grow = len(byte_data) < max_size
if needs_to_grow:
byte_data += chunk
else: # Remove beginning of audio and add new chunk to end
byte_data = byte_data[len(chunk):] + chunk
            buffers_since_check += 1.0
            if buffers_since_check > buffers_per_check:
                buffers_since_check -= buffers_per_check
                said_wake_word = self.wake_word_in_audio(byte_data + silence)
@staticmethod
def create_audio_data(raw_data, source):
"""
Constructs an AudioData instance with the same parameters
as the source and the specified frame_data
"""
return AudioData(raw_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
def listen(self, source):
"""
Listens for audio that Mycroft should respond to
:param source: an ``AudioSource`` instance for reading from
"""
assert isinstance(source, AudioSource), "Source must be an AudioSource"
bytes_per_sec = source.SAMPLE_RATE * source.SAMPLE_WIDTH
sec_per_buffer = float(source.CHUNK) / bytes_per_sec
print("Waiting for wake word...")
self.wait_until_wake_word(source, sec_per_buffer)
print("Recording...")
frame_data = self.record_phrase(source, sec_per_buffer)
audio_data = self.create_audio_data(frame_data, source)
print("Thinking...")
return audio_data
def adjust_threshold(self, energy, seconds_per_buffer):
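        """
        Adapt energy_threshold toward the ambient energy level using an
        exponential moving average, so that adaptation speed does not
        depend on the chunk size:

            threshold = threshold * damping + target * (1 - damping)
        """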
if self.dynamic_energy_threshold and energy > 0:
# account for different chunk sizes and rates
damping = (
self.dynamic_energy_adjustment_damping ** seconds_per_buffer)
target_energy = energy * self.dynamic_energy_ratio
self.energy_threshold = (
self.energy_threshold * damping +
target_energy * (1 - damping))
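

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). It wires the
# classes above together with DummyWakeWordRecognizer, a hypothetical
# stand-in for a real pocketsphinx-based recognizer; a real recognizer
# must provide transcribe() and found_wake_word() as used in
# wake_word_in_audio(). Requires a working audio input device.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    class DummyWakeWordRecognizer(object):
        def transcribe(self, frame_data):
            # A real recognizer would return a pocketsphinx hypothesis
            return None

        def found_wake_word(self, hypothesis):
            # Always "hears" the wake word, so listen() starts recording
            # as soon as the first wake word check runs
            return True

    recognizer = ResponsiveRecognizer(DummyWakeWordRecognizer())
    with MutableMicrophone() as mic:
        audio = recognizer.listen(mic)  # blocks until a phrase is captured
        print("Captured %d bytes of audio" % len(audio.frame_data))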