diff --git a/ros_speech_recognition/README.md b/ros_speech_recognition/README.md index 266799b94..37ccf657a 100644 --- a/ros_speech_recognition/README.md +++ b/ros_speech_recognition/README.md @@ -30,6 +30,106 @@ This package uses Python package [SpeechRecognition](https://pypi.python.org/pyp print result # => 'Hello, world!' ``` +If you are using `ros_speech_recognition` with `~continuous` is `True`, you can subscribe `/Tablet/voice` (`speech_recognition_msgs/SpeechRecognitionCandidates`) message. + +1. Launch sample launch file. + + + ```bash + roslaunch ros_speech_recognition sample_ros_speech_recognition.launch + ``` + +2. echo the message. + + + ```bash + $ rostopic echo /Tablet/voice + transcript: + - may I help you + confidence: [0.9286448955535889] + sentences: + - + header: + seq: 0 + stamp: + secs: 1641425262 + nsecs: 268165588 + frame_id: '' + words: + - + start_time: 0.0 + end_time: 0.2 + word: "may" + confidence: 0.91376436 + speaker_tag: 0 + - + start_time: 0.2 + end_time: 0.4 + word: "I" + confidence: 0.9366196 + speaker_tag: 0 + - + start_time: 0.4 + end_time: 0.5 + word: "help" + confidence: 0.9531065 + speaker_tag: 0 + - + start_time: 0.5 + end_time: 0.8 + word: "you" + confidence: 0.9110889 + speaker_tag: 0 + --- + transcript: + - pick up the red kettle + confidence: [0.9499567747116089] + sentences: + - + header: + seq: 0 + stamp: + secs: 1641425268 + nsecs: 58182954 + frame_id: '' + words: + - + start_time: 0.0 + end_time: 0.4 + word: "pick" + confidence: 0.953269 + speaker_tag: 0 + - + start_time: 0.4 + end_time: 0.6 + word: "up" + confidence: 0.95326656 + speaker_tag: 0 + - + start_time: 0.6 + end_time: 0.8 + word: "the" + confidence: 0.96866167 + speaker_tag: 0 + - + start_time: 0.8 + end_time: 1.1 + word: "red" + confidence: 0.98762906 + speaker_tag: 0 + - + start_time: 1.1 + end_time: 1.5 + word: "kettle" + confidence: 0.8869578 + speaker_tag: 0 + ``` + +The `word` is recognized word and the `confidence` means a higher number indicates an 
estimated greater likelihood that the recognized words are correct. +`start_time` indicates time offset relative to the beginning of the audio (timestamp of header), and corresponds to the start of the spoken word. +`end_time` indicates time offset relative to the beginning of the audio, and corresponds to the end of the spoken word. + + ## Interface ### Publishing Topics @@ -38,6 +138,11 @@ This package uses Python package [SpeechRecognition](https://pypi.python.org/pyp Action client to play sound on events. If the action server is not available or `~enable_sound_effect` is `False`, no sound is played. + +* `/Tablet/voice` (`speech_recognition_msgs/SpeechRecognitionCandidates`) + + Publishes recognized results when `~continuous` is `True`. + ### Subscribing Topics * `audio` (`audio_common_msgs/AudioData`) diff --git a/ros_speech_recognition/launch/speech_recognition.launch b/ros_speech_recognition/launch/speech_recognition.launch index f7562a313..5e2c6439c 100644 --- a/ros_speech_recognition/launch/speech_recognition.launch +++ b/ros_speech_recognition/launch/speech_recognition.launch @@ -11,6 +11,8 @@ + + audio_topic: $(arg audio_topic) @@ -46,6 +49,9 @@ language: $(arg language) continuous: $(arg continuous) enable_sound_effect: $(arg launch_sound_play) + google_cloud_credentials_json: $(arg google_cloud_credentials_json) + diarizationConfig: + enableSpeakerDiarization: $(arg enable_speaker_diarization)
+ + + + + + + + + + + + + + + + + + diff --git a/ros_speech_recognition/scripts/speech_recognition_node.py b/ros_speech_recognition/scripts/speech_recognition_node.py index 8ad9788bf..bad4d3f45 100644 --- a/ros_speech_recognition/scripts/speech_recognition_node.py +++ b/ros_speech_recognition/scripts/speech_recognition_node.py @@ -8,13 +8,18 @@ from ros_speech_recognition.recognize_google_cloud import RecognizerEx import json import array +import os import sys from threading import Lock +import numpy as np +import std_msgs.msg from audio_common_msgs.msg import AudioData from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal +from speech_recognition_msgs.msg import SentenceInfo from speech_recognition_msgs.msg import SpeechRecognitionCandidates +from speech_recognition_msgs.msg import WordInfo from speech_recognition_msgs.srv import SpeechRecognition from speech_recognition_msgs.srv import SpeechRecognitionResponse from std_srvs.srv import Empty @@ -219,6 +224,7 @@ def play_sound(self, key, timeout=5.0): def recognize(self, audio): recog_func = None + self.enable_diarization = False if self.engine == Config.SpeechRecognition_Google: if not self.args: self.args = {'key': rospy.get_param("~google_key", None)} @@ -226,15 +232,24 @@ def recognize(self, audio): elif self.engine == Config.SpeechRecognition_GoogleCloud: if not self.args: credentials_path = rospy.get_param("~google_cloud_credentials_json", None) - if credentials_path is not None: + if credentials_path is not None and len(credentials_path) > 0: + if os.path.exists(credentials_path) is False: + rospy.logerr( + 'google_cloud_credentials_json ' + '{} not exists.'.format(credentials_path)) + sys.exit(1) with open(credentials_path) as j: credentials_json = j.read() else: credentials_json = None self.args = {'credentials_json': credentials_json, - 'preferred_phrases': rospy.get_param('~google_cloud_preferred_phrases', None)} + 'preferred_phrases': 
rospy.get_param('~google_cloud_preferred_phrases', None), + 'show_all': True} if rospy.has_param('~diarizationConfig') : - self.args.update({'user_config': {'diarizationConfig': rospy.get_param('~diarizationConfig') }}) + diarizationConfig = rospy.get_param('~diarizationConfig') + self.args.update({'user_config': {'diarizationConfig': diarizationConfig}}) + self.enable_diarization = diarizationConfig.get( + 'enableSpeakerDiarization', False) recog_func = self.recognizer.recognize_google_cloud elif self.engine == Config.SpeechRecognition_Sphinx: recog_func = self.recognizer.recognize_sphinx @@ -251,17 +266,72 @@ def recognize(self, audio): return recog_func(audio_data=audio, language=self.language, **self.args) + def make_result_message_from_result(self, result, header=None): + if header is None: + header = std_msgs.msg.Header(stamp=rospy.Time.now()) + if self.engine == Config.SpeechRecognition_GoogleCloud: + if "results" not in result or len(result["results"]) == 0: + raise SR.UnknownValueError() + transcript = [] + confidence = [] + sentences = [] + for res in result["results"]: + sent_info_msg = SentenceInfo(header=header) + if self.enable_diarization is False: + transcript.append( + res["alternatives"][0]["transcript"].strip()) + confidence.append(res["alternatives"][0]['confidence']) + prev_speaker = None + trans = '' + confs = [] + for word in res["alternatives"][0]['words']: + speaker = word.get('speakerTag', 0) + conf = word.get('confidence', 0.0) + # for more details, please see + # https://cloud.google.com/speech-to-text/docs/reference/rest/v1/speech/recognize#wordinfo + word_info_msg = WordInfo( + start_time=float(word.get( + 'startTime', '0.0s').rstrip('s')), + end_time=float(word.get( + 'endTime', '0.0s').rstrip('s')), + word=word.get('word', ''), + confidence=conf, + speaker_tag=speaker) + sent_info_msg.words.append(word_info_msg) + if self.enable_diarization is True \ + and prev_speaker != speaker: + trans += "[{}]".format(speaker) + prev_speaker = 
speaker + trans += ' ' + word['word'] + confs.append(conf) + if self.enable_diarization is True: + transcript.append(trans) + confidence.append(np.mean(confs)) + sentences.append(sent_info_msg) + msg = SpeechRecognitionCandidates( + transcript=transcript, + confidence=confidence, + sentences=sentences) + transcript = " ".join(transcript) + else: + transcript = result + msg = SpeechRecognitionCandidates( + transcript=[transcript]) + return msg, transcript + def audio_cb(self, _, audio): if not self.enable_audio_cb: return try: rospy.logdebug("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data())) + header = std_msgs.msg.Header(stamp=rospy.Time.now()) result = self.recognize(audio) self.play_sound("recognized", 0.05) - rospy.loginfo("Result: %s" % result.encode('utf-8')) - self.play_sound("success", 0.1) - msg = SpeechRecognitionCandidates(transcript=[result]) + msg, transcript = self.make_result_message_from_result( + result, header=header) + rospy.loginfo("Result: %s" % transcript) self.pub.publish(msg) + self.play_sound("success", 0.1) return except SR.UnknownValueError as e: if self.dynamic_energy_threshold: @@ -322,11 +392,14 @@ def speech_recognition_srv_cb(self, req): rospy.loginfo("Waiting for result... 
(Sent %d bytes)" % len(audio.get_raw_data())) try: + header = std_msgs.msg.Header(stamp=rospy.Time.now()) result = self.recognize(audio) rospy.loginfo("Result: %s" % result.encode('utf-8')) if not req.quiet: self.play_sound("success", 0.1) - res.result = SpeechRecognitionCandidates(transcript=[result]) + msg, _ = self.make_result_message_from_result( + result, header=header) + res.result = msg return res except SR.UnknownValueError: if self.dynamic_energy_threshold: diff --git a/ros_speech_recognition/src/ros_speech_recognition/recognize_google_cloud.py b/ros_speech_recognition/src/ros_speech_recognition/recognize_google_cloud.py index e18644087..16aa09263 100644 --- a/ros_speech_recognition/src/ros_speech_recognition/recognize_google_cloud.py +++ b/ros_speech_recognition/src/ros_speech_recognition/recognize_google_cloud.py @@ -74,6 +74,7 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en speech_config["speechContext"] = {"phrases": preferred_phrases} if show_all: speech_config["enableWordTimeOffsets"] = True # some useful extra options for when we want all the output + speech_config["enableWordConfidence"] = True request = speech_service.speech().recognize(body={"audio": {"content": base64.b64encode(flac_data).decode("utf8")}, "config": speech_config}) try: diff --git a/ros_speech_recognition/test/sample_ros_speech_recognition.test b/ros_speech_recognition/test/sample_ros_speech_recognition.test index 2451fbe16..6b8c2e3ff 100644 --- a/ros_speech_recognition/test/sample_ros_speech_recognition.test +++ b/ros_speech_recognition/test/sample_ros_speech_recognition.test @@ -1,4 +1,8 @@ + + + + @@ -9,10 +13,12 @@ + +