diff --git a/ros_speech_recognition/README.md b/ros_speech_recognition/README.md
index 266799b94..37ccf657a 100644
--- a/ros_speech_recognition/README.md
+++ b/ros_speech_recognition/README.md
@@ -30,6 +30,106 @@ This package uses Python package [SpeechRecognition](https://pypi.python.org/pyp
print result # => 'Hello, world!'
```
+If you run `ros_speech_recognition` with `~continuous` set to `True`, you can subscribe to the `/Tablet/voice` topic (`speech_recognition_msgs/SpeechRecognitionCandidates`) as follows.
+
+1. Launch the sample launch file.
+
+
+ ```bash
+ roslaunch ros_speech_recognition sample_ros_speech_recognition.launch
+ ```
+
+2. Echo the topic.
+
+
+ ```bash
+ $ rostopic echo /Tablet/voice
+ transcript:
+ - may I help you
+ confidence: [0.9286448955535889]
+ sentences:
+ -
+ header:
+ seq: 0
+ stamp:
+ secs: 1641425262
+ nsecs: 268165588
+ frame_id: ''
+ words:
+ -
+ start_time: 0.0
+ end_time: 0.2
+ word: "may"
+ confidence: 0.91376436
+ speaker_tag: 0
+ -
+ start_time: 0.2
+ end_time: 0.4
+ word: "I"
+ confidence: 0.9366196
+ speaker_tag: 0
+ -
+ start_time: 0.4
+ end_time: 0.5
+ word: "help"
+ confidence: 0.9531065
+ speaker_tag: 0
+ -
+ start_time: 0.5
+ end_time: 0.8
+ word: "you"
+ confidence: 0.9110889
+ speaker_tag: 0
+ ---
+ transcript:
+ - pick up the red kettle
+ confidence: [0.9499567747116089]
+ sentences:
+ -
+ header:
+ seq: 0
+ stamp:
+ secs: 1641425268
+ nsecs: 58182954
+ frame_id: ''
+ words:
+ -
+ start_time: 0.0
+ end_time: 0.4
+ word: "pick"
+ confidence: 0.953269
+ speaker_tag: 0
+ -
+ start_time: 0.4
+ end_time: 0.6
+ word: "up"
+ confidence: 0.95326656
+ speaker_tag: 0
+ -
+ start_time: 0.6
+ end_time: 0.8
+ word: "the"
+ confidence: 0.96866167
+ speaker_tag: 0
+ -
+ start_time: 0.8
+ end_time: 1.1
+ word: "red"
+ confidence: 0.98762906
+ speaker_tag: 0
+ -
+ start_time: 1.1
+ end_time: 1.5
+ word: "kettle"
+ confidence: 0.8869578
+ speaker_tag: 0
+ ```
+
+`word` is the recognized word, and `confidence` is an estimate of how likely the recognized word is correct; a higher number indicates a greater estimated likelihood.
+`start_time` is the time offset relative to the beginning of the audio (the timestamp in the header), corresponding to the start of the spoken word.
+`end_time` is the same offset, corresponding to the end of the spoken word.
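+For example, the fields above can be read with a minimal subscriber like the following sketch (the node name `voice_listener` is an arbitrary choice):
+
+```python
+import rospy
+from speech_recognition_msgs.msg import SpeechRecognitionCandidates
+
+
+def callback(msg):
+    # Sentence-level results
+    for transcript, confidence in zip(msg.transcript, msg.confidence):
+        rospy.loginfo("%s (confidence: %.2f)" % (transcript, confidence))
+    # Word-level results with timing and speaker tags
+    for sentence in msg.sentences:
+        for word in sentence.words:
+            rospy.loginfo(
+                "  %.1f-%.1f[s] %s (confidence: %.2f, speaker: %d)" % (
+                    word.start_time, word.end_time, word.word,
+                    word.confidence, word.speaker_tag))
+
+
+rospy.init_node("voice_listener")
+rospy.Subscriber("/Tablet/voice", SpeechRecognitionCandidates, callback)
+rospy.spin()
+```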
+
+
## Interface
### Publishing Topics
@@ -38,6 +138,11 @@ This package uses Python package [SpeechRecognition](https://pypi.python.org/pyp
Action client to play sound on events. If the action server is not available or `~enable_sound_effect` is `False`, no sound is played.
+
+* `/Tablet/voice` (`speech_recognition_msgs/SpeechRecognitionCandidates`)
+
+  Publishes recognition results when `~continuous` is `True`.
+
### Subscribing Topics
* `audio` (`audio_common_msgs/AudioData`)
diff --git a/ros_speech_recognition/launch/speech_recognition.launch b/ros_speech_recognition/launch/speech_recognition.launch
index f7562a313..5e2c6439c 100644
--- a/ros_speech_recognition/launch/speech_recognition.launch
+++ b/ros_speech_recognition/launch/speech_recognition.launch
@@ -11,6 +11,8 @@
+  <arg name="google_cloud_credentials_json" default="" />
+  <arg name="enable_speaker_diarization" default="false" />
audio_topic: $(arg audio_topic)
@@ -46,6 +49,9 @@
language: $(arg language)
continuous: $(arg continuous)
enable_sound_effect: $(arg launch_sound_play)
+ google_cloud_credentials_json: $(arg google_cloud_credentials_json)
+ diarizationConfig:
+ enableSpeakerDiarization: $(arg enable_speaker_diarization)
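+      # diarizationConfig is read by the node as ~diarizationConfig and
+      # passed through to the Google Cloud Speech API request config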
diff --git a/ros_speech_recognition/sample/data/may_i_help_you.bag b/ros_speech_recognition/sample/data/may_i_help_you.bag
new file mode 100644
index 000000000..457139412
Binary files /dev/null and b/ros_speech_recognition/sample/data/may_i_help_you.bag differ
diff --git a/ros_speech_recognition/sample/sample_ros_speech_recognition.launch b/ros_speech_recognition/sample/sample_ros_speech_recognition.launch
new file mode 100644
index 000000000..5b669d47e
--- /dev/null
+++ b/ros_speech_recognition/sample/sample_ros_speech_recognition.launch
@@ -0,0 +1,23 @@
+<!-- sample launch: plays back sample/data/may_i_help_you.bag and runs speech recognition in continuous mode, publishing results to /Tablet/voice -->
diff --git a/ros_speech_recognition/scripts/speech_recognition_node.py b/ros_speech_recognition/scripts/speech_recognition_node.py
index 8ad9788bf..bad4d3f45 100644
--- a/ros_speech_recognition/scripts/speech_recognition_node.py
+++ b/ros_speech_recognition/scripts/speech_recognition_node.py
@@ -8,13 +8,18 @@
from ros_speech_recognition.recognize_google_cloud import RecognizerEx
import json
import array
+import os
import sys
from threading import Lock
+import numpy as np
+import std_msgs.msg
from audio_common_msgs.msg import AudioData
from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal
+from speech_recognition_msgs.msg import SentenceInfo
from speech_recognition_msgs.msg import SpeechRecognitionCandidates
+from speech_recognition_msgs.msg import WordInfo
from speech_recognition_msgs.srv import SpeechRecognition
from speech_recognition_msgs.srv import SpeechRecognitionResponse
from std_srvs.srv import Empty
@@ -219,6 +224,7 @@ def play_sound(self, key, timeout=5.0):
def recognize(self, audio):
recog_func = None
+ self.enable_diarization = False
if self.engine == Config.SpeechRecognition_Google:
if not self.args:
self.args = {'key': rospy.get_param("~google_key", None)}
@@ -226,15 +232,24 @@ def recognize(self, audio):
elif self.engine == Config.SpeechRecognition_GoogleCloud:
if not self.args:
credentials_path = rospy.get_param("~google_cloud_credentials_json", None)
- if credentials_path is not None:
+ if credentials_path is not None and len(credentials_path) > 0:
+                if not os.path.exists(credentials_path):
+                    rospy.logerr(
+                        'google_cloud_credentials_json '
+                        '{} does not exist.'.format(credentials_path))
+                    sys.exit(1)
with open(credentials_path) as j:
credentials_json = j.read()
else:
credentials_json = None
self.args = {'credentials_json': credentials_json,
- 'preferred_phrases': rospy.get_param('~google_cloud_preferred_phrases', None)}
+ 'preferred_phrases': rospy.get_param('~google_cloud_preferred_phrases', None),
+ 'show_all': True}
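+            # show_all=True makes recognize_google_cloud return the raw API
+            # response dict, from which make_result_message_from_result
+            # extracts per-word timing and confidence.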
if rospy.has_param('~diarizationConfig') :
- self.args.update({'user_config': {'diarizationConfig': rospy.get_param('~diarizationConfig') }})
+ diarizationConfig = rospy.get_param('~diarizationConfig')
+ self.args.update({'user_config': {'diarizationConfig': diarizationConfig}})
+ self.enable_diarization = diarizationConfig.get(
+ 'enableSpeakerDiarization', False)
recog_func = self.recognizer.recognize_google_cloud
elif self.engine == Config.SpeechRecognition_Sphinx:
recog_func = self.recognizer.recognize_sphinx
@@ -251,17 +266,72 @@ def recognize(self, audio):
return recog_func(audio_data=audio, language=self.language, **self.args)
+ def make_result_message_from_result(self, result, header=None):
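+        """Convert a recognition result into a SpeechRecognitionCandidates message.
+
+        For the Google Cloud engine, ``result`` is the raw API response dict
+        (requested with ``show_all=True``) and is parsed into per-sentence
+        and per-word information; for other engines it is a plain transcript
+        string. When speaker diarization is enabled, the transcript is
+        rebuilt from the word list with a ``[speakerTag]`` marker inserted
+        at each speaker change. Returns the message and a transcript string
+        for logging.
+        """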
+        if header is None:
+            header = std_msgs.msg.Header(stamp=rospy.Time.now())
+        if self.engine == Config.SpeechRecognition_GoogleCloud:
+            if "results" not in result or len(result["results"]) == 0:
+                raise SR.UnknownValueError()
+            transcript = []
+            confidence = []
+            sentences = []
+            for res in result["results"]:
+                sent_info_msg = SentenceInfo(header=header)
+                if not self.enable_diarization:
+                    transcript.append(
+                        res["alternatives"][0]["transcript"].strip())
+                    confidence.append(res["alternatives"][0]['confidence'])
+                prev_speaker = None
+                trans = ''
+                confs = []
+                for word in res["alternatives"][0]['words']:
+                    speaker = word.get('speakerTag', 0)
+                    conf = word.get('confidence', 0.0)
+                    # for more details, please see
+                    # https://cloud.google.com/speech-to-text/docs/reference/rest/v1/speech/recognize#wordinfo
+                    word_info_msg = WordInfo(
+                        start_time=float(word.get(
+                            'startTime', '0.0s').rstrip('s')),
+                        end_time=float(word.get(
+                            'endTime', '0.0s').rstrip('s')),
+                        word=word.get('word', ''),
+                        confidence=conf,
+                        speaker_tag=speaker)
+                    sent_info_msg.words.append(word_info_msg)
+                    if self.enable_diarization and prev_speaker != speaker:
+                        trans += "[{}]".format(speaker)
+                        prev_speaker = speaker
+                    trans += ' ' + word['word']
+                    confs.append(conf)
+                if self.enable_diarization:
+                    transcript.append(trans)
+                    confidence.append(np.mean(confs))
+                sentences.append(sent_info_msg)
+            msg = SpeechRecognitionCandidates(
+                transcript=transcript,
+                confidence=confidence,
+                sentences=sentences)
+            transcript = " ".join(transcript)
+        else:
+            transcript = result
+            msg = SpeechRecognitionCandidates(
+                transcript=[transcript])
+        return msg, transcript
+
+
def audio_cb(self, _, audio):
if not self.enable_audio_cb:
return
try:
rospy.logdebug("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data()))
+ header = std_msgs.msg.Header(stamp=rospy.Time.now())
result = self.recognize(audio)
self.play_sound("recognized", 0.05)
- rospy.loginfo("Result: %s" % result.encode('utf-8'))
- self.play_sound("success", 0.1)
- msg = SpeechRecognitionCandidates(transcript=[result])
+ msg, transcript = self.make_result_message_from_result(
+ result, header=header)
+ rospy.loginfo("Result: %s" % transcript)
self.pub.publish(msg)
+ self.play_sound("success", 0.1)
return
except SR.UnknownValueError as e:
if self.dynamic_energy_threshold:
@@ -322,11 +392,14 @@ def speech_recognition_srv_cb(self, req):
rospy.loginfo("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data()))
try:
+ header = std_msgs.msg.Header(stamp=rospy.Time.now())
result = self.recognize(audio)
-        rospy.loginfo("Result: %s" % result.encode('utf-8'))
if not req.quiet:
self.play_sound("success", 0.1)
-        res.result = SpeechRecognitionCandidates(transcript=[result])
+        msg, transcript = self.make_result_message_from_result(
+            result, header=header)
+        rospy.loginfo("Result: %s" % transcript)
+        res.result = msg
return res
except SR.UnknownValueError:
if self.dynamic_energy_threshold:
diff --git a/ros_speech_recognition/src/ros_speech_recognition/recognize_google_cloud.py b/ros_speech_recognition/src/ros_speech_recognition/recognize_google_cloud.py
index e18644087..16aa09263 100644
--- a/ros_speech_recognition/src/ros_speech_recognition/recognize_google_cloud.py
+++ b/ros_speech_recognition/src/ros_speech_recognition/recognize_google_cloud.py
@@ -74,6 +74,7 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en
speech_config["speechContext"] = {"phrases": preferred_phrases}
if show_all:
speech_config["enableWordTimeOffsets"] = True # some useful extra options for when we want all the output
+            speech_config["enableWordConfidence"] = True
request = speech_service.speech().recognize(body={"audio": {"content": base64.b64encode(flac_data).decode("utf8")}, "config": speech_config})
try:
diff --git a/ros_speech_recognition/test/sample_ros_speech_recognition.test b/ros_speech_recognition/test/sample_ros_speech_recognition.test
index 2451fbe16..6b8c2e3ff 100644
--- a/ros_speech_recognition/test/sample_ros_speech_recognition.test
+++ b/ros_speech_recognition/test/sample_ros_speech_recognition.test
@@ -1,4 +1,8 @@
+
+
+
+
@@ -9,10 +13,12 @@
+
+