[ros_speech_recognition] Add WordInfo to SpeechRecognitionCandidates message. #320

Open
wants to merge 14 commits into base: master
105 changes: 105 additions & 0 deletions ros_speech_recognition/README.md
@@ -30,6 +30,106 @@ This package uses Python package [SpeechRecognition](https://pypi.python.org/pyp
print result # => 'Hello, world!'
```

If you are running `ros_speech_recognition` with `~continuous` set to `True`, you can subscribe to the `/Tablet/voice` (`speech_recognition_msgs/SpeechRecognitionCandidates`) topic.
Member commented:
@iory

You changed the content of speech_recognition_msgs/SpeechRecognitionCandidates in jsk-ros-pkg/jsk_common_msgs#28.

In my opinion, we should create a new message type such as speech_recognition_msgs/GoogleCloudSpeechRecognitionCandidates in addition to the existing speech_recognition_msgs/SpeechRecognitionCandidates.

This is because the new fields (e.g. wordinfo) seem to be specific to Google Cloud Speech-to-Text.

This is just my preference, but how about wrapping the common message in a message for each speech-to-text service?
The advantage of this approach is that we would not need to change the current speech_recognition_msgs/SpeechRecognitionCandidates.

For example,

```bash
$ rosmsg show speech_recognition_msgs/GoogleCloudSpeechRecognitionCandidates
Header header
speech_recognition_msgs/SpeechRecognitionCandidates candidates
speech_recognition_msgs/SentenceInfo[] sentences
```

In addition, please consider that julius_ros also publishes speech_recognition_msgs/SpeechRecognitionCandidates:
https://github.com/jsk-ros-pkg/jsk_3rdparty/tree/master/julius_ros#gmm-version

I'd like to hear your thoughts. @iory

Member (author) replied:

> This is because the new fields (e.g. wordinfo) seem to be specific to Google Cloud Speech-to-Text.

This is not specific to Google Cloud Speech-to-Text.
For example, Azure Cognitive Services has similar functionality:
https://docs.microsoft.com/ja-jp/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechconfig?view=azure-python#request-word-level-timestamps--

I think the start time and end time of each word are general values for a speech recognition framework.

> This is my preference, but how about wrapping the common message by each speech-to-text service message?
> The advantage of this method is we do not need to change the current speech_recognition_msgs/SpeechRecognitionCandidates.

However, this is a good way to avoid affecting other users, so I'll take this direction.


1. Launch the sample launch file.
Member commented:

How about adding the Google Cloud Speech-to-Text URL to README.md too?
It would be helpful:
https://cloud.google.com/speech-to-text/docs/reference/rest/v1/speech/recognize#wordinfo
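For context, here is a minimal sketch of the REST response shape described at that URL, with the fields the node in this PR reads (`results`, `alternatives`, `words`, `startTime`/`endTime`); the values are illustrative only, copied from the sample output later in this README.

```python
# Sketch (assumption): shape of a Google Cloud Speech-to-Text `recognize`
# response that includes WordInfo entries. Values are illustrative.
result = {
    "results": [{
        "alternatives": [{
            "transcript": "may I help you",
            "confidence": 0.9286449,
            "words": [
                # durations are strings with a trailing "s"
                {"startTime": "0s", "endTime": "0.2s",
                 "word": "may", "confidence": 0.91376436},
            ],
        }],
    }],
}

# The node takes the first (best) alternative of each result.
best = result["results"][0]["alternatives"][0]
print(best["transcript"])        # may I help you
print(best["words"][0]["word"])  # may
```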

Member commented:

Thank you very much for the documentation.

In addition to the engine:=GoogleCloud example, it would be very helpful if you could add an engine:=Google example.



```bash
roslaunch ros_speech_recognition sample_ros_speech_recognition.launch
```

Member commented:

We may need to set the `google_cloud_credentials_json:=xxx` arg? I got the following error:

```
[ERROR] [1641453200.346079]: Unexpected error: (<class 'oauth2client.client.ApplicationDefaultCredentialsError'>, ApplicationDefaultCredentialsError('The Application Default Credentials are not available. They are available if running in Google Compute Engine. Otherwise, the environment variable GOOGLE_APPLICATION_CREDENTIALS must be defined pointing to a file defining the credentials. See https://developers.google.com/accounts/docs/application-default-credentials for more information.',), <traceback object at 0x7f980f9a6960>)
```

I think

```bash
roslaunch ros_speech_recognition sample_ros_speech_recognition.launch google_cloud_credentials_json:=xxx.json
```

is more helpful.
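As the credentials error above notes, Application Default Credentials also reads the `GOOGLE_APPLICATION_CREDENTIALS` environment variable, so exporting it is an alternative to the launch argument. A sketch (the path is a placeholder):

```shell
# Alternative (sketch): point the standard Google env var at the
# service-account key file before launching, instead of passing the arg.
export GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json  # placeholder

# then launch as usual:
# roslaunch ros_speech_recognition sample_ros_speech_recognition.launch
```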

2. Echo the message.


```bash
$ rostopic echo /Tablet/voice
transcript:
- may I help you
confidence: [0.9286448955535889]
sentences:
-
header:
seq: 0
stamp:
secs: 1641425262
nsecs: 268165588
frame_id: ''
words:
-
start_time: 0.0
end_time: 0.2
word: "may"
confidence: 0.91376436
speaker_tag: 0
-
start_time: 0.2
end_time: 0.4
word: "I"
confidence: 0.9366196
speaker_tag: 0
-
start_time: 0.4
end_time: 0.5
word: "help"
confidence: 0.9531065
speaker_tag: 0
-
start_time: 0.5
end_time: 0.8
word: "you"
confidence: 0.9110889
speaker_tag: 0
---
transcript:
- pick up the red kettle
confidence: [0.9499567747116089]
sentences:
-
header:
seq: 0
stamp:
secs: 1641425268
nsecs: 58182954
frame_id: ''
words:
-
start_time: 0.0
end_time: 0.4
word: "pick"
confidence: 0.953269
speaker_tag: 0
-
start_time: 0.4
end_time: 0.6
word: "up"
confidence: 0.95326656
speaker_tag: 0
-
start_time: 0.6
end_time: 0.8
word: "the"
confidence: 0.96866167
speaker_tag: 0
-
start_time: 0.8
end_time: 1.1
word: "red"
confidence: 0.98762906
speaker_tag: 0
-
start_time: 1.1
end_time: 1.5
word: "kettle"
confidence: 0.8869578
speaker_tag: 0
```

`word` is the recognized word, and a higher `confidence` indicates a greater estimated likelihood that the recognized word is correct.
`start_time` is the time offset relative to the beginning of the audio (the timestamp in the header), corresponding to the start of the spoken word.
`end_time` is the time offset relative to the beginning of the audio, corresponding to the end of the spoken word.
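The timing fields above can be checked with a small standalone sketch (plain dicts stand in for the ROS `WordInfo` messages; values are copied from the `rostopic echo` output above):

```python
# Sketch: per-word durations from WordInfo-like timing fields.
# Field names match the message shown above; data is from the sample output.
words = [
    {"word": "may",  "start_time": 0.0, "end_time": 0.2, "confidence": 0.91376436},
    {"word": "I",    "start_time": 0.2, "end_time": 0.4, "confidence": 0.9366196},
    {"word": "help", "start_time": 0.4, "end_time": 0.5, "confidence": 0.9531065},
    {"word": "you",  "start_time": 0.5, "end_time": 0.8, "confidence": 0.9110889},
]

for w in words:
    # duration of each spoken word, relative offsets within the audio
    duration = w["end_time"] - w["start_time"]
    print("%-5s %.1f-%.1fs (%.1fs, conf %.2f)"
          % (w["word"], w["start_time"], w["end_time"], duration, w["confidence"]))
```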
Contributor commented:

It would be useful to put these descriptions in the message definition too.



## Interface

### Publishing Topics
@@ -38,6 +138,11 @@ This package uses Python package [SpeechRecognition](https://pypi.python.org/pyp

Action client to play sound on events. If the action server is not available or `~enable_sound_effect` is `False`, no sound is played.


* `/Tablet/voice` (`speech_recognition_msgs/SpeechRecognitionCandidates`)

Publishes recognized results when `~continuous` is `True`.

### Subscribing Topics

* `audio` (`audio_common_msgs/AudioData`)
6 changes: 6 additions & 0 deletions ros_speech_recognition/launch/speech_recognition.launch
@@ -11,6 +11,8 @@
<arg name="engine" default="Google" doc="Speech to text engine: Google, GoogleCloud, Sphinx, Wit, Bing, Houndify, IBM" />
<arg name="language" default="en-US" doc="Speech to text language. For Japanese, set ja-JP." />
<arg name="continuous" default="false" doc="If false, /speech_recognition service is published. If true, /Tablet/voice topic is published." />
<arg name="google_cloud_credentials_json" default="''" doc="Credential JSON is only used when the engine is GoogleCloud." />
<arg name="enable_speaker_diarization" default="false" doc="Enable speaker diarization option when the engine is GoogleCloud." />

<!-- sound play -->
<node name="sound_play" pkg="sound_play" type="soundplay_node.py"
@@ -35,6 +37,7 @@
<node name="speech_recognition"
pkg="ros_speech_recognition" type="speech_recognition_node.py"
respawn="true"
clear_params="true"
output="screen">
<rosparam subst_value="true">
audio_topic: $(arg audio_topic)
@@ -46,6 +49,9 @@
language: $(arg language)
continuous: $(arg continuous)
enable_sound_effect: $(arg launch_sound_play)
google_cloud_credentials_json: $(arg google_cloud_credentials_json)
diarizationConfig:
enableSpeakerDiarization: $(arg enable_speaker_diarization)
</rosparam>
</node>

Binary file not shown.
23 changes: 23 additions & 0 deletions ros_speech_recognition/sample/sample_ros_speech_recognition.launch
@@ -0,0 +1,23 @@
<launch>

<arg name="google_cloud_credentials_json" default="''" doc="Credential JSON is only used when the engine is GoogleCloud." />
<arg name="engine" default="GoogleCloud" doc="Speech to text engine: Google, GoogleCloud, Sphinx, Wit, Bing, Houndify, IBM" />
Member commented:

In my understanding, GoogleCloud needs credentials (it costs money).
I think the default engine should be Google so that we can try this ROS package for free.

If you think GoogleCloud should be the default for ros_speech_recognition, I think it's OK to keep this change.

<arg name="launch_sound_play" default="false" doc="Launch sound_play node to speak" />

<!-- launch bag file -->
<node name="rosbag_play"
pkg="rosbag" type="play"
args="$(find ros_speech_recognition)/sample/data/may_i_help_you.bag --loop">
</node>

<include file="$(find ros_speech_recognition)/launch/speech_recognition.launch">
<arg name="launch_sound_play" value="$(arg launch_sound_play)" />
<arg name="engine" value="$(arg engine)" />
<arg name="launch_audio_capture" value="false" />
<arg name="audio_topic" value="/speech_audio" />
<arg name="language" value="en-US" />
<arg name="continuous" value="true" />
<arg name="google_cloud_credentials_json" value="$(arg google_cloud_credentials_json)" />
</include>

</launch>
87 changes: 80 additions & 7 deletions ros_speech_recognition/scripts/speech_recognition_node.py
@@ -8,13 +8,18 @@
from ros_speech_recognition.recognize_google_cloud import RecognizerEx
import json
import array
import os
import sys
from threading import Lock

import numpy as np
import std_msgs.msg
from audio_common_msgs.msg import AudioData
from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal

from speech_recognition_msgs.msg import SentenceInfo
from speech_recognition_msgs.msg import SpeechRecognitionCandidates
from speech_recognition_msgs.msg import WordInfo
from speech_recognition_msgs.srv import SpeechRecognition
from speech_recognition_msgs.srv import SpeechRecognitionResponse
from std_srvs.srv import Empty
@@ -219,22 +224,32 @@ def play_sound(self, key, timeout=5.0):

def recognize(self, audio):
recog_func = None
self.enable_diarization = False
if self.engine == Config.SpeechRecognition_Google:
if not self.args:
self.args = {'key': rospy.get_param("~google_key", None)}
recog_func = self.recognizer.recognize_google
elif self.engine == Config.SpeechRecognition_GoogleCloud:
if not self.args:
credentials_path = rospy.get_param("~google_cloud_credentials_json", None)
if credentials_path is not None:
if credentials_path is not None and len(credentials_path) > 0:
if os.path.exists(credentials_path) is False:
rospy.logerr(
'google_cloud_credentials_json '
'{} not exists.'.format(credentials_path))
sys.exit(1)
with open(credentials_path) as j:
credentials_json = j.read()
else:
credentials_json = None
self.args = {'credentials_json': credentials_json,
'preferred_phrases': rospy.get_param('~google_cloud_preferred_phrases', None)}
'preferred_phrases': rospy.get_param('~google_cloud_preferred_phrases', None),
'show_all': True}
if rospy.has_param('~diarizationConfig') :
self.args.update({'user_config': {'diarizationConfig': rospy.get_param('~diarizationConfig') }})
diarizationConfig = rospy.get_param('~diarizationConfig')
self.args.update({'user_config': {'diarizationConfig': diarizationConfig}})
self.enable_diarization = diarizationConfig.get(
'enableSpeakerDiarization', False)
recog_func = self.recognizer.recognize_google_cloud
elif self.engine == Config.SpeechRecognition_Sphinx:
recog_func = self.recognizer.recognize_sphinx
@@ -251,17 +266,72 @@ def recognize(self, audio):

return recog_func(audio_data=audio, language=self.language, **self.args)

def make_result_message_from_result(self, result, header=None):
if header is None:
header = std_msgs.msg.Header(stamp=rospy.Time.now())
if self.engine == Config.SpeechRecognition_GoogleCloud:
if "results" not in result or len(result["results"]) == 0:
raise SR.UnknownValueError()
transcript = []
confidence = []
sentences = []
for res in result["results"]:
sent_info_msg = SentenceInfo(header=header)
if self.enable_diarization is False:
transcript.append(
res["alternatives"][0]["transcript"].strip())
confidence.append(res["alternatives"][0]['confidence'])
prev_speaker = None
trans = ''
confs = []
for word in res["alternatives"][0]['words']:
speaker = word.get('speakerTag', 0)
conf = word.get('confidence', 0.0)
# for more details, please see
# https://cloud.google.com/speech-to-text/docs/reference/rest/v1/speech/recognize#wordinfo
word_info_msg = WordInfo(
start_time=float(word.get(
'startTime', '0.0s').rstrip('s')),
end_time=float(word.get(
'endTime', '0.0s').rstrip('s')),
word=word.get('word', ''),
confidence=conf,
speaker_tag=speaker)
sent_info_msg.words.append(word_info_msg)
if self.enable_diarization is True \
and prev_speaker != speaker:
trans += "[{}]".format(speaker)
prev_speaker = speaker
trans += ' ' + word['word']
confs.append(conf)
if self.enable_diarization is True:
transcript.append(trans)
confidence.append(np.mean(confs))
sentences.append(sent_info_msg)
msg = SpeechRecognitionCandidates(
transcript=transcript,
confidence=confidence,
sentences=sentences)
transcript = " ".join(transcript)
else:
transcript = result
msg = SpeechRecognitionCandidates(
transcript=[transcript])
return msg, transcript
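For reference, a minimal standalone sketch of the duration-string conversion used in `make_result_message_from_result` above (the `parse_offset` helper name is mine, not part of the PR):

```python
# Sketch: Google Cloud returns word offsets as duration strings like "0.4s".
# The node strips the trailing "s" and converts to float, defaulting to 0.0.
def parse_offset(word, key):
    """Return the time offset in seconds for `key` ('startTime'/'endTime')."""
    return float(word.get(key, '0.0s').rstrip('s'))

word = {"word": "kettle", "startTime": "1.1s", "endTime": "1.5s"}
start = parse_offset(word, "startTime")
end = parse_offset(word, "endTime")
print(start, end)  # 1.1 1.5
```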

def audio_cb(self, _, audio):
if not self.enable_audio_cb:
return
try:
rospy.logdebug("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data()))
header = std_msgs.msg.Header(stamp=rospy.Time.now())
result = self.recognize(audio)
self.play_sound("recognized", 0.05)
rospy.loginfo("Result: %s" % result.encode('utf-8'))
self.play_sound("success", 0.1)
msg = SpeechRecognitionCandidates(transcript=[result])
msg, transcript = self.make_result_message_from_result(
result, header=header)
rospy.loginfo("Result: %s" % transcript)
self.pub.publish(msg)
self.play_sound("success", 0.1)
return
except SR.UnknownValueError as e:
if self.dynamic_energy_threshold:
@@ -322,11 +392,14 @@ def speech_recognition_srv_cb(self, req):
rospy.loginfo("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data()))

try:
header = std_msgs.msg.Header(stamp=rospy.Time.now())
Contributor commented:

I think this timestamp may be misleading (it is not the start time of the original audio data, but the time of speech recognition).

result = self.recognize(audio)
rospy.loginfo("Result: %s" % result.encode('utf-8'))
if not req.quiet:
self.play_sound("success", 0.1)
res.result = SpeechRecognitionCandidates(transcript=[result])
msg, _ = self.make_result_message_from_result(
result, header=header)
res.result = msg
return res
except SR.UnknownValueError:
if self.dynamic_energy_threshold:
@@ -74,6 +74,7 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en
speech_config["speechContext"] = {"phrases": preferred_phrases}
if show_all:
speech_config["enableWordTimeOffsets"] = True # some useful extra options for when we want all the output
speech_config["enable_word_confidence"] = True
request = speech_service.speech().recognize(body={"audio": {"content": base64.b64encode(flac_data).decode("utf8")}, "config": speech_config})

try:
@@ -1,4 +1,8 @@
<launch>

<arg name="google_cloud_credentials_json" default="''" doc="Credential JSON is only used when the engine is GoogleCloud." />
<arg name="engine" default="Google" doc="Speech to text engine: Google, GoogleCloud, Sphinx, Wit, Bing, Houndify, IBM" />

<!-- launch speech recognition -->

<arg name="launch_sound_play" default="false" doc="Launch sound_play node to speak" />
@@ -9,10 +13,12 @@
</rosparam>
<include file="$(find ros_speech_recognition)/launch/speech_recognition.launch">
Member commented:

How about integrating the content of this test into sample_ros_speech_recognition.launch?
Or how about moving the content of this test to sample_ros_speech_recognition_ja.launch?

<arg name="launch_sound_play" value="$(arg launch_sound_play)" />
<arg name="engine" value="$(arg engine)" />
<arg name="launch_audio_capture" value="false" />
<arg name="audio_topic" value="/speech_audio" />
<arg name="language" value="ja" />
<arg name="continuous" value="true" />
<arg name="google_cloud_credentials_json" value="$(arg google_cloud_credentials_json)" />
</include>

<!-- launch bag file -->