feat: support for custom phrase aggregator

csdcorp · Oct 3, 2024 · 685d931 · 685d931
1 parent eb3e824
commit 685d931
Show file tree

Hide file tree

Showing 15 changed files with 225 additions and 87 deletions.
diff --git a/speech_to_text/darwin/Classes/SpeechToTextPlugin.swift b/speech_to_text/darwin/Classes/SpeechToTextPlugin.swift
@@ -53,6 +53,7 @@ public enum ListenMode: Int {
 
 struct SpeechRecognitionWords: Codable {
   let recognizedWords: String
+  let recognizedPhrases: [String]?
   let confidence: Decimal
 }
 
@@ -889,8 +890,10 @@ private class SpeechResultAggregator {
         if hasPreviousTranscriptions {
             var lowestConfidence: Decimal = 1.0
             var aggregatePhrase = ""
+            var recognizedPhrases: [String] = []
             for previousTranscription in previousTranscriptions {
                 if let transcription = previousTranscription.first {
+                    recognizedPhrases.append(transcription.formattedString)
                     lowestConfidence = min( lowestConfidence, confidenceIn(transcription))
                     if aggregatePhrase.count > 0 && aggregatePhrase.last != " " {
                         aggregatePhrase += " "
@@ -899,17 +902,18 @@ private class SpeechResultAggregator {
                 }
             }
             if let transcription = speechTranscriptions.first {
+                recognizedPhrases.append(transcription.formattedString)
                 lowestConfidence = min( lowestConfidence, confidenceIn(transcription))
                 if aggregatePhrase.count > 0 && aggregatePhrase.last != " " {
                     aggregatePhrase += " "
                 }
                 aggregatePhrase += transcription.formattedString
             }
-            speechWords.append(SpeechRecognitionWords(recognizedWords: aggregatePhrase, confidence: lowestConfidence))
+            speechWords.append(SpeechRecognitionWords(recognizedWords: aggregatePhrase, recognizedPhrases: recognizedPhrases, confidence: lowestConfidence))
         }
         for transcription in speechTranscriptions {
             let words: SpeechRecognitionWords = SpeechRecognitionWords(
-                recognizedWords: transcription.formattedString, confidence: confidenceIn(transcription))
+                recognizedWords: transcription.formattedString, recognizedPhrases: nil, confidence: confidenceIn(transcription))
             speechWords.append(words)
         }
         return SpeechRecognitionResult(alternates: speechWords, finalResult: isFinal )

diff --git a/speech_to_text/example/lib/main.dart b/speech_to_text/example/lib/main.dart
@@ -54,6 +54,7 @@ class _SpeechSampleAppState extends State<SpeechSampleApp> {
         debugLogging: _logEvents,
       );
       if (hasSpeech) {
+        speech.unexpectedPhraseAggregator = _punctAggregator;
         // Get the list of languages installed on the supporting platform so they
         // can be displayed in the UI for selection by the user.
         _localeNames = await speech.locales();
@@ -74,6 +75,10 @@ class _SpeechSampleAppState extends State<SpeechSampleApp> {
     }
   }
 
+  String _punctAggregator(List<String> phrases) {
+    return phrases.join('. ');
+  }
+
   @override
   Widget build(BuildContext context) {
     return MaterialApp(
@@ -128,7 +133,7 @@ class _SpeechSampleAppState extends State<SpeechSampleApp> {
         listenMode: ListenMode.confirmation,
         cancelOnError: true,
         partialResults: true,
-        autoPunctuation: true,
+        autoPunctuation: false,
         enableHapticFeedback: true);
     // Note that `listenFor` is the maximum, not the minimum, on some
     // systems recognition will be stopped before this value is reached.

diff --git a/speech_to_text/example/pubspec.lock b/speech_to_text/example/pubspec.lock
@@ -58,10 +58,10 @@ packages:
     dependency: "direct dev"
     description:
       name: flutter_lints
-      sha256: e2a421b7e59244faef694ba7b30562e489c2b489866e505074eb005cd7060db7
+      sha256: "9e8c3858111da373efc5aa341de011d9bd23e2c5c5e0c62bccf32438e192d7b1"
       url: "https://pub.dev"
     source: hosted
-    version: "3.0.1"
+    version: "3.0.2"
   flutter_test:
     dependency: "direct dev"
     description: flutter
@@ -76,10 +76,10 @@ packages:
     dependency: transitive
     description:
       name: json_annotation
-      sha256: b10a7b2ff83d83c777edba3c6a0f97045ddadd56c944e1a23a3fdf43a1bf4467
+      sha256: "1ce844379ca14835a50d2f019a3099f419082cfdd231cd86a142af94dd5c6bb1"
       url: "https://pub.dev"
     source: hosted
-    version: "4.8.1"
+    version: "4.9.0"
   leak_tracker:
     dependency: transitive
     description:
@@ -172,10 +172,10 @@ packages:
     dependency: "direct main"
     description:
       name: provider
-      sha256: "9a96a0a19b594dbc5bf0f1f27d2bc67d5f95957359b461cd9feb44ed6ae75096"
+      sha256: c8a055ee5ce3fd98d6fc872478b03823ffdb448699c6ebdbbc71d59b596fd48c
       url: "https://pub.dev"
     source: hosted
-    version: "6.1.1"
+    version: "6.1.2"
   sky_engine:
     dependency: transitive
     description: flutter
@@ -195,7 +195,7 @@ packages:
       path: ".."
       relative: true
     source: path
-    version: "7.0.0"
+    version: "7.1.0-beta.1"
   speech_to_text_platform_interface:
     dependency: transitive
     description:
@@ -264,10 +264,10 @@ packages:
     dependency: transitive
     description:
       name: web
-      sha256: d43c1d6b787bf0afad444700ae7f4db8827f701bc61c255ac8d328c6f4d52062
+      sha256: cd3543bd5798f6ad290ea73d210f423502e71900302dde696f8bff84bf89a1cb
       url: "https://pub.dev"
     source: hosted
-    version: "1.0.0"
+    version: "1.1.0"
 sdks:
   dart: ">=3.4.0 <4.0.0"
   flutter: ">=3.18.0-18.0.pre.54"
diff --git a/speech_to_text/lib/balanced_alternates.dart b/speech_to_text/lib/balanced_alternates.dart
@@ -24,7 +24,7 @@ class BalancedAlternates {
   ///
   void add(int phrase, String words, double confidence) {
     _alternates[phrase] ??= [];
-    _alternates[phrase]?.add(SpeechRecognitionWords(words, confidence));
+    _alternates[phrase]?.add(SpeechRecognitionWords(words, null, confidence));
   }
 
   /// Return the full speech recognition results which is the concatenation
@@ -62,8 +62,8 @@ class BalancedAlternates {
           alternateConfidence = min(alternateConfidence,
               _alternates[phraseIndex]![altCount].confidence);
         }
-        result
-            .add(SpeechRecognitionWords(alternatePhrase, alternateConfidence));
+        result.add(
+            SpeechRecognitionWords(alternatePhrase, null, alternateConfidence));
       }
     } else {
       for (var phraseIndex = phraseCount - 1; phraseIndex >= 0; --phraseIndex) {

diff --git a/speech_to_text/lib/speech_recognition_error.g.dart b/speech_to_text/lib/speech_recognition_error.g.dart
diff --git a/speech_to_text/lib/speech_recognition_result.dart b/speech_to_text/lib/speech_recognition_result.dart
@@ -96,6 +96,18 @@ class SpeechRecognitionWords {
   /// The sequence of words recognized
   final String recognizedWords;
 
+  /// If the platform provides it, a list of phrases that were recognized
+  /// as individual utterances. This can generally be ignored, as it
+  /// is usually null and, where it is not, [recognizedWords] will contain
+  /// the same information aggregated into a single string.
+  /// Currently this is only populated on iOS 17.5 and 18, where a bug in
+  /// the speech recognizer causes unexpected extra phrases. These are
+  /// automatically handled by the plugin, and [recognizedWords] will be
+  /// an aggregate of all the phrases. To change how these phrases
+  /// are combined, set the [SpeechToText.unexpectedPhraseAggregator]
+  /// property.
+  final List<String>? recognizedPhrases;
+
   /// The confidence that the [recognizedWords] are correct.
   ///
   /// Confidence is expressed as a value between 0 and 1. 0
@@ -106,7 +118,8 @@ class SpeechRecognitionWords {
   static const double confidenceThreshold = 0.8;
   static const double missingConfidence = -1;
 
-  const SpeechRecognitionWords(this.recognizedWords, this.confidence);
+  const SpeechRecognitionWords(
+      this.recognizedWords, this.recognizedPhrases, this.confidence);
 
   /// true if there is confidence in this recognition, false otherwise.
   ///

diff --git a/speech_to_text/lib/speech_recognition_result.g.dart b/speech_to_text/lib/speech_recognition_result.g.dart
diff --git a/speech_to_text/lib/speech_to_text.dart b/speech_to_text/lib/speech_to_text.dart
@@ -71,6 +71,14 @@ typedef SpeechErrorListener = void Function(
 /// See the [onStatus] argument on the [SpeechToText.initialize] method for use.
 typedef SpeechStatusListener = void Function(String status);
 
+/// Aggregates multiple phrases into a single result. This is used when
+/// the platform returns multiple phrases for a single utterance. The default
+/// behaviour is to concatenate the phrases into a single result with a space
+/// separating the phrases and no change to capitalization. This can
+/// be overridden to provide a different aggregation strategy.
+/// See [_defaultPhraseAggregator] for the default implementation.
+typedef SpeechPhraseAggregator = String Function(List<String> phrases);
+
 /// Notified when the sound level changes during a listen method.
 ///
 /// [level] is a measure of the decibels of the current sound on
@@ -196,6 +204,13 @@ class SpeechToText {
   SpeechStatusListener? statusListener;
   SpeechSoundLevelChange? _soundLevelChange;
 
+  /// Overrides the default phrase aggregator to allow for
+  /// different strategies for aggregating multiple phrases into
+  /// a single result; when null the default is used. It applies when the
+  /// platform unexpectedly returns multiple phrases for a single utterance.
+  /// Currently this happens only due to a bug in iOS 17.5 and 18.
+  SpeechPhraseAggregator? unexpectedPhraseAggregator;
+
   factory SpeechToText() => _instance;
 
   @visibleForTesting
@@ -615,9 +630,29 @@ class SpeechToText {
     // print('onTextRecognition');
     Map<String, dynamic> resultMap = jsonDecode(resultJson);
     var speechResult = SpeechRecognitionResult.fromJson(resultMap);
+    speechResult = _checkAggregates(speechResult);
     _notifyResults(speechResult);
   }
 
+  /// Checks the result for multiple phrases and aggregates them if needed.
+  /// Returns a new result with the aggregated phrases.
+  SpeechRecognitionResult _checkAggregates(SpeechRecognitionResult result) {
+    var alternates = <SpeechRecognitionWords>[];
+    for (var alternate in result.alternates) {
+      if (alternate.recognizedPhrases != null) {
+        final aggregated = (unexpectedPhraseAggregator ??
+            _defaultPhraseAggregator)(alternate.recognizedPhrases!);
+        final aggregatedWords = SpeechRecognitionWords(
+            aggregated, alternate.recognizedPhrases, alternate.confidence);
+        alternates.add(aggregatedWords);
+      } else {
+        alternates.add(alternate);
+      }
+      // print('  ${alternate.recognizedWords} ${alternate.confidence}');
+    }
+    return SpeechRecognitionResult(alternates, result.finalResult);
+  }
+
   void _onFinalTimeout() {
     // print('onFinalTimeout $_finalTimeout');
     if (_notifiedFinal) return;
@@ -715,6 +750,10 @@ class SpeechToText {
     _notifyFinalTimer = null;
     _listenTimer = null;
   }
+
+  String _defaultPhraseAggregator(List<String> phrases) {
+    return phrases.join(' ');
+  }
 }
 
 /// Thrown when a method is called that requires successful