Skip to content

Commit

Permalink
feat: support for custom phrase aggregator
Browse files Browse the repository at this point in the history
  • Loading branch information
sowens-csd committed Oct 3, 2024
1 parent eb3e824 commit 685d931
Show file tree
Hide file tree
Showing 15 changed files with 225 additions and 87 deletions.
8 changes: 6 additions & 2 deletions speech_to_text/darwin/Classes/SpeechToTextPlugin.swift
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ public enum ListenMode: Int {

/// One recognition alternative: the recognized text, the optional list of
/// phrases it was built from, and a confidence value. Declared `Codable` so it
/// can be encoded and decoded.
struct SpeechRecognitionWords: Codable {
/// The recognized text for this alternative.
let recognizedWords: String
/// The individual phrases that were joined to form `recognizedWords` when the
/// alternative is an aggregate of several transcriptions; `nil` for an
/// alternative built from a single transcription.
let recognizedPhrases: [String]?
/// Confidence for `recognizedWords`. For an aggregated alternative the
/// aggregator passes the lowest confidence among the combined transcriptions.
let confidence: Decimal
}

Expand Down Expand Up @@ -889,8 +890,10 @@ private class SpeechResultAggregator {
if hasPreviousTranscriptions {
var lowestConfidence: Decimal = 1.0
var aggregatePhrase = ""
var recognizedPhrases: [String] = []
for previousTranscription in previousTranscriptions {
if let transcription = previousTranscription.first {
recognizedPhrases.append(transcription.formattedString)
lowestConfidence = min( lowestConfidence, confidenceIn(transcription))
if aggregatePhrase.count > 0 && aggregatePhrase.last != " " {
aggregatePhrase += " "
Expand All @@ -899,17 +902,18 @@ private class SpeechResultAggregator {
}
}
if let transcription = speechTranscriptions.first {
recognizedPhrases.append(transcription.formattedString)
lowestConfidence = min( lowestConfidence, confidenceIn(transcription))
if aggregatePhrase.count > 0 && aggregatePhrase.last != " " {
aggregatePhrase += " "
}
aggregatePhrase += transcription.formattedString
}
speechWords.append(SpeechRecognitionWords(recognizedWords: aggregatePhrase, confidence: lowestConfidence))
speechWords.append(SpeechRecognitionWords(recognizedWords: aggregatePhrase, recognizedPhrases: recognizedPhrases, confidence: lowestConfidence))
}
for transcription in speechTranscriptions {
let words: SpeechRecognitionWords = SpeechRecognitionWords(
recognizedWords: transcription.formattedString, confidence: confidenceIn(transcription))
recognizedWords: transcription.formattedString, recognizedPhrases: nil, confidence: confidenceIn(transcription))
speechWords.append(words)
}
return SpeechRecognitionResult(alternates: speechWords, finalResult: isFinal )
Expand Down
7 changes: 6 additions & 1 deletion speech_to_text/example/lib/main.dart
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class _SpeechSampleAppState extends State<SpeechSampleApp> {
debugLogging: _logEvents,
);
if (hasSpeech) {
speech.unexpectedPhraseAggregator = _punctAggregator;
// Get the list of languages installed on the supporting platform so they
// can be displayed in the UI for selection by the user.
_localeNames = await speech.locales();
Expand All @@ -74,6 +75,10 @@ class _SpeechSampleAppState extends State<SpeechSampleApp> {
}
}

/// Example phrase aggregator: joins the recognized [phrases] into a single
/// string, separating consecutive phrases with a period and a space.
String _punctAggregator(List<String> phrases) => phrases.join('. ');

@override
Widget build(BuildContext context) {
return MaterialApp(
Expand Down Expand Up @@ -128,7 +133,7 @@ class _SpeechSampleAppState extends State<SpeechSampleApp> {
listenMode: ListenMode.confirmation,
cancelOnError: true,
partialResults: true,
autoPunctuation: true,
autoPunctuation: false,
enableHapticFeedback: true);
// Note that `listenFor` is the maximum, not the minimum, on some
// systems recognition will be stopped before this value is reached.
Expand Down
18 changes: 9 additions & 9 deletions speech_to_text/example/pubspec.lock
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@ packages:
dependency: "direct dev"
description:
name: flutter_lints
sha256: e2a421b7e59244faef694ba7b30562e489c2b489866e505074eb005cd7060db7
sha256: "9e8c3858111da373efc5aa341de011d9bd23e2c5c5e0c62bccf32438e192d7b1"
url: "https://pub.dev"
source: hosted
version: "3.0.1"
version: "3.0.2"
flutter_test:
dependency: "direct dev"
description: flutter
Expand All @@ -76,10 +76,10 @@ packages:
dependency: transitive
description:
name: json_annotation
sha256: b10a7b2ff83d83c777edba3c6a0f97045ddadd56c944e1a23a3fdf43a1bf4467
sha256: "1ce844379ca14835a50d2f019a3099f419082cfdd231cd86a142af94dd5c6bb1"
url: "https://pub.dev"
source: hosted
version: "4.8.1"
version: "4.9.0"
leak_tracker:
dependency: transitive
description:
Expand Down Expand Up @@ -172,10 +172,10 @@ packages:
dependency: "direct main"
description:
name: provider
sha256: "9a96a0a19b594dbc5bf0f1f27d2bc67d5f95957359b461cd9feb44ed6ae75096"
sha256: c8a055ee5ce3fd98d6fc872478b03823ffdb448699c6ebdbbc71d59b596fd48c
url: "https://pub.dev"
source: hosted
version: "6.1.1"
version: "6.1.2"
sky_engine:
dependency: transitive
description: flutter
Expand All @@ -195,7 +195,7 @@ packages:
path: ".."
relative: true
source: path
version: "7.0.0"
version: "7.1.0-beta.1"
speech_to_text_platform_interface:
dependency: transitive
description:
Expand Down Expand Up @@ -264,10 +264,10 @@ packages:
dependency: transitive
description:
name: web
sha256: d43c1d6b787bf0afad444700ae7f4db8827f701bc61c255ac8d328c6f4d52062
sha256: cd3543bd5798f6ad290ea73d210f423502e71900302dde696f8bff84bf89a1cb
url: "https://pub.dev"
source: hosted
version: "1.0.0"
version: "1.1.0"
sdks:
dart: ">=3.4.0 <4.0.0"
flutter: ">=3.18.0-18.0.pre.54"
6 changes: 3 additions & 3 deletions speech_to_text/lib/balanced_alternates.dart
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class BalancedAlternates {
///
void add(int phrase, String words, double confidence) {
_alternates[phrase] ??= [];
_alternates[phrase]?.add(SpeechRecognitionWords(words, confidence));
_alternates[phrase]?.add(SpeechRecognitionWords(words, null, confidence));
}

/// Return the full speech recognition results which is the concatenation
Expand Down Expand Up @@ -62,8 +62,8 @@ class BalancedAlternates {
alternateConfidence = min(alternateConfidence,
_alternates[phraseIndex]![altCount].confidence);
}
result
.add(SpeechRecognitionWords(alternatePhrase, alternateConfidence));
result.add(
SpeechRecognitionWords(alternatePhrase, null, alternateConfidence));
}
} else {
for (var phraseIndex = phraseCount - 1; phraseIndex >= 0; --phraseIndex) {
Expand Down
11 changes: 5 additions & 6 deletions speech_to_text/lib/speech_recognition_error.g.dart

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 14 additions & 1 deletion speech_to_text/lib/speech_recognition_result.dart
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,18 @@ class SpeechRecognitionWords {
/// The sequence of words recognized
final String recognizedWords;

/// If the platform provides it, a list of phrases that were recognized
/// as individual utterances. This can generally be ignored: it is
/// usually null, and where it is not, [recognizedWords] contains
/// the same information aggregated into a single string.
/// Currently this is only populated on iOS 17.5 and 18, where a bug in
/// the speech recognizer causes unexpected extra phrases. These are
/// automatically handled by the plugin, and [recognizedWords] will be
/// an aggregate of all the phrases. To customize how these phrases
/// are aggregated, set the [SpeechToText.unexpectedPhraseAggregator]
/// property.
final List<String>? recognizedPhrases;

/// The confidence that the [recognizedWords] are correct.
///
/// Confidence is expressed as a value between 0 and 1. 0
Expand All @@ -106,7 +118,8 @@ class SpeechRecognitionWords {
static const double confidenceThreshold = 0.8;
static const double missingConfidence = -1;

const SpeechRecognitionWords(this.recognizedWords, this.confidence);
const SpeechRecognitionWords(
this.recognizedWords, this.recognizedPhrases, this.confidence);

/// true if there is confidence in this recognition, false otherwise.
///
Expand Down
31 changes: 17 additions & 14 deletions speech_to_text/lib/speech_recognition_result.g.dart

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 39 additions & 0 deletions speech_to_text/lib/speech_to_text.dart
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,14 @@ typedef SpeechErrorListener = void Function(
/// See the [onStatus] argument on the [SpeechToText.initialize] method for use.
typedef SpeechStatusListener = void Function(String status);

/// Aggregates multiple phrases into a single result. This is used when
/// the platform returns multiple phrases for a single utterance. The default
/// behaviour is to concatenate the phrases into a single result with spaces
/// separating the phrases and no change to capitalization. This can
/// be overridden to provide a different aggregation strategy.
/// See [_defaultPhraseAggregator] for the default implementation.
typedef SpeechPhraseAggregator = String Function(List<String> phrases);

/// Notified when the sound level changes during a listen method.
///
/// [level] is a measure of the decibels of the current sound on
Expand Down Expand Up @@ -196,6 +204,13 @@ class SpeechToText {
SpeechStatusListener? statusListener;
SpeechSoundLevelChange? _soundLevelChange;

/// This overrides the default phrase aggregator to allow for
/// different strategies for aggregating multiple phrases into
/// a single result. This is used when the platform unexpectedly
/// returns multiple phrases for a single utterance. Currently
/// this happens only due to a bug in iOS 17.5 and 18.
SpeechPhraseAggregator? unexpectedPhraseAggregator;

factory SpeechToText() => _instance;

@visibleForTesting
Expand Down Expand Up @@ -615,9 +630,29 @@ class SpeechToText {
// print('onTextRecognition');
Map<String, dynamic> resultMap = jsonDecode(resultJson);
var speechResult = SpeechRecognitionResult.fromJson(resultMap);
speechResult = _checkAggregates(speechResult);
_notifyResults(speechResult);
}

/// Rebuilds [result] so that every alternate carrying a non-null
/// `recognizedPhrases` list has its words replaced by the aggregation of
/// those phrases. The aggregation uses [unexpectedPhraseAggregator] when one
/// is set and [_defaultPhraseAggregator] otherwise. Alternates without
/// phrases are passed through unchanged.
///
/// Returns a new [SpeechRecognitionResult] with the same `finalResult` flag.
SpeechRecognitionResult _checkAggregates(SpeechRecognitionResult result) {
  final rebuilt = result.alternates.map<SpeechRecognitionWords>((candidate) {
    final phrases = candidate.recognizedPhrases;
    if (phrases == null) {
      // Nothing to aggregate; keep the platform-provided alternate as is.
      return candidate;
    }
    // Resolve the aggregator per alternate so the currently configured
    // aggregator is always the one applied.
    final aggregate = unexpectedPhraseAggregator ?? _defaultPhraseAggregator;
    return SpeechRecognitionWords(
        aggregate(phrases), phrases, candidate.confidence);
  }).toList();
  return SpeechRecognitionResult(rebuilt, result.finalResult);
}

void _onFinalTimeout() {
// print('onFinalTimeout $_finalTimeout');
if (_notifiedFinal) return;
Expand Down Expand Up @@ -715,6 +750,10 @@ class SpeechToText {
_notifyFinalTimer = null;
_listenTimer = null;
}

/// Default aggregation strategy: joins the [phrases] with a single space
/// between consecutive phrases, leaving each phrase's text untouched.
String _defaultPhraseAggregator(List<String> phrases) => phrases.join(' ');
}

/// Thrown when a method is called that requires successful
Expand Down
Loading

0 comments on commit 685d931

Please sign in to comment.