From a39903087a0ea996342c53133c4a954510afbde1 Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Tue, 19 Nov 2024 16:23:50 +0100 Subject: [PATCH 01/17] Reimplement part of speech to text Enabled the c++ part and added the base UI. Actual video and subtitles showing etc needs to be implemented --- lib/interop/generated_bindings.dart | 71 +++++++++++ lib/interop/speech_to_text.dart | 109 ++++++++-------- .../computer_vision/batch_inference.dart | 2 +- lib/pages/computer_vision/live_inference.dart | 2 +- .../widgets/model_properties.dart | 107 ++++++++-------- lib/pages/models/inference.dart | 3 +- lib/pages/transcription/playground.dart | 101 +++++++++++++++ .../providers/speech_inference_provider.dart | 88 +++++++++++++ lib/pages/transcription/transcription.dart | 120 ++++++++++++++++++ lib/pages/transcription/utils/section.dart | 98 ++++++++++++++ lib/utils/drop_area.dart | 84 ++++++------ macos/Runner.xcodeproj/project.pbxproj | 30 +++++ openvino_bindings/src/BUILD | 1 + openvino_bindings/src/bindings.cc | 86 ++++++------- openvino_bindings/src/bindings.h | 8 +- 15 files changed, 706 insertions(+), 204 deletions(-) create mode 100644 lib/pages/transcription/playground.dart create mode 100644 lib/pages/transcription/providers/speech_inference_provider.dart create mode 100644 lib/pages/transcription/transcription.dart create mode 100644 lib/pages/transcription/utils/section.dart diff --git a/lib/interop/generated_bindings.dart b/lib/interop/generated_bindings.dart index f37fd89..ea541a8 100644 --- a/lib/interop/generated_bindings.dart +++ b/lib/interop/generated_bindings.dart @@ -569,6 +569,77 @@ class OpenVINO { late final _graphRunnerStop = _graphRunnerStopPtr .asFunction Function(CGraphRunner)>(); + ffi.Pointer speechToTextOpen( + ffi.Pointer model_path, + ffi.Pointer device, + ) { + return _speechToTextOpen( + model_path, + device, + ); + } + + late final _speechToTextOpenPtr = _lookup< + ffi.NativeFunction< + ffi.Pointer Function(ffi.Pointer, + 
ffi.Pointer)>>('speechToTextOpen'); + late final _speechToTextOpen = _speechToTextOpenPtr.asFunction< + ffi.Pointer Function( + ffi.Pointer, ffi.Pointer)>(); + + ffi.Pointer speechToTextLoadVideo( + CSpeechToText instance, + ffi.Pointer video_path, + ) { + return _speechToTextLoadVideo( + instance, + video_path, + ); + } + + late final _speechToTextLoadVideoPtr = _lookup< + ffi.NativeFunction< + ffi.Pointer Function(CSpeechToText, + ffi.Pointer)>>('speechToTextLoadVideo'); + late final _speechToTextLoadVideo = _speechToTextLoadVideoPtr.asFunction< + ffi.Pointer Function(CSpeechToText, ffi.Pointer)>(); + + ffi.Pointer speechToTextVideoDuration( + CSpeechToText instance, + ) { + return _speechToTextVideoDuration( + instance, + ); + } + + late final _speechToTextVideoDurationPtr = _lookup< + ffi.NativeFunction Function(CSpeechToText)>>( + 'speechToTextVideoDuration'); + late final _speechToTextVideoDuration = _speechToTextVideoDurationPtr + .asFunction Function(CSpeechToText)>(); + + ffi.Pointer speechToTextTranscribe( + CSpeechToText instance, + int start, + int duration, + ffi.Pointer language, + ) { + return _speechToTextTranscribe( + instance, + start, + duration, + language, + ); + } + + late final _speechToTextTranscribePtr = _lookup< + ffi.NativeFunction< + ffi.Pointer Function(CSpeechToText, ffi.Int, + ffi.Int, ffi.Pointer)>>('speechToTextTranscribe'); + late final _speechToTextTranscribe = _speechToTextTranscribePtr.asFunction< + ffi.Pointer Function( + CSpeechToText, int, int, ffi.Pointer)>(); + ffi.Pointer getAvailableDevices() { return _getAvailableDevices(); } diff --git a/lib/interop/speech_to_text.dart b/lib/interop/speech_to_text.dart index c8635ae..b81ed02 100644 --- a/lib/interop/speech_to_text.dart +++ b/lib/interop/speech_to_text.dart @@ -14,67 +14,64 @@ class SpeechToText { SpeechToText(this.instance); static Future init(String modelPath, String device) async { - throw UnimplementedError(); - //final result = await Isolate.run(() { - // final 
modelPathPtr = modelPath.toNativeUtf8(); - // final devicePtr = device.toNativeUtf8(); - // final status = ov.speechToTextOpen(modelPathPtr, devicePtr); - // calloc.free(modelPathPtr); - // calloc.free(devicePtr); - - // return status; - //}); - - //print("${result.ref.status}, ${result.ref.message}"); - //if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { - // throw "SpeechToText open error: ${result.ref.status} ${result.ref.message.toDartString()}"; - //} - - //return SpeechToText(result); + final result = await Isolate.run(() { + final modelPathPtr = modelPath.toNativeUtf8(); + final devicePtr = device.toNativeUtf8(); + final status = ov.speechToTextOpen(modelPathPtr, devicePtr); + calloc.free(modelPathPtr); + calloc.free(devicePtr); + + return status; + }); + + print("${result.ref.status}, ${result.ref.message}"); + if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { + throw "SpeechToText open error: ${result.ref.status} ${result.ref.message.toDartString()}"; + } + + return SpeechToText(result); } Future loadVideo(String videoPath) async{ - throw UnimplementedError(); - //int instanceAddress = instance.ref.value.address; - //{ - // final result = await Isolate.run(() { - // final videoPathPtr = videoPath.toNativeUtf8(); - // final status = ov.speechToTextLoadVideo(Pointer.fromAddress(instanceAddress), videoPathPtr); - // calloc.free(videoPathPtr); - // return status; - // }); - - // if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { - // throw "SpeechToText LoadVideo error: ${result.ref.status} ${result.ref.message.toDartString()}"; - // } - //} - - //{ - // final result = await Isolate.run(() { - // final status = ov.speechToTextVideoDuration(Pointer.fromAddress(instanceAddress)); - // return status; - // }); - // if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { - // throw "SpeechToText VideoDuration error: ${result.ref.status} ${result.ref.message.toDartString()}"; - // } - // return 
result.ref.value; - //} + int instanceAddress = instance.ref.value.address; + { + final result = await Isolate.run(() { + final videoPathPtr = videoPath.toNativeUtf8(); + final status = ov.speechToTextLoadVideo(Pointer.fromAddress(instanceAddress), videoPathPtr); + calloc.free(videoPathPtr); + return status; + }); + + if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { + throw "SpeechToText LoadVideo error: ${result.ref.status} ${result.ref.message.toDartString()}"; + } + } + + { + final result = await Isolate.run(() { + final status = ov.speechToTextVideoDuration(Pointer.fromAddress(instanceAddress)); + return status; + }); + if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { + throw "SpeechToText VideoDuration error: ${result.ref.status} ${result.ref.message.toDartString()}"; + } + return result.ref.value; + } } Future transcribe(int start, int duration, String language) async{ - throw UnimplementedError(); - //int instanceAddress = instance.ref.value.address; - //final result = await Isolate.run(() { - // final languagePtr = language.toNativeUtf8(); - // final status = ov.speechToTextTranscribe(Pointer.fromAddress(instanceAddress), start, duration, languagePtr); - // calloc.free(languagePtr); - // return status; - //}); - - //if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { - // throw "SpeechToText LoadVideo error: ${result.ref.status} ${result.ref.message.toDartString()}"; - //} - - //return result.ref.value.toDartString(); + int instanceAddress = instance.ref.value.address; + final result = await Isolate.run(() { + final languagePtr = language.toNativeUtf8(); + final status = ov.speechToTextTranscribe(Pointer.fromAddress(instanceAddress), start, duration, languagePtr); + calloc.free(languagePtr); + return status; + }); + + if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { + throw "SpeechToText LoadVideo error: ${result.ref.status} ${result.ref.message.toDartString()}"; + } + + return 
result.ref.value.toDartString(); } } diff --git a/lib/pages/computer_vision/batch_inference.dart b/lib/pages/computer_vision/batch_inference.dart index d2f34cf..53c74ff 100644 --- a/lib/pages/computer_vision/batch_inference.dart +++ b/lib/pages/computer_vision/batch_inference.dart @@ -99,7 +99,7 @@ class BatchInference extends StatelessWidget { ), ), ), - const ModelProperties(), + ModelProperties(project: batchInference.imageInference.project), ], ); } diff --git a/lib/pages/computer_vision/live_inference.dart b/lib/pages/computer_vision/live_inference.dart index 0b089bf..9c78f25 100644 --- a/lib/pages/computer_vision/live_inference.dart +++ b/lib/pages/computer_vision/live_inference.dart @@ -135,7 +135,7 @@ class _LiveInferenceState extends State { ], ), ), - const ModelProperties(), + ModelProperties(project: widget.project), ], ); } diff --git a/lib/pages/computer_vision/widgets/model_properties.dart b/lib/pages/computer_vision/widgets/model_properties.dart index d333243..5d7e932 100644 --- a/lib/pages/computer_vision/widgets/model_properties.dart +++ b/lib/pages/computer_vision/widgets/model_properties.dart @@ -1,70 +1,67 @@ import 'package:fluent_ui/fluent_ui.dart'; +import 'package:inference/project.dart'; import 'package:inference/theme_fluent.dart'; -import 'package:inference/utils.dart'; import 'package:inference/pages/models/widgets/grid_container.dart'; -import 'package:inference/providers/image_inference_provider.dart'; import 'package:intl/intl.dart'; -import 'package:provider/provider.dart'; +import 'package:inference/utils.dart'; class ModelProperties extends StatelessWidget { - const ModelProperties({super.key}); + final Project project; + const ModelProperties({super.key, required this.project}); @override Widget build(BuildContext context) { - return Consumer(builder: (context, inference, child) { - Locale locale = Localizations.localeOf(context); - final formatter = NumberFormat.percentPattern(locale.languageCode); + Locale locale = 
Localizations.localeOf(context); + final formatter = NumberFormat.percentPattern(locale.languageCode); - return SizedBox( - width: 280, - child: GridContainer( - padding: const EdgeInsets.symmetric(vertical: 18, horizontal: 24), - child: Column( - crossAxisAlignment: CrossAxisAlignment.start, - children: [ - const Text("Model parameters", style: TextStyle( - fontSize: 20, - )), - Container( - padding: const EdgeInsets.only(top: 16), - child: Column( - crossAxisAlignment: CrossAxisAlignment.start, - children: [ - ModelProperty( - title: "Model name", - value: inference.project.name, - ), - ModelProperty( - title: "Task", - value: inference.project.taskName(), - ), - ModelProperty( - title: "Architecture", - value: inference.project.architecture, - ), - ModelProperty( - title: "Size", - value: inference.project.size?.readableFileSize() ?? "", - ), - Builder( - builder: (context) { - if (inference.project.tasks.first.performance == null) { - return Container(); - } - return ModelProperty( - title: "Accuracy", - value: formatter.format(inference.project.tasks.first.performance!.score) - ); - } - ), - ], + return SizedBox( + width: 280, + child: GridContainer( + padding: const EdgeInsets.symmetric(vertical: 18, horizontal: 24), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + const Text("Model parameters", style: TextStyle( + fontSize: 20, + )), + Container( + padding: const EdgeInsets.only(top: 16), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + ModelProperty( + title: "Model name", + value: project.name, + ), + ModelProperty( + title: "Task", + value: project.taskName(), + ), + ModelProperty( + title: "Architecture", + value: project.architecture, + ), + ModelProperty( + title: "Size", + value: project.size?.readableFileSize() ?? 
"", + ), + Builder( + builder: (context) { + if (project.tasks.first.performance == null) { + return Container(); + } + return ModelProperty( + title: "Accuracy", + value: formatter.format(project.tasks.first.performance!.score) + ); + } ), - ) - ], + ], + ), ) - ), - ); - } + ], + ) + ), ); } } diff --git a/lib/pages/models/inference.dart b/lib/pages/models/inference.dart index 1445420..18b022a 100644 --- a/lib/pages/models/inference.dart +++ b/lib/pages/models/inference.dart @@ -1,5 +1,6 @@ import 'package:fluent_ui/fluent_ui.dart'; import 'package:inference/pages/computer_vision/computer_vision.dart'; +import 'package:inference/pages/transcription/transcription.dart'; import 'package:inference/project.dart'; class InferencePage extends StatelessWidget { @@ -14,7 +15,7 @@ class InferencePage extends StatelessWidget { case ProjectType.text: return Container(); case ProjectType.speech: - return Container(); + return TranscriptionPage(project); } } diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart new file mode 100644 index 0000000..28417b8 --- /dev/null +++ b/lib/pages/transcription/playground.dart @@ -0,0 +1,101 @@ +import 'package:file_picker/file_picker.dart'; +import 'package:fluent_ui/fluent_ui.dart'; +import 'package:inference/pages/computer_vision/widgets/model_properties.dart'; +import 'package:inference/pages/models/widgets/grid_container.dart'; +import 'package:inference/project.dart'; +import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; +import 'package:inference/theme_fluent.dart'; +import 'package:inference/utils/drop_area.dart'; +import 'package:inference/widgets/controls/no_outline_button.dart'; +import 'package:inference/widgets/device_selector.dart'; +//import 'package:media_kit/media_kit.dart'; +//import 'package:media_kit_video/media_kit_video.dart'; +import 'package:provider/provider.dart'; + +class Playground extends StatefulWidget { + final Project project; + 
const Playground({super.key, required this.project}); + + @override + State createState() => _PlaygroundState(); +} + +class _PlaygroundState extends State { + //late final player = Player(); + //late final controller = VideoController(player); + + void showUploadMenu() async { + FilePickerResult? result = await FilePicker.platform.pickFiles(type: FileType.video); + + if (result != null) { + uploadFile(result.files.single.path!); + } + } + + void uploadFile(String file) async { + final inference = Provider.of(context, listen: false); + await inference.loadVideo(file); + inference.startTranscribing(); + } + + @override + Widget build(BuildContext context) { + final theme = FluentTheme.of(context); + return Row( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + Expanded( + child: Column( + children: [ + SizedBox( + height: 64, + child: GridContainer( + child: Padding( + padding: const EdgeInsets.symmetric(horizontal: 16), + child: Row( + children: [ + NoOutlineButton( + onPressed: showUploadMenu, + child: Row( + children: [ + const Text("Choose video"), + const Padding( + padding: EdgeInsets.only(left: 8), + child: Icon(FluentIcons.chevron_down, size: 12), + ), + ], + ), + ), + const DeviceSelector(), + ], + ), + ), + ), + ), + Expanded( + child: GridContainer( + color: backgroundColor.of(theme), + child: Builder( + builder: (context) { + return DropArea( + type: "video", + showChild: false, + onUpload: (String file) { uploadFile(file); }, + extensions: const [], + child: Padding( + padding: const EdgeInsets.all(8.0), + child: Container(), + ), + ); + } + ), + ), + ) + ], + ), + ), + ModelProperties(project: widget.project), + ] + ); + } +} diff --git a/lib/pages/transcription/providers/speech_inference_provider.dart b/lib/pages/transcription/providers/speech_inference_provider.dart new file mode 100644 index 0000000..9f658fe --- /dev/null +++ b/lib/pages/transcription/providers/speech_inference_provider.dart @@ -0,0 +1,88 @@ +import 'dart:async'; + 
+import 'package:flutter/material.dart'; +import 'package:inference/interop/speech_to_text.dart'; +import 'package:inference/pages/transcription/utils/section.dart'; +import 'package:inference/project.dart'; + +const transcriptionPeriod = 10; + +class SpeechInferenceProvider extends ChangeNotifier { + Completer loaded = Completer(); + + + Project? _project; + String? _device; + + String? _videoPath; + String? get videoPath => _videoPath; + + bool get videoLoaded => _videoPath != null; + + DynamicRangeLoading>? _transcription; + Map>? get transcription => _transcription?.data; + + String _language = ""; + + String get language => _language; + set language(String val) { + _language = val; + notifyListeners(); + } + + SpeechToText? _inference; + + SpeechInferenceProvider(Project? project, String? device) { + _project = project; + _device = device; + + if (project != null && device != null) { + SpeechToText.init(project.storagePath, device).then((instance) { + _inference = instance; + loaded.complete(); + notifyListeners(); + }); + } + } + + void skipTo(int index) { + _transcription!.skipTo(index); + } + + Future loadVideo(String path) async { + await loaded.future; + _videoPath = path; + final duration = await _inference!.loadVideo(path); + final sections = (duration / transcriptionPeriod).ceil(); + _transcription = DynamicRangeLoading>(Section(0, sections)); + notifyListeners(); + } + + Future startTranscribing() async { + if (_transcription == null) { + throw Exception("Can't transcribe before loading video"); + } + + while (!_transcription!.complete) { + if (_transcription == null) { + return; + } + await _transcription!.process((int i) { + return transcribe(i * transcriptionPeriod, transcriptionPeriod); + }); + if (hasListeners) { + notifyListeners(); + } + } + } + + Future transcribe(int start, int duration) async { + await loaded.future; + return await _inference!.transcribe(start, duration, _language); + } + + bool sameProps(Project? project, String? 
device) { + return _project == project && _device == device; + } + +} diff --git a/lib/pages/transcription/transcription.dart b/lib/pages/transcription/transcription.dart new file mode 100644 index 0000000..14353d9 --- /dev/null +++ b/lib/pages/transcription/transcription.dart @@ -0,0 +1,120 @@ +import 'package:fluent_ui/fluent_ui.dart'; +import 'package:go_router/go_router.dart'; +import 'package:inference/project.dart'; +import 'package:inference/providers/preference_provider.dart'; +import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; +import 'package:inference/pages/transcription/playground.dart'; +import 'package:provider/provider.dart'; + +class TranscriptionPage extends StatefulWidget { + final Project project; + const TranscriptionPage(this.project, {super.key}); + + @override + State createState() => _TranscriptionPageState(); +} + +class _TranscriptionPageState extends State { + + + int selected = 0; + @override + Widget build(BuildContext context) { + final theme = FluentTheme.of(context); + final updatedTheme = theme.copyWith( + navigationPaneTheme: theme.navigationPaneTheme.merge(NavigationPaneThemeData( + backgroundColor: theme.scaffoldBackgroundColor, + )) + ); + return ChangeNotifierProxyProvider( + lazy: false, + create: (_) { + final device = Provider.of(context, listen: false).device; + return SpeechInferenceProvider(widget.project, device); + }, + update: (_, preferences, imageInferenceProvider) { + if (imageInferenceProvider != null && imageInferenceProvider.sameProps(widget.project, preferences.device)) { + return imageInferenceProvider; + } + return SpeechInferenceProvider(widget.project, preferences.device); + }, + child: Stack( + children: [ + FluentTheme( + data: updatedTheme, + child: NavigationView( + pane: NavigationPane( + size: const NavigationPaneSize(topHeight: 64), + header: Row( + children: [ + Padding( + padding: const EdgeInsets.only(left: 12.0), + child: ClipRRect( + borderRadius: 
BorderRadius.circular(4.0), + child: Container( + width: 40, + height: 40, + decoration: BoxDecoration( + image: DecorationImage( + image: widget.project.thumbnailImage(), + fit: BoxFit.cover), + ), + ), + ), + ), + Padding( + padding: const EdgeInsets.symmetric(horizontal: 16), + child: Text(widget.project.name, + style: const TextStyle(fontSize: 20, fontWeight: FontWeight.bold), + ), + ), + ], + ), + //customPane: CustomNavigationPane(), + selected: selected, + onChanged: (i) => setState(() {selected = i;}), + displayMode: PaneDisplayMode.top, + items: [ + PaneItem( + icon: const Icon(FluentIcons.processing), + title: const Text("Playground"), + body: Playground(project: widget.project), + ), + PaneItem( + icon: const Icon(FluentIcons.project_collection), + title: const Text("Performance metrics"), + body: Container(), + ), + ], + ) + ), + ), + SizedBox( + height: 64, + child: Padding( + padding: const EdgeInsets.symmetric(horizontal: 25), + child: Row( + mainAxisAlignment: MainAxisAlignment.end, + children: [ + Padding( + padding: const EdgeInsets.all(4), + child: OutlinedButton( + style: ButtonStyle( + shape:WidgetStatePropertyAll(RoundedRectangleBorder( + borderRadius: BorderRadius.circular(4.0), + side: const BorderSide(color: Color(0XFF545454)), + )), + ), + child: const Text("Close"), + onPressed: () => GoRouter.of(context).go("/models"), + ), + ), + ] + ), + ), + ) + ], + ) + ); + } +} diff --git a/lib/pages/transcription/utils/section.dart b/lib/pages/transcription/utils/section.dart new file mode 100644 index 0000000..27ede1d --- /dev/null +++ b/lib/pages/transcription/utils/section.dart @@ -0,0 +1,98 @@ +void moveToFront(List list, I item) { + list.remove(item); + list.insert(0, item); +} + +void moveToEnd(List list, I item) { + list.remove(item); + list.add(item); +} + +class DynamicRangeLoading { + List
sections = []; + Map data = {}; + + DynamicRangeLoading(Section section): sections = [section]; + + Section get activeSection => sections.first; + + // The incomplete sections will always be in front + bool get complete => activeSection.complete; + + void skipTo(int i) { + for (var section in sections) { + if (section.contains(i)) { + if (i > section.index) { + // Section has not progressed until the requested index + // Split the section and move the new section to the front + final newSection = section.split(i); + sections.insert(0, newSection); + } else { + // Section is further ahead than requested skipTo + // move section to front since that work has higher prio + if (!section.complete && section != activeSection) { + moveToFront(sections, section); + } + } + return; + } + } + + throw Exception("Out of range"); + } + + int getNextIndex() { + if (complete) { + throw Exception("Cannot get next index. All work is done"); + } + return activeSection.index; + } + + void pumpIndex() { + if (activeSection.pump()) { + //activeSection has ended + if (sections.length > 1) { + moveToEnd(sections,activeSection); + } + } + } + + Future process(Future Function(int) func) async{ + final index = getNextIndex(); + final val = await func(index); + data[index] = val; + pumpIndex(); + return val; + } + + void setData(I value) { + data[activeSection.index] = value; + activeSection.index += 1; + } +} + +class Section { + int begin; + int? end; + int index; + + Section(this.begin, this.end): index = begin; + + bool contains(int i) => begin <= i && (end == null ? true : i < end!); + + Section split(int i) { + final newSection = Section(i, end); + end = i; + return newSection; + } + + bool get complete => index == end; + + //returns false if there is still work to do in the section + bool pump() { + if (end == null || index < end!) 
{ + index += 1; + } + return complete; + } +} diff --git a/lib/utils/drop_area.dart b/lib/utils/drop_area.dart index 61bb2f8..dcdf761 100644 --- a/lib/utils/drop_area.dart +++ b/lib/utils/drop_area.dart @@ -50,51 +50,49 @@ class _DropAreaState extends State { @override Widget build(BuildContext context) { - return Expanded( - child: DropTarget( - onDragDone: (details) => handleDrop(details), - onDragExited: (val) => hideReleaseMessage(), - onDragEntered: (val) => showReleaseMessage(), - child: Container( - decoration: BoxDecoration( - borderRadius: BorderRadius.circular(4.0), - color: intelGray, - ), - child: Builder( - builder: (context) { - if (!_showReleaseMessage && widget.showChild) { - return widget.child!; - } - return Center( - child: SizedBox( - height: 310, - child: Column( - crossAxisAlignment: CrossAxisAlignment.center, - mainAxisAlignment: MainAxisAlignment.spaceBetween, - children: [ - SvgPicture.asset('images/drop.svg'), - ( _showReleaseMessage - ? const Text("Release to drop media") - : Text("Drop ${widget.type} here") - ), - ElevatedButton( - onPressed: () => showUploadMenu(), - child: const Text("Upload") - ), - Builder( - builder: (context) { - if (widget.extensions == null) { - return Container(); - } - return Text(widget.extensions!.join(", ")); + return DropTarget( + onDragDone: (details) => handleDrop(details), + onDragExited: (val) => hideReleaseMessage(), + onDragEntered: (val) => showReleaseMessage(), + child: Container( + decoration: BoxDecoration( + borderRadius: BorderRadius.circular(4.0), + color: intelGray, + ), + child: Builder( + builder: (context) { + if (!_showReleaseMessage && widget.showChild) { + return widget.child!; + } + return Center( + child: SizedBox( + height: 310, + child: Column( + crossAxisAlignment: CrossAxisAlignment.center, + mainAxisAlignment: MainAxisAlignment.spaceBetween, + children: [ + SvgPicture.asset('images/drop.svg'), + ( _showReleaseMessage + ? 
const Text("Release to drop media") + : Text("Drop ${widget.type} here") + ), + ElevatedButton( + onPressed: () => showUploadMenu(), + child: const Text("Upload") + ), + Builder( + builder: (context) { + if (widget.extensions == null) { + return Container(); } - ) - ], - ), + return Text(widget.extensions!.join(", ")); + } + ) + ], ), - ); - } - ), + ), + ); + } ), ), ); diff --git a/macos/Runner.xcodeproj/project.pbxproj b/macos/Runner.xcodeproj/project.pbxproj index 8e01daa..9d53cd0 100644 --- a/macos/Runner.xcodeproj/project.pbxproj +++ b/macos/Runner.xcodeproj/project.pbxproj @@ -39,6 +39,16 @@ 0C42C76A2CE386680079F72B /* libopenvino_tensorflow_lite_frontend.2450.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C42C7592CE386520079F72B /* libopenvino_tensorflow_lite_frontend.2450.dylib */; }; 0C42C76B2CE388D90079F72B /* libopenvino_c.2450.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C42C7522CE386520079F72B /* libopenvino_c.2450.dylib */; }; 0C42C76C2CE388DC0079F72B /* libopenvino.2450.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C42C75A2CE386520079F72B /* libopenvino.2450.dylib */; }; + 0C4E1F6C2CECC22800124339 /* libavformat.60.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 0C4E1F692CECC22800124339 /* libavformat.60.dylib */; }; + 0C4E1F6D2CECC22800124339 /* libavutil.58.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 0C4E1F6A2CECC22800124339 /* libavutil.58.dylib */; }; + 0C4E1F6E2CECC22800124339 /* libswresample.4.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 0C4E1F6B2CECC22800124339 /* libswresample.4.dylib */; }; + 0C4E1F6F2CECC22800124339 /* libavcodec.60.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 0C4E1F672CECC22800124339 /* libavcodec.60.dylib */; }; + 0C4E1F702CECC22800124339 /* libavdevice.60.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 0C4E1F682CECC22800124339 /* libavdevice.60.dylib */; }; + 0C4E1F712CECC24900124339 /* libswresample.4.dylib in Bundle 
Framework */ = {isa = PBXBuildFile; fileRef = 0C4E1F6B2CECC22800124339 /* libswresample.4.dylib */; }; + 0C4E1F722CECC25400124339 /* libavcodec.60.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C4E1F672CECC22800124339 /* libavcodec.60.dylib */; }; + 0C4E1F732CECC25400124339 /* libavdevice.60.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C4E1F682CECC22800124339 /* libavdevice.60.dylib */; }; + 0C4E1F742CECC25400124339 /* libavformat.60.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C4E1F692CECC22800124339 /* libavformat.60.dylib */; }; + 0C4E1F752CECC25400124339 /* libavutil.58.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C4E1F6A2CECC22800124339 /* libavutil.58.dylib */; }; 0C5D47382C6F2F9500307B37 /* libmacos_bindings.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 0C5D47372C6F2F9500307B37 /* libmacos_bindings.dylib */; settings = {ATTRIBUTES = (Weak, ); }; }; 0C5D47392C6F2FB200307B37 /* libmacos_bindings.dylib in Resources */ = {isa = PBXBuildFile; fileRef = 0C5D47372C6F2F9500307B37 /* libmacos_bindings.dylib */; }; 0C5D473A2C6F308000307B37 /* libmacos_bindings.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C5D47372C6F2F9500307B37 /* libmacos_bindings.dylib */; settings = {ATTRIBUTES = (CodeSignOnCopy, ); }; }; @@ -117,15 +127,19 @@ dstSubfolderSpec = 10; files = ( 0C42C7672CE386680079F72B /* libopenvino_paddle_frontend.2450.dylib in Bundle Framework */, + 0C4E1F712CECC24900124339 /* libswresample.4.dylib in Bundle Framework */, 0C5D47B32C6F5C1300307B37 /* libopenvino_hetero_plugin.so in Bundle Framework */, + 0C4E1F752CECC25400124339 /* libavutil.58.dylib in Bundle Framework */, 0C42C76C2CE388DC0079F72B /* libopenvino.2450.dylib in Bundle Framework */, 0C42C7662CE386680079F72B /* libopenvino_onnx_frontend.2450.dylib in Bundle Framework */, 0C5D47B12C6F5C0A00307B37 /* libopenvino_auto_batch_plugin.so in Bundle Framework */, 0C5D47B22C6F5C0E00307B37 /* 
libopenvino_auto_plugin.so in Bundle Framework */, 0C5D473E2C6F35E500307B37 /* libblend2d.dylib in Bundle Framework */, + 0C4E1F732CECC25400124339 /* libavdevice.60.dylib in Bundle Framework */, 0C5D47782C6F398400307B37 /* libopencv_core.407.dylib in Bundle Framework */, 0C42C7642CE386680079F72B /* libopenvino_genai.2450.dylib in Bundle Framework */, 0C5D47B02C6F5C0200307B37 /* libopenvino_arm_cpu_plugin.so in Bundle Framework */, + 0C4E1F742CECC25400124339 /* libavformat.60.dylib in Bundle Framework */, 0C5D47802C6F398400307B37 /* libopencv_videoio.407.dylib in Bundle Framework */, 0C5D47792C6F398400307B37 /* libopencv_features2d.407.dylib in Bundle Framework */, 0C42C7682CE386680079F72B /* libopenvino_pytorch_frontend.2450.dylib in Bundle Framework */, @@ -139,6 +153,7 @@ 0C5D477F2C6F398400307B37 /* libopencv_video.407.dylib in Bundle Framework */, 0C5D47812C6F398400307B37 /* libopencv_ximgproc.407.dylib in Bundle Framework */, 0C5D473A2C6F308000307B37 /* libmacos_bindings.dylib in Bundle Framework */, + 0C4E1F722CECC25400124339 /* libavcodec.60.dylib in Bundle Framework */, 0C42C7692CE386680079F72B /* libopenvino_tensorflow_frontend.2450.dylib in Bundle Framework */, 0C5D47A52C6F3B7700307B37 /* libtbb.12.dylib in Bundle Framework */, 0C5D477C2C6F398400307B37 /* libopencv_imgcodecs.407.dylib in Bundle Framework */, @@ -161,6 +176,11 @@ 0C42C7582CE386520079F72B /* libopenvino_tensorflow_frontend.2450.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libopenvino_tensorflow_frontend.2450.dylib; path = ../bindings/libopenvino_tensorflow_frontend.2450.dylib; sourceTree = SOURCE_ROOT; }; 0C42C7592CE386520079F72B /* libopenvino_tensorflow_lite_frontend.2450.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libopenvino_tensorflow_lite_frontend.2450.dylib; path = ../bindings/libopenvino_tensorflow_lite_frontend.2450.dylib; sourceTree = SOURCE_ROOT; }; 0C42C75A2CE386520079F72B /* 
libopenvino.2450.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libopenvino.2450.dylib; path = ../bindings/libopenvino.2450.dylib; sourceTree = SOURCE_ROOT; }; + 0C4E1F672CECC22800124339 /* libavcodec.60.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libavcodec.60.dylib; path = ../bindings/libavcodec.60.dylib; sourceTree = SOURCE_ROOT; }; + 0C4E1F682CECC22800124339 /* libavdevice.60.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libavdevice.60.dylib; path = ../bindings/libavdevice.60.dylib; sourceTree = SOURCE_ROOT; }; + 0C4E1F692CECC22800124339 /* libavformat.60.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libavformat.60.dylib; path = ../bindings/libavformat.60.dylib; sourceTree = SOURCE_ROOT; }; + 0C4E1F6A2CECC22800124339 /* libavutil.58.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libavutil.58.dylib; path = ../bindings/libavutil.58.dylib; sourceTree = SOURCE_ROOT; }; + 0C4E1F6B2CECC22800124339 /* libswresample.4.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libswresample.4.dylib; path = ../bindings/libswresample.4.dylib; sourceTree = SOURCE_ROOT; }; 0C5D47372C6F2F9500307B37 /* libmacos_bindings.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libmacos_bindings.dylib; path = ../bindings/libmacos_bindings.dylib; sourceTree = ""; }; 0C5D473B2C6F357C00307B37 /* libblend2d.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libblend2d.dylib; path = ../bindings/libblend2d.dylib; sourceTree = ""; }; 0C5D47602C6F382800307B37 /* libopencv_calib3d.407.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libopencv_calib3d.407.dylib; path = ../bindings/libopencv_calib3d.407.dylib; sourceTree = ""; }; @@ -241,6 +261,11 
@@ 0C42C75D2CE386520079F72B /* libopenvino_paddle_frontend.2450.dylib in Frameworks */, 0C42C75E2CE386520079F72B /* libopenvino_onnx_frontend.2450.dylib in Frameworks */, 0C42C75F2CE386520079F72B /* libopenvino_c.2450.dylib in Frameworks */, + 0C4E1F6C2CECC22800124339 /* libavformat.60.dylib in Frameworks */, + 0C4E1F6D2CECC22800124339 /* libavutil.58.dylib in Frameworks */, + 0C4E1F6E2CECC22800124339 /* libswresample.4.dylib in Frameworks */, + 0C4E1F6F2CECC22800124339 /* libavcodec.60.dylib in Frameworks */, + 0C4E1F702CECC22800124339 /* libavdevice.60.dylib in Frameworks */, 0C42C7602CE386520079F72B /* libopenvino_genai.2450.dylib in Frameworks */, 0C42C7612CE386520079F72B /* libopenvino.2450.dylib in Frameworks */, 0C42C7622CE386520079F72B /* libopenvino_ir_frontend.2450.dylib in Frameworks */, @@ -361,6 +386,11 @@ 0C5D47642C6F397900307B37 /* libopencv_ximgproc.407.dylib */, 0C5D473B2C6F357C00307B37 /* libblend2d.dylib */, 0C5D47372C6F2F9500307B37 /* libmacos_bindings.dylib */, + 0C4E1F672CECC22800124339 /* libavcodec.60.dylib */, + 0C4E1F682CECC22800124339 /* libavdevice.60.dylib */, + 0C4E1F692CECC22800124339 /* libavformat.60.dylib */, + 0C4E1F6A2CECC22800124339 /* libavutil.58.dylib */, + 0C4E1F6B2CECC22800124339 /* libswresample.4.dylib */, 11E6C6B7198D7B3B20F4A75C /* Pods_Runner.framework */, CB5E7865DB70376BADAAEAE6 /* Pods_RunnerTests.framework */, ); diff --git a/openvino_bindings/src/BUILD b/openvino_bindings/src/BUILD index 13b1000..3bb179c 100644 --- a/openvino_bindings/src/BUILD +++ b/openvino_bindings/src/BUILD @@ -9,6 +9,7 @@ cc_library( "//src/utils:utils", "//src/image:image_inference", "//src/llm:llm_inference", + "//src/audio:speech_to_text", "//src/mediapipe:graph_runner", ], ) diff --git a/openvino_bindings/src/bindings.cc b/openvino_bindings/src/bindings.cc index fcefbec..04b0efa 100644 --- a/openvino_bindings/src/bindings.cc +++ b/openvino_bindings/src/bindings.cc @@ -4,7 +4,7 @@ #include #include -//#include "src/audio/speech_to_text.h" 
+#include "src/audio/speech_to_text.h" #include "src/image/image_inference.h" #include "src/mediapipe/graph_runner.h" #include "src/mediapipe/serialization/serialization_calculators.h" @@ -290,48 +290,48 @@ Status* graphRunnerStop(CGraphRunner instance) { } } -//StatusOrSpeechToText* speechToTextOpen(const char* model_path, const char* device) { -// try { -// auto instance = new SpeechToText(model_path, device); -// return new StatusOrSpeechToText{OkStatus, "", instance}; -// } catch (...) { -// auto except = handle_exceptions(); -// return new StatusOrSpeechToText{except->status, except->message}; -// } -//} -// -//Status* speechToTextLoadVideo(CSpeechToText instance, const char* video_path) { -// try { -// auto object = reinterpret_cast(instance); -// object->load_video(video_path); -// return new Status{OkStatus, ""}; -// } catch (...) { -// return handle_exceptions(); -// } -//} -// -//StatusOrInt* speechToTextVideoDuration(CSpeechToText instance) { -// try { -// auto object = reinterpret_cast(instance); -// object->video_duration(); -// // Deal with long in the future -// return new StatusOrInt{OkStatus, "", (int)object->video_duration()}; -// } catch (...) { -// return new StatusOrInt{OkStatus, ""}; -// } -//} -// -//StatusOrModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language) { -// try { -// auto object = reinterpret_cast(instance); -// auto result = object->transcribe(start, duration, language); -// std::string text = result; -// return new StatusOrModelResponse{OkStatus, "", convertToMetricsStruct(result.perf_metrics), strdup(text.c_str())}; -// } catch (...) { -// auto except = handle_exceptions(); -// return new StatusOrModelResponse{except->status, except->message}; -// } -//} +StatusOrSpeechToText* speechToTextOpen(const char* model_path, const char* device) { + try { + auto instance = new SpeechToText(model_path, device); + return new StatusOrSpeechToText{OkStatus, "", instance}; + } catch (...) 
{ + auto except = handle_exceptions(); + return new StatusOrSpeechToText{except->status, except->message}; + } +} + +Status* speechToTextLoadVideo(CSpeechToText instance, const char* video_path) { + try { + auto object = reinterpret_cast(instance); + object->load_video(video_path); + return new Status{OkStatus, ""}; + } catch (...) { + return handle_exceptions(); + } +} + +StatusOrInt* speechToTextVideoDuration(CSpeechToText instance) { + try { + auto object = reinterpret_cast(instance); + object->video_duration(); + // Deal with long in the future + return new StatusOrInt{OkStatus, "", (int)object->video_duration()}; + } catch (...) { + return new StatusOrInt{OkStatus, ""}; + } +} + +StatusOrModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language) { + try { + auto object = reinterpret_cast(instance); + auto result = object->transcribe(start, duration, language); + std::string text = result; + return new StatusOrModelResponse{OkStatus, "", convertToMetricsStruct(result.perf_metrics), strdup(text.c_str())}; + } catch (...) 
{ + auto except = handle_exceptions(); + return new StatusOrModelResponse{except->status, except->message}; + } +} //void report_rss() { // struct rusage r_usage; diff --git a/openvino_bindings/src/bindings.h b/openvino_bindings/src/bindings.h index e496c5a..4528916 100644 --- a/openvino_bindings/src/bindings.h +++ b/openvino_bindings/src/bindings.h @@ -123,10 +123,10 @@ EXPORT Status* graphRunnerQueueSerializationOutput(CGraphRunner instance, const EXPORT StatusOrString* graphRunnerGet(CGraphRunner instance); EXPORT Status* graphRunnerStop(CGraphRunner instance); -//EXPORT StatusOrSpeechToText* speechToTextOpen(const char* model_path, const char* device); -//EXPORT Status* speechToTextLoadVideo(CSpeechToText instance, const char* video_path); -//EXPORT StatusOrInt* speechToTextVideoDuration(CSpeechToText instance); -//EXPORT StatusOrModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language); +EXPORT StatusOrSpeechToText* speechToTextOpen(const char* model_path, const char* device); +EXPORT Status* speechToTextLoadVideo(CSpeechToText instance, const char* video_path); +EXPORT StatusOrInt* speechToTextVideoDuration(CSpeechToText instance); +EXPORT StatusOrModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language); EXPORT StatusOrDevices* getAvailableDevices(); Status* handle_exceptions(); From e32515493b7e55a64d7034b6a384f2c7e7e05a19 Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Tue, 19 Nov 2024 16:40:29 +0100 Subject: [PATCH 02/17] Working video and subtitles --- lib/main.dart | 2 + lib/pages/transcription/playground.dart | 79 +++++-- .../transcription/widgets/subtitles.dart | 59 +++++ linux/flutter/generated_plugin_registrant.cc | 8 + linux/flutter/generated_plugins.cmake | 3 + macos/Flutter/GeneratedPluginRegistrant.swift | 10 + macos/Podfile.lock | 36 ++++ pubspec.lock | 204 +++++++++++++++++- pubspec.yaml | 3 + .../flutter/generated_plugin_registrant.cc | 
9 + windows/flutter/generated_plugins.cmake | 4 + 11 files changed, 392 insertions(+), 25 deletions(-) create mode 100644 lib/pages/transcription/widgets/subtitles.dart diff --git a/lib/main.dart b/lib/main.dart index 9f019f0..fe0ba36 100644 --- a/lib/main.dart +++ b/lib/main.dart @@ -6,6 +6,7 @@ import 'package:inference/theme_fluent.dart'; import 'package:inference/providers/preference_provider.dart'; import 'package:inference/providers/project_provider.dart'; import 'package:inference/public_models.dart'; +import 'package:media_kit/media_kit.dart'; import 'package:provider/provider.dart'; @@ -25,6 +26,7 @@ void testConnection() async { } void main() { + MediaKit.ensureInitialized(); testConnection(); runApp(const App()); } diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart index 28417b8..0b2d74c 100644 --- a/lib/pages/transcription/playground.dart +++ b/lib/pages/transcription/playground.dart @@ -1,15 +1,19 @@ +import 'dart:async'; + import 'package:file_picker/file_picker.dart'; import 'package:fluent_ui/fluent_ui.dart'; import 'package:inference/pages/computer_vision/widgets/model_properties.dart'; import 'package:inference/pages/models/widgets/grid_container.dart'; +import 'package:inference/pages/transcription/widgets/subtitles.dart'; import 'package:inference/project.dart'; import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; import 'package:inference/theme_fluent.dart'; import 'package:inference/utils/drop_area.dart'; import 'package:inference/widgets/controls/no_outline_button.dart'; import 'package:inference/widgets/device_selector.dart'; -//import 'package:media_kit/media_kit.dart'; -//import 'package:media_kit_video/media_kit_video.dart'; +import 'package:intl/date_symbol_data_local.dart'; +import 'package:media_kit/media_kit.dart'; +import 'package:media_kit_video/media_kit_video.dart'; import 'package:provider/provider.dart'; class Playground extends StatefulWidget { @@ 
-20,9 +24,12 @@ class Playground extends StatefulWidget { State createState() => _PlaygroundState(); } -class _PlaygroundState extends State { - //late final player = Player(); - //late final controller = VideoController(player); +class _PlaygroundState extends State with TickerProviderStateMixin{ + final player = Player(); + late final controller = VideoController(player); + int subtitleIndex = 0; + StreamSubscription? listener; + void showUploadMenu() async { FilePickerResult? result = await FilePicker.platform.pickFiles(type: FileType.video); @@ -32,10 +39,29 @@ class _PlaygroundState extends State { } } + void positionListener(Duration position) { + int index = (position.inSeconds / transcriptionPeriod).floor(); + if (index != subtitleIndex) { + final inference = Provider.of(context, listen: false); + inference.skipTo(index); + setState(() { + subtitleIndex = index; + }); + } + } + + void initializeVideoAndListeners(String source) async { + await listener?.cancel(); + player.open(Media(source)); + player.setVolume(0); // TODO: Disable this for release. 
This is for our sanity + listener = player.stream.position.listen(positionListener); + } + void uploadFile(String file) async { final inference = Provider.of(context, listen: false); await inference.loadVideo(file); inference.startTranscribing(); + initializeVideoAndListeners(file); } @override @@ -72,24 +98,31 @@ class _PlaygroundState extends State { ), ), ), - Expanded( - child: GridContainer( - color: backgroundColor.of(theme), - child: Builder( - builder: (context) { - return DropArea( - type: "video", - showChild: false, - onUpload: (String file) { uploadFile(file); }, - extensions: const [], - child: Padding( - padding: const EdgeInsets.all(8.0), - child: Container(), - ), - ); - } - ), - ), + Consumer( + builder: (context, inference, child) { + return Expanded( + child: GridContainer( + color: backgroundColor.of(theme), + child: Builder( + builder: (context) { + return DropArea( + type: "video", + showChild: inference.videoLoaded, + onUpload: (String file) { uploadFile(file); }, + extensions: const [], + child: Stack( + alignment: Alignment.bottomCenter, + children: [ + Video(controller: controller), + Subtitles(transcription: inference.transcription, subtitleIndex: subtitleIndex), + ] + ), + ); + } + ), + ), + ); + } ) ], ), diff --git a/lib/pages/transcription/widgets/subtitles.dart b/lib/pages/transcription/widgets/subtitles.dart new file mode 100644 index 0000000..21c609b --- /dev/null +++ b/lib/pages/transcription/widgets/subtitles.dart @@ -0,0 +1,59 @@ +import 'dart:async'; + +import 'package:fluent_ui/fluent_ui.dart'; + +class Subtitles extends StatelessWidget { + const Subtitles({ + super.key, + required this.transcription, + required this.subtitleIndex, + }); + + final Map>? 
transcription; + final int subtitleIndex; + + static const double fontSize = 18; + + @override + Widget build(BuildContext context) { + return Padding( + padding: const EdgeInsets.only(left: 8, right: 8, bottom: 60), + child: SizedBox( + height: 100, + child: Builder( + builder: (context) { + if (transcription == null ) { + return Container(); + } + if (transcription![subtitleIndex] is String) { + return Stack( + alignment: Alignment.bottomCenter, + children: [ + Text( + transcription![subtitleIndex] as String, + textAlign: TextAlign.center, + style: TextStyle( + fontSize: fontSize, + foreground: Paint() + ..style = PaintingStyle.stroke + ..strokeWidth = 2 + ..color = Colors.black, + ) + ), + Text( + transcription![subtitleIndex] as String, + textAlign: TextAlign.center, + style: const TextStyle( + fontSize: fontSize + ) + ) + ], + ); + } + return Container(); + } + ), + ), + ); + } +} diff --git a/linux/flutter/generated_plugin_registrant.cc b/linux/flutter/generated_plugin_registrant.cc index 8e89f01..17066f6 100644 --- a/linux/flutter/generated_plugin_registrant.cc +++ b/linux/flutter/generated_plugin_registrant.cc @@ -8,6 +8,8 @@ #include #include +#include +#include #include void fl_register_plugins(FlPluginRegistry* registry) { @@ -17,6 +19,12 @@ void fl_register_plugins(FlPluginRegistry* registry) { g_autoptr(FlPluginRegistrar) flutter_acrylic_registrar = fl_plugin_registry_get_registrar_for_plugin(registry, "FlutterAcrylicPlugin"); flutter_acrylic_plugin_register_with_registrar(flutter_acrylic_registrar); + g_autoptr(FlPluginRegistrar) media_kit_libs_linux_registrar = + fl_plugin_registry_get_registrar_for_plugin(registry, "MediaKitLibsLinuxPlugin"); + media_kit_libs_linux_plugin_register_with_registrar(media_kit_libs_linux_registrar); + g_autoptr(FlPluginRegistrar) media_kit_video_registrar = + fl_plugin_registry_get_registrar_for_plugin(registry, "MediaKitVideoPlugin"); + media_kit_video_plugin_register_with_registrar(media_kit_video_registrar); 
g_autoptr(FlPluginRegistrar) system_theme_registrar = fl_plugin_registry_get_registrar_for_plugin(registry, "SystemThemePlugin"); system_theme_plugin_register_with_registrar(system_theme_registrar); diff --git a/linux/flutter/generated_plugins.cmake b/linux/flutter/generated_plugins.cmake index cc87f3a..386a1eb 100644 --- a/linux/flutter/generated_plugins.cmake +++ b/linux/flutter/generated_plugins.cmake @@ -5,10 +5,13 @@ list(APPEND FLUTTER_PLUGIN_LIST desktop_drop flutter_acrylic + media_kit_libs_linux + media_kit_video system_theme ) list(APPEND FLUTTER_FFI_PLUGIN_LIST + media_kit_native_event_loop ) set(PLUGIN_BUNDLED_LIBRARIES) diff --git a/macos/Flutter/GeneratedPluginRegistrant.swift b/macos/Flutter/GeneratedPluginRegistrant.swift index dc08871..c719943 100644 --- a/macos/Flutter/GeneratedPluginRegistrant.swift +++ b/macos/Flutter/GeneratedPluginRegistrant.swift @@ -7,12 +7,22 @@ import Foundation import desktop_drop import macos_window_utils +import media_kit_libs_macos_video +import media_kit_video +import package_info_plus import path_provider_foundation +import screen_brightness_macos import system_theme +import wakelock_plus func RegisterGeneratedPlugins(registry: FlutterPluginRegistry) { DesktopDropPlugin.register(with: registry.registrar(forPlugin: "DesktopDropPlugin")) MacOSWindowUtilsPlugin.register(with: registry.registrar(forPlugin: "MacOSWindowUtilsPlugin")) + MediaKitLibsMacosVideoPlugin.register(with: registry.registrar(forPlugin: "MediaKitLibsMacosVideoPlugin")) + MediaKitVideoPlugin.register(with: registry.registrar(forPlugin: "MediaKitVideoPlugin")) + FPPPackageInfoPlusPlugin.register(with: registry.registrar(forPlugin: "FPPPackageInfoPlusPlugin")) PathProviderPlugin.register(with: registry.registrar(forPlugin: "PathProviderPlugin")) + ScreenBrightnessMacosPlugin.register(with: registry.registrar(forPlugin: "ScreenBrightnessMacosPlugin")) SystemThemePlugin.register(with: registry.registrar(forPlugin: "SystemThemePlugin")) + 
WakelockPlusMacosPlugin.register(with: registry.registrar(forPlugin: "WakelockPlusMacosPlugin")) } diff --git a/macos/Podfile.lock b/macos/Podfile.lock index 5d8310b..c810432 100644 --- a/macos/Podfile.lock +++ b/macos/Podfile.lock @@ -4,18 +4,36 @@ PODS: - FlutterMacOS (1.0.0) - macos_window_utils (1.0.0): - FlutterMacOS + - media_kit_libs_macos_video (1.0.4): + - FlutterMacOS + - media_kit_native_event_loop (1.0.0): + - FlutterMacOS + - media_kit_video (0.0.1): + - FlutterMacOS + - package_info_plus (0.0.1): + - FlutterMacOS - path_provider_foundation (0.0.1): - Flutter - FlutterMacOS + - screen_brightness_macos (0.1.0): + - FlutterMacOS - system_theme (0.0.1): - FlutterMacOS + - wakelock_plus (0.0.1): + - FlutterMacOS DEPENDENCIES: - desktop_drop (from `Flutter/ephemeral/.symlinks/plugins/desktop_drop/macos`) - FlutterMacOS (from `Flutter/ephemeral`) - macos_window_utils (from `Flutter/ephemeral/.symlinks/plugins/macos_window_utils/macos`) + - media_kit_libs_macos_video (from `Flutter/ephemeral/.symlinks/plugins/media_kit_libs_macos_video/macos`) + - media_kit_native_event_loop (from `Flutter/ephemeral/.symlinks/plugins/media_kit_native_event_loop/macos`) + - media_kit_video (from `Flutter/ephemeral/.symlinks/plugins/media_kit_video/macos`) + - package_info_plus (from `Flutter/ephemeral/.symlinks/plugins/package_info_plus/macos`) - path_provider_foundation (from `Flutter/ephemeral/.symlinks/plugins/path_provider_foundation/darwin`) + - screen_brightness_macos (from `Flutter/ephemeral/.symlinks/plugins/screen_brightness_macos/macos`) - system_theme (from `Flutter/ephemeral/.symlinks/plugins/system_theme/macos`) + - wakelock_plus (from `Flutter/ephemeral/.symlinks/plugins/wakelock_plus/macos`) EXTERNAL SOURCES: desktop_drop: @@ -24,17 +42,35 @@ EXTERNAL SOURCES: :path: Flutter/ephemeral macos_window_utils: :path: Flutter/ephemeral/.symlinks/plugins/macos_window_utils/macos + media_kit_libs_macos_video: + :path: 
Flutter/ephemeral/.symlinks/plugins/media_kit_libs_macos_video/macos + media_kit_native_event_loop: + :path: Flutter/ephemeral/.symlinks/plugins/media_kit_native_event_loop/macos + media_kit_video: + :path: Flutter/ephemeral/.symlinks/plugins/media_kit_video/macos + package_info_plus: + :path: Flutter/ephemeral/.symlinks/plugins/package_info_plus/macos path_provider_foundation: :path: Flutter/ephemeral/.symlinks/plugins/path_provider_foundation/darwin + screen_brightness_macos: + :path: Flutter/ephemeral/.symlinks/plugins/screen_brightness_macos/macos system_theme: :path: Flutter/ephemeral/.symlinks/plugins/system_theme/macos + wakelock_plus: + :path: Flutter/ephemeral/.symlinks/plugins/wakelock_plus/macos SPEC CHECKSUMS: desktop_drop: 69eeff437544aa619c8db7f4481b3a65f7696898 FlutterMacOS: 8f6f14fa908a6fb3fba0cd85dbd81ec4b251fb24 macos_window_utils: 933f91f64805e2eb91a5bd057cf97cd097276663 + media_kit_libs_macos_video: b3e2bbec2eef97c285f2b1baa7963c67c753fb82 + media_kit_native_event_loop: 81fd5b45192b72f8b5b69eaf5b540f45777eb8d5 + media_kit_video: c75b07f14d59706c775778e4dd47dd027de8d1e5 + package_info_plus: 12f1c5c2cfe8727ca46cbd0b26677728972d9a5b path_provider_foundation: 2b6b4c569c0fb62ec74538f866245ac84301af46 + screen_brightness_macos: 2d6d3af2165592d9a55ffcd95b7550970e41ebda system_theme: c7b9f6659a5caa26c9bc2284da096781e9a6fcbc + wakelock_plus: 4783562c9a43d209c458cb9b30692134af456269 PODFILE CHECKSUM: 16208599a12443d53889ba2270a4985981cfb204 diff --git a/pubspec.lock b/pubspec.lock index 9e93ab4..d1a1ee8 100644 --- a/pubspec.lock +++ b/pubspec.lock @@ -230,6 +230,14 @@ packages: url: "https://pub.dev" source: hosted version: "2.3.7" + dbus: + dependency: transitive + description: + name: dbus + sha256: "365c771ac3b0e58845f39ec6deebc76e3276aa9922b0cc60840712094d9047ac" + url: "https://pub.dev" + source: hosted + version: "0.7.10" desktop_drop: dependency: "direct main" description: @@ -477,10 +485,10 @@ packages: dependency: transitive description: name: js 
- sha256: c1b2e9b5ea78c45e1a0788d29606ba27dc5f71f019f32ca5140f61ef071838cf + sha256: f2c445dce49627136094980615a031419f7f3eb393237e4ecd97ac15dea343f3 url: "https://pub.dev" source: hosted - version: "0.7.1" + version: "0.6.7" json_annotation: dependency: transitive description: @@ -569,6 +577,78 @@ packages: url: "https://pub.dev" source: hosted version: "2.6.0" + media_kit: + dependency: "direct main" + description: + name: media_kit + sha256: "1f1deee148533d75129a6f38251ff8388e33ee05fc2d20a6a80e57d6051b7b62" + url: "https://pub.dev" + source: hosted + version: "1.1.11" + media_kit_libs_android_video: + dependency: transitive + description: + name: media_kit_libs_android_video + sha256: "9dd8012572e4aff47516e55f2597998f0a378e3d588d0fad0ca1f11a53ae090c" + url: "https://pub.dev" + source: hosted + version: "1.3.6" + media_kit_libs_ios_video: + dependency: transitive + description: + name: media_kit_libs_ios_video + sha256: b5382994eb37a4564c368386c154ad70ba0cc78dacdd3fb0cd9f30db6d837991 + url: "https://pub.dev" + source: hosted + version: "1.1.4" + media_kit_libs_linux: + dependency: transitive + description: + name: media_kit_libs_linux + sha256: e186891c31daa6bedab4d74dcdb4e8adfccc7d786bfed6ad81fe24a3b3010310 + url: "https://pub.dev" + source: hosted + version: "1.1.3" + media_kit_libs_macos_video: + dependency: transitive + description: + name: media_kit_libs_macos_video + sha256: f26aa1452b665df288e360393758f84b911f70ffb3878032e1aabba23aa1032d + url: "https://pub.dev" + source: hosted + version: "1.1.4" + media_kit_libs_video: + dependency: "direct main" + description: + name: media_kit_libs_video + sha256: "20bb4aefa8fece282b59580e1cd8528117297083a6640c98c2e98cfc96b93288" + url: "https://pub.dev" + source: hosted + version: "1.0.5" + media_kit_libs_windows_video: + dependency: transitive + description: + name: media_kit_libs_windows_video + sha256: "32654572167825c42c55466f5d08eee23ea11061c84aa91b09d0e0f69bdd0887" + url: "https://pub.dev" + source: hosted + 
version: "1.0.10" + media_kit_native_event_loop: + dependency: transitive + description: + name: media_kit_native_event_loop + sha256: "7d82e3b3e9ded5c35c3146c5ba1da3118d1dd8ac3435bac7f29f458181471b40" + url: "https://pub.dev" + source: hosted + version: "1.0.9" + media_kit_video: + dependency: "direct main" + description: + name: media_kit_video + sha256: "2cc3b966679963ba25a4ce5b771e532a521ebde7c6aa20e9802bec95d9916c8f" + url: "https://pub.dev" + source: hosted + version: "1.2.5" meta: dependency: transitive description: @@ -609,6 +689,22 @@ packages: url: "https://pub.dev" source: hosted version: "2.1.0" + package_info_plus: + dependency: transitive + description: + name: package_info_plus + sha256: da8d9ac8c4b1df253d1a328b7bf01ae77ef132833479ab40763334db13b91cce + url: "https://pub.dev" + source: hosted + version: "8.1.1" + package_info_plus_platform_interface: + dependency: transitive + description: + name: package_info_plus_platform_interface + sha256: ac1f4a4847f1ade8e6a87d1f39f5d7c67490738642e2542f559ec38c37489a66 + url: "https://pub.dev" + source: hosted + version: "3.0.1" path: dependency: "direct main" description: @@ -753,6 +849,62 @@ packages: url: "https://pub.dev" source: hosted version: "4.1.0" + safe_local_storage: + dependency: transitive + description: + name: safe_local_storage + sha256: ede4eb6cb7d88a116b3d3bf1df70790b9e2038bc37cb19112e381217c74d9440 + url: "https://pub.dev" + source: hosted + version: "1.0.2" + screen_brightness: + dependency: transitive + description: + name: screen_brightness + sha256: ed8da4a4511e79422fc1aa88138e920e4008cd312b72cdaa15ccb426c0faaedd + url: "https://pub.dev" + source: hosted + version: "0.2.2+1" + screen_brightness_android: + dependency: transitive + description: + name: screen_brightness_android + sha256: "3df10961e3a9e968a5e076fe27e7f4741fa8a1d3950bdeb48cf121ed529d0caf" + url: "https://pub.dev" + source: hosted + version: "0.1.0+2" + screen_brightness_ios: + dependency: transitive + description: + name: 
screen_brightness_ios + sha256: "99adc3ca5490b8294284aad5fcc87f061ad685050e03cf45d3d018fe398fd9a2" + url: "https://pub.dev" + source: hosted + version: "0.1.0" + screen_brightness_macos: + dependency: transitive + description: + name: screen_brightness_macos + sha256: "64b34e7e3f4900d7687c8e8fb514246845a73ecec05ab53483ed025bd4a899fd" + url: "https://pub.dev" + source: hosted + version: "0.1.0+1" + screen_brightness_platform_interface: + dependency: transitive + description: + name: screen_brightness_platform_interface + sha256: b211d07f0c96637a15fb06f6168617e18030d5d74ad03795dd8547a52717c171 + url: "https://pub.dev" + source: hosted + version: "0.1.0" + screen_brightness_windows: + dependency: transitive + description: + name: screen_brightness_windows + sha256: "9261bf33d0fc2707d8cf16339ce25768100a65e70af0fcabaf032fc12408ba86" + url: "https://pub.dev" + source: hosted + version: "0.1.3" scroll_pos: dependency: transitive description: @@ -846,6 +998,14 @@ packages: url: "https://pub.dev" source: hosted version: "0.3.1" + synchronized: + dependency: transitive + description: + name: synchronized + sha256: "69fe30f3a8b04a0be0c15ae6490fc859a78ef4c43ae2dd5e8a623d45bfcf9225" + url: "https://pub.dev" + source: hosted + version: "3.3.0+3" system_theme: dependency: "direct main" description: @@ -894,6 +1054,22 @@ packages: url: "https://pub.dev" source: hosted version: "1.4.0" + universal_platform: + dependency: transitive + description: + name: universal_platform + sha256: "64e16458a0ea9b99260ceb5467a214c1f298d647c659af1bff6d3bf82536b1ec" + url: "https://pub.dev" + source: hosted + version: "1.1.0" + uri_parser: + dependency: transitive + description: + name: uri_parser + sha256: "6543c9fd86d2862fac55d800a43e67c0dcd1a41677cb69c2f8edfe73bbcf1835" + url: "https://pub.dev" + source: hosted + version: "2.0.2" uuid: dependency: "direct main" description: @@ -942,6 +1118,30 @@ packages: url: "https://pub.dev" source: hosted version: "14.2.5" + volume_controller: + dependency: 
transitive + description: + name: volume_controller + sha256: c71d4c62631305df63b72da79089e078af2659649301807fa746088f365cb48e + url: "https://pub.dev" + source: hosted + version: "2.0.8" + wakelock_plus: + dependency: transitive + description: + name: wakelock_plus + sha256: bf4ee6f17a2fa373ed3753ad0e602b7603f8c75af006d5b9bdade263928c0484 + url: "https://pub.dev" + source: hosted + version: "1.2.8" + wakelock_plus_platform_interface: + dependency: transitive + description: + name: wakelock_plus_platform_interface + sha256: "422d1cdbb448079a8a62a5a770b69baa489f8f7ca21aef47800c726d404f9d16" + url: "https://pub.dev" + source: hosted + version: "1.2.1" watcher: dependency: transitive description: diff --git a/pubspec.yaml b/pubspec.yaml index 5e35670..c3233b8 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -56,6 +56,9 @@ dependencies: fluent_ui: ^4.9.2 system_theme: ^3.1.2 flutter_acrylic: ^1.1.4 + media_kit: ^1.1.11 # Primary package. + media_kit_video: ^1.2.5 # For video rendering. + media_kit_libs_video: ^1.0.5 # Native video dependencies. 
dev_dependencies: flutter_test: diff --git a/windows/flutter/generated_plugin_registrant.cc b/windows/flutter/generated_plugin_registrant.cc index 909a92e..054d5c6 100644 --- a/windows/flutter/generated_plugin_registrant.cc +++ b/windows/flutter/generated_plugin_registrant.cc @@ -8,6 +8,9 @@ #include #include +#include +#include +#include #include void RegisterPlugins(flutter::PluginRegistry* registry) { @@ -15,6 +18,12 @@ void RegisterPlugins(flutter::PluginRegistry* registry) { registry->GetRegistrarForPlugin("DesktopDropPlugin")); FlutterAcrylicPluginRegisterWithRegistrar( registry->GetRegistrarForPlugin("FlutterAcrylicPlugin")); + MediaKitLibsWindowsVideoPluginCApiRegisterWithRegistrar( + registry->GetRegistrarForPlugin("MediaKitLibsWindowsVideoPluginCApi")); + MediaKitVideoPluginCApiRegisterWithRegistrar( + registry->GetRegistrarForPlugin("MediaKitVideoPluginCApi")); + ScreenBrightnessWindowsPluginRegisterWithRegistrar( + registry->GetRegistrarForPlugin("ScreenBrightnessWindowsPlugin")); SystemThemePluginRegisterWithRegistrar( registry->GetRegistrarForPlugin("SystemThemePlugin")); } diff --git a/windows/flutter/generated_plugins.cmake b/windows/flutter/generated_plugins.cmake index 1f4b61f..3c6f76d 100644 --- a/windows/flutter/generated_plugins.cmake +++ b/windows/flutter/generated_plugins.cmake @@ -5,10 +5,14 @@ list(APPEND FLUTTER_PLUGIN_LIST desktop_drop flutter_acrylic + media_kit_libs_windows_video + media_kit_video + screen_brightness_windows system_theme ) list(APPEND FLUTTER_FFI_PLUGIN_LIST + media_kit_native_event_loop ) set(PLUGIN_BUNDLED_LIBRARIES) From df3b0903b1e192d76489d762adb5375305d24f0b Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 13:29:07 +0100 Subject: [PATCH 03/17] Implement speech to text using chunks Gen AI allows you to output chunks when return_timestamps is true. The chunks are closer to actual sentences and the timestamps are better. I parse these chunks to get to combine them into better sentences. 
--- lib/interop/generated_bindings.dart | 68 +++++++++++- lib/interop/openvino_bindings.dart | 14 +++ lib/interop/speech_to_text.dart | 16 ++- lib/pages/transcription/playground.dart | 58 ++++++---- .../providers/speech_inference_provider.dart | 9 +- lib/pages/transcription/utils/message.dart | 38 +++++++ .../transcription/widgets/subtitles.dart | 12 +- .../transcription/widgets/transcription.dart | 104 ++++++++++++++++++ openvino_bindings/README.md | 2 +- openvino_bindings/WORKSPACE | 2 +- openvino_bindings/src/audio/speech_to_text.cc | 3 +- openvino_bindings/src/audio/speech_to_text.h | 3 +- openvino_bindings/src/bindings.cc | 29 ++++- openvino_bindings/src/bindings.h | 18 ++- 14 files changed, 330 insertions(+), 46 deletions(-) create mode 100644 lib/pages/transcription/utils/message.dart create mode 100644 lib/pages/transcription/widgets/transcription.dart diff --git a/lib/interop/generated_bindings.dart b/lib/interop/generated_bindings.dart index ea541a8..5699581 100644 --- a/lib/interop/generated_bindings.dart +++ b/lib/interop/generated_bindings.dart @@ -92,6 +92,37 @@ class OpenVINO { late final _freeStatusOrSpeechToText = _freeStatusOrSpeechToTextPtr .asFunction)>(); + void freeStatusOrModelResponse( + ffi.Pointer status, + ) { + return _freeStatusOrModelResponse( + status, + ); + } + + late final _freeStatusOrModelResponsePtr = _lookup< + ffi.NativeFunction< + ffi.Void Function(ffi.Pointer)>>( + 'freeStatusOrModelResponse'); + late final _freeStatusOrModelResponse = _freeStatusOrModelResponsePtr + .asFunction)>(); + + void freeStatusOrWhisperModelResponse( + ffi.Pointer status, + ) { + return _freeStatusOrWhisperModelResponse( + status, + ); + } + + late final _freeStatusOrWhisperModelResponsePtr = _lookup< + ffi.NativeFunction< + ffi.Void Function(ffi.Pointer)>>( + 'freeStatusOrWhisperModelResponse'); + late final _freeStatusOrWhisperModelResponse = + _freeStatusOrWhisperModelResponsePtr.asFunction< + void Function(ffi.Pointer)>(); + void 
freeStatusOrDevices( ffi.Pointer status, ) { @@ -618,7 +649,7 @@ class OpenVINO { late final _speechToTextVideoDuration = _speechToTextVideoDurationPtr .asFunction Function(CSpeechToText)>(); - ffi.Pointer speechToTextTranscribe( + ffi.Pointer speechToTextTranscribe( CSpeechToText instance, int start, int duration, @@ -634,10 +665,13 @@ class OpenVINO { late final _speechToTextTranscribePtr = _lookup< ffi.NativeFunction< - ffi.Pointer Function(CSpeechToText, ffi.Int, - ffi.Int, ffi.Pointer)>>('speechToTextTranscribe'); + ffi.Pointer Function( + CSpeechToText, + ffi.Int, + ffi.Int, + ffi.Pointer)>>('speechToTextTranscribe'); late final _speechToTextTranscribe = _speechToTextTranscribePtr.asFunction< - ffi.Pointer Function( + ffi.Pointer Function( CSpeechToText, int, int, ffi.Pointer)>(); ffi.Pointer getAvailableDevices() { @@ -744,6 +778,16 @@ final class Device extends ffi.Struct { external ffi.Pointer name; } +final class TranscriptionChunk extends ffi.Struct { + @ffi.Float() + external double start_ts; + + @ffi.Float() + external double end_ts; + + external ffi.Pointer text; +} + final class Status extends ffi.Struct { @ffi.Int() external int status; @@ -835,6 +879,22 @@ final class StatusOrModelResponse extends ffi.Struct { external ffi.Pointer value; } +final class StatusOrWhisperModelResponse extends ffi.Struct { + @ffi.Int() + external int status; + + external ffi.Pointer message; + + external Metrics metrics; + + external ffi.Pointer value; + + @ffi.Int() + external int size; + + external ffi.Pointer text; +} + final class StatusOrDevices extends ffi.Struct { @ffi.Int() external int status; diff --git a/lib/interop/openvino_bindings.dart b/lib/interop/openvino_bindings.dart index e11cc93..defe977 100644 --- a/lib/interop/openvino_bindings.dart +++ b/lib/interop/openvino_bindings.dart @@ -18,6 +18,20 @@ class SerializationOutput { } +class Chunk { + final double start; + final double end; + final String text; + const Chunk(this.start, this.end, this.text); +} 
+ +class TranscriptionModelResponse { + final List chunks; + final Metrics metrics; + final String text; + const TranscriptionModelResponse(this.chunks, this.metrics, this.text); +} + class ModelResponse { final String content; final Metrics metrics; diff --git a/lib/interop/speech_to_text.dart b/lib/interop/speech_to_text.dart index b81ed02..1e07f17 100644 --- a/lib/interop/speech_to_text.dart +++ b/lib/interop/speech_to_text.dart @@ -59,7 +59,7 @@ class SpeechToText { } } - Future transcribe(int start, int duration, String language) async{ + Future transcribe(int start, int duration, String language) async{ int instanceAddress = instance.ref.value.address; final result = await Isolate.run(() { final languagePtr = language.toNativeUtf8(); @@ -72,6 +72,18 @@ class SpeechToText { throw "SpeechToText LoadVideo error: ${result.ref.status} ${result.ref.message.toDartString()}"; } - return result.ref.value.toDartString(); + List chunks = []; + for (int i = 0; i < result.ref.size; i++) { + chunks.add(Chunk( + result.ref.value[i].start_ts, + result.ref.value[i].end_ts, + result.ref.value[i].text.toDartString() + )); + } + final metrics = result.ref.metrics; + final text = result.ref.text.toDartString(); + ov.freeStatusOrWhisperModelResponse(result); + + return TranscriptionModelResponse(chunks, metrics, text); } } diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart index 0b2d74c..4af04e3 100644 --- a/lib/pages/transcription/playground.dart +++ b/lib/pages/transcription/playground.dart @@ -5,13 +5,13 @@ import 'package:fluent_ui/fluent_ui.dart'; import 'package:inference/pages/computer_vision/widgets/model_properties.dart'; import 'package:inference/pages/models/widgets/grid_container.dart'; import 'package:inference/pages/transcription/widgets/subtitles.dart'; +import 'package:inference/pages/transcription/widgets/transcription.dart'; import 'package:inference/project.dart'; import 
'package:inference/pages/transcription/providers/speech_inference_provider.dart'; import 'package:inference/theme_fluent.dart'; import 'package:inference/utils/drop_area.dart'; import 'package:inference/widgets/controls/no_outline_button.dart'; import 'package:inference/widgets/device_selector.dart'; -import 'package:intl/date_symbol_data_local.dart'; import 'package:media_kit/media_kit.dart'; import 'package:media_kit_video/media_kit_video.dart'; import 'package:provider/provider.dart'; @@ -101,25 +101,42 @@ class _PlaygroundState extends State with TickerProviderStateMixin{ Consumer( builder: (context, inference, child) { return Expanded( - child: GridContainer( - color: backgroundColor.of(theme), - child: Builder( - builder: (context) { - return DropArea( - type: "video", - showChild: inference.videoLoaded, - onUpload: (String file) { uploadFile(file); }, - extensions: const [], - child: Stack( - alignment: Alignment.bottomCenter, - children: [ - Video(controller: controller), - Subtitles(transcription: inference.transcription, subtitleIndex: subtitleIndex), - ] - ), - ); - } - ), + child: Builder( + builder: (context) { + return DropArea( + type: "video", + showChild: inference.videoLoaded, + onUpload: (String file) { uploadFile(file); }, + extensions: const [], + child: Row( + crossAxisAlignment: CrossAxisAlignment.stretch, + children: [ + Expanded( + child: GridContainer( + color: backgroundColor.of(theme), + child: Stack( + alignment: Alignment.bottomCenter, + children: [ + Video(controller: controller), + Subtitles(transcription: inference.transcription, subtitleIndex: subtitleIndex), + ] + ), + ), + ), + SizedBox( + width: 312, + child: GridContainer( + color: backgroundColor.of(theme), + child: Transcription( + onSeek: player.seek, + transcription: inference.transcription + ), + ), + ) + ], + ), + ); + } ), ); } @@ -132,3 +149,4 @@ class _PlaygroundState extends State with TickerProviderStateMixin{ ); } } + diff --git 
a/lib/pages/transcription/providers/speech_inference_provider.dart b/lib/pages/transcription/providers/speech_inference_provider.dart index 9f658fe..2302b34 100644 --- a/lib/pages/transcription/providers/speech_inference_provider.dart +++ b/lib/pages/transcription/providers/speech_inference_provider.dart @@ -1,6 +1,7 @@ import 'dart:async'; import 'package:flutter/material.dart'; +import 'package:inference/interop/openvino_bindings.dart'; import 'package:inference/interop/speech_to_text.dart'; import 'package:inference/pages/transcription/utils/section.dart'; import 'package:inference/project.dart'; @@ -19,8 +20,8 @@ class SpeechInferenceProvider extends ChangeNotifier { bool get videoLoaded => _videoPath != null; - DynamicRangeLoading>? _transcription; - Map>? get transcription => _transcription?.data; + DynamicRangeLoading>? _transcription; + Map>? get transcription => _transcription?.data; String _language = ""; @@ -54,7 +55,7 @@ class SpeechInferenceProvider extends ChangeNotifier { _videoPath = path; final duration = await _inference!.loadVideo(path); final sections = (duration / transcriptionPeriod).ceil(); - _transcription = DynamicRangeLoading>(Section(0, sections)); + _transcription = DynamicRangeLoading>(Section(0, sections)); notifyListeners(); } @@ -76,7 +77,7 @@ class SpeechInferenceProvider extends ChangeNotifier { } } - Future transcribe(int start, int duration) async { + Future transcribe(int start, int duration) async { await loaded.future; return await _inference!.transcribe(start, duration, _language); } diff --git a/lib/pages/transcription/utils/message.dart b/lib/pages/transcription/utils/message.dart new file mode 100644 index 0000000..9568732 --- /dev/null +++ b/lib/pages/transcription/utils/message.dart @@ -0,0 +1,38 @@ +import 'dart:async'; + +import 'package:inference/interop/openvino_bindings.dart'; + +class Message { + String message; + final Duration position; + + Message(this.message, this.position); + + static List parse(Map> 
transcriptions, int indexDuration) { + final indices = transcriptions.keys.toList()..sort(); + if (indices.isEmpty) { + return []; + } + + List output = []; + + bool lastChunkIsOpenEnded = false; + + for (int i in indices) { + if (transcriptions[i] is Future) { + continue; + } + final part = transcriptions[i] as TranscriptionModelResponse; + for (final chunk in part.chunks) { + String text = chunk.text; + if (lastChunkIsOpenEnded) { + output.last.message += text; + } else { + output.add(Message(text.substring(1), Duration(seconds: chunk.start.toInt()))); + } + lastChunkIsOpenEnded = text[text.length - 1] != "."; + } + } + return output; + } +} diff --git a/lib/pages/transcription/widgets/subtitles.dart b/lib/pages/transcription/widgets/subtitles.dart index 21c609b..9971c9a 100644 --- a/lib/pages/transcription/widgets/subtitles.dart +++ b/lib/pages/transcription/widgets/subtitles.dart @@ -1,6 +1,7 @@ import 'dart:async'; import 'package:fluent_ui/fluent_ui.dart'; +import 'package:inference/interop/openvino_bindings.dart'; class Subtitles extends StatelessWidget { const Subtitles({ @@ -9,7 +10,7 @@ class Subtitles extends StatelessWidget { required this.subtitleIndex, }); - final Map>? transcription; + final Map>? 
transcription; final int subtitleIndex; static const double fontSize = 18; @@ -25,12 +26,12 @@ class Subtitles extends StatelessWidget { if (transcription == null ) { return Container(); } - if (transcription![subtitleIndex] is String) { + if (transcription![subtitleIndex] is TranscriptionModelResponse) { + final text = (transcription![subtitleIndex] as TranscriptionModelResponse).text; return Stack( alignment: Alignment.bottomCenter, children: [ - Text( - transcription![subtitleIndex] as String, + Text(text, textAlign: TextAlign.center, style: TextStyle( fontSize: fontSize, @@ -40,8 +41,7 @@ class Subtitles extends StatelessWidget { ..color = Colors.black, ) ), - Text( - transcription![subtitleIndex] as String, + Text(text, textAlign: TextAlign.center, style: const TextStyle( fontSize: fontSize diff --git a/lib/pages/transcription/widgets/transcription.dart b/lib/pages/transcription/widgets/transcription.dart new file mode 100644 index 0000000..f521fb8 --- /dev/null +++ b/lib/pages/transcription/widgets/transcription.dart @@ -0,0 +1,104 @@ +import 'dart:async'; + +import 'package:fluent_ui/fluent_ui.dart'; +import 'package:inference/interop/openvino_bindings.dart'; +import 'package:inference/pages/transcription/utils/message.dart'; +import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; +import 'package:inference/theme_fluent.dart'; + +String formatDuration(int totalSeconds) { + final duration = Duration(seconds: totalSeconds); + final minutes = duration.inMinutes; + final seconds = totalSeconds % 60; + + final minutesString = '$minutes'.padLeft(2, '0'); + final secondsString = '$seconds'.padLeft(2, '0'); + return '$minutesString:$secondsString'; +} + + + +class Transcription extends StatelessWidget { + final Map>? transcription; + final Function(Duration)? 
onSeek; + const Transcription({super.key, this.transcription, this.onSeek}); + + @override + Widget build(BuildContext context) { + if (transcription == null) { + return Container(); + } + + final messages = Message.parse(transcription!, transcriptionPeriod); + + return SingleChildScrollView( + child: Padding( + padding: const EdgeInsets.symmetric(horizontal: 8), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + for (final message in messages) + TranscriptionMessage(message: message, onSeek: onSeek) + ], + ), + ), + ); + } +} + +class TranscriptionMessage extends StatefulWidget { + final Function(Duration)? onSeek; + final Message message; + + const TranscriptionMessage({super.key, required this.message, this.onSeek}); + + @override + State createState() => _TranscriptionMessageState(); +} + +class _TranscriptionMessageState extends State { + bool hover = false; + + @override + Widget build(BuildContext context) { + final theme = FluentTheme.of(context); + return MouseRegion( + onEnter: (_) { + setState(() => hover = true); + }, + onExit: (_) { + setState(() => hover = false); + }, + child: GestureDetector( + onTap: () { + widget.onSeek?.call(widget.message.position); + }, + child: Padding( + padding: const EdgeInsets.symmetric(vertical: 20), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + Align( + alignment: Alignment.bottomRight, + child: Text(formatDuration(widget.message.position.inSeconds), + style: TextStyle( + fontSize: 9, + color: subtleTextColor.of(theme), + ) + ) + ), + Container( + decoration: BoxDecoration( + color: hover ? 
subtleTextColor.of(theme).withOpacity(0.3) : null, + borderRadius: const BorderRadius.all(Radius.circular(4)), + ), + padding: const EdgeInsets.symmetric(horizontal: 4, vertical: 2), + child: Text(widget.message.message) + ), + ], + ), + ), + ), + ); + } +} diff --git a/openvino_bindings/README.md b/openvino_bindings/README.md index 952b68b..d6f8c21 100644 --- a/openvino_bindings/README.md +++ b/openvino_bindings/README.md @@ -108,7 +108,7 @@ The DLLs (with dependencies) will be in `bazel-bin/windows_bindings.tar` [Install OpenVINO Runtime 24.5.0](https://docs.openvino.ai/2024/get-started/install-openvino.html?PACKAGE=OPENVINO_GENAI&VERSION=v_2024_4_0&OP_SYSTEM=MACOS&DISTRIBUTION=ARCHIVE) with GenAI flavor in `/opt/intel/openvino_24.5.0` and symlink to `/opt/intel/openvino`. Install OpenCV: `brew install opencv` -Install ffmpeg: `brew install ffmpeg@6` +Install ffmpeg: `brew install ffmpeg@6 && brew link ffmpeg@6` Run: `bazel build :macos_bindings` diff --git a/openvino_bindings/WORKSPACE b/openvino_bindings/WORKSPACE index 1740965..6a1707a 100644 --- a/openvino_bindings/WORKSPACE +++ b/openvino_bindings/WORKSPACE @@ -115,7 +115,7 @@ git_repository( new_local_repository( name = "mac_ffmpeg", build_file = "//third_party/ffmpeg:mac.BUILD", - path = "/opt/homebrew/Cellar/ffmpeg@6/6.1.2_3", + path = "/opt/homebrew/opt/ffmpeg@6", ) # #new_local_repository( diff --git a/openvino_bindings/src/audio/speech_to_text.cc b/openvino_bindings/src/audio/speech_to_text.cc index e39cb4d..4a2e101 100644 --- a/openvino_bindings/src/audio/speech_to_text.cc +++ b/openvino_bindings/src/audio/speech_to_text.cc @@ -8,7 +8,7 @@ void SpeechToText::load_video(std::string video_path) { audio_grabber = std::make_unique(video_path); } -ov::genai::DecodedResults SpeechToText::transcribe(int start, int duration, std::string language) { +ov::genai::WhisperDecodedResults SpeechToText::transcribe(int start, int duration, std::string language) { auto video_duration = audio_grabber->get_duration(); if 
(start > video_duration) { throw api_error(SpeechToTextChunkOutOfBounds); @@ -23,6 +23,7 @@ ov::genai::DecodedResults SpeechToText::transcribe(int start, int duration, std: if (data.empty()) { throw api_error(SpeechToTextChunkHasNoData); } + config.return_timestamps = true; config.max_new_tokens = 100; if (!language.empty()){ config.language = language; diff --git a/openvino_bindings/src/audio/speech_to_text.h b/openvino_bindings/src/audio/speech_to_text.h index c0c7c1e..f119ca7 100644 --- a/openvino_bindings/src/audio/speech_to_text.h +++ b/openvino_bindings/src/audio/speech_to_text.h @@ -1,6 +1,7 @@ #ifndef SPEECH_TO_TEXT_H_ #define SPEECH_TO_TEXT_H_ + #include #include "openvino/genai/whisper_pipeline.hpp" #include "audio_grabber.h" @@ -14,7 +15,7 @@ class SpeechToText { SpeechToText(std::string model_path, std::string device): pipe(model_path, device), config(model_path + "/generation_config.json") {} void load_video(std::string video_path); int64_t video_duration(); - ov::genai::DecodedResults transcribe(int start, int duration, std::string language); + ov::genai::WhisperDecodedResults transcribe(int start, int duration, std::string language); }; diff --git a/openvino_bindings/src/bindings.cc b/openvino_bindings/src/bindings.cc index 04b0efa..89ee853 100644 --- a/openvino_bindings/src/bindings.cc +++ b/openvino_bindings/src/bindings.cc @@ -39,6 +39,19 @@ void freeStatusOrImageInference(StatusOrString *status) { delete status; } +void freeStatusOrModelResponse(StatusOrModelResponse *status) { + //std::cout << "Freeing StatusOrModelResponse" << std::endl; + delete status; +} + +void freeStatusOrWhisperModelResponse(StatusOrWhisperModelResponse *status) { + if (status->status == StatusEnum::OkStatus) { + delete [] status->value; + status->value = NULL; // Prevent dangling pointers + } + delete status; +} + void freeStatusOrDevices(StatusOrDevices *status) { if (status->status == StatusEnum::OkStatus) { delete [] status->value; @@ -321,15 +334,21 @@ StatusOrInt*
speechToTextVideoDuration(CSpeechToText instance) { } } -StatusOrModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language) { +StatusOrWhisperModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language) { try { auto object = reinterpret_cast(instance); - auto result = object->transcribe(start, duration, language); - std::string text = result; - return new StatusOrModelResponse{OkStatus, "", convertToMetricsStruct(result.perf_metrics), strdup(text.c_str())}; + auto transcription_result = object->transcribe(start, duration, language); + auto chunks = transcription_result.chunks.value(); + std::string text = transcription_result; + TranscriptionChunk* result = new TranscriptionChunk[chunks.size()]; + for (int i = 0; i < chunks.size(); i++) { + auto r = chunks[i]; + result[i] = TranscriptionChunk{r.start_ts + start, r.end_ts + start, strdup(r.text.c_str())}; + } + return new StatusOrWhisperModelResponse{OkStatus, "", convertToMetricsStruct(transcription_result.perf_metrics), result, (int)chunks.size(), strdup(text.c_str())}; } catch (...) 
{ auto except = handle_exceptions(); - return new StatusOrModelResponse{except->status, except->message}; + return new StatusOrWhisperModelResponse{except->status, except->message}; } } diff --git a/openvino_bindings/src/bindings.h b/openvino_bindings/src/bindings.h index 4528916..1f27f62 100644 --- a/openvino_bindings/src/bindings.h +++ b/openvino_bindings/src/bindings.h @@ -26,6 +26,11 @@ typedef struct { const char* name; } Device; +typedef struct { + float start_ts; + float end_ts; + const char* text; +} TranscriptionChunk; typedef struct { enum StatusEnum status; @@ -81,6 +86,15 @@ typedef struct { const char* value; } StatusOrModelResponse; +typedef struct { + enum StatusEnum status; + const char* message; + Metrics metrics; + TranscriptionChunk* value; + int size; + const char* text; +} StatusOrWhisperModelResponse; + typedef struct { enum StatusEnum status; const char* message; @@ -96,6 +110,8 @@ EXPORT void freeStatusOrString(StatusOrString *status); EXPORT void freeStatusOrImageInference(StatusOrImageInference *status); EXPORT void freeStatusOrLLMInference(StatusOrLLMInference *status); EXPORT void freeStatusOrSpeechToText(StatusOrSpeechToText *status); +EXPORT void freeStatusOrModelResponse(StatusOrModelResponse *status); +EXPORT void freeStatusOrWhisperModelResponse(StatusOrWhisperModelResponse *status); EXPORT void freeStatusOrDevices(StatusOrDevices *status); EXPORT StatusOrImageInference* imageInferenceOpen(const char* model_path, const char* task, const char* device, const char* label_definitions_json); @@ -126,7 +142,7 @@ EXPORT Status* graphRunnerStop(CGraphRunner instance); EXPORT StatusOrSpeechToText* speechToTextOpen(const char* model_path, const char* device); EXPORT Status* speechToTextLoadVideo(CSpeechToText instance, const char* video_path); EXPORT StatusOrInt* speechToTextVideoDuration(CSpeechToText instance); -EXPORT StatusOrModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language); 
+EXPORT StatusOrWhisperModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language); EXPORT StatusOrDevices* getAvailableDevices(); Status* handle_exceptions(); From c77fe809254be7db445a1f421428c494ea9a3d63 Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 14:16:42 +0100 Subject: [PATCH 04/17] Add download button and stub search bar to transcription --- lib/interop/speech_to_text.dart | 2 - lib/pages/transcription/playground.dart | 9 +- .../providers/speech_inference_provider.dart | 19 ++-- .../transcription/widgets/transcription.dart | 86 +++++++++++++++---- lib/widgets/controls/search_bar.dart | 2 +- 5 files changed, 89 insertions(+), 29 deletions(-) diff --git a/lib/interop/speech_to_text.dart b/lib/interop/speech_to_text.dart index 1e07f17..4f57cb2 100644 --- a/lib/interop/speech_to_text.dart +++ b/lib/interop/speech_to_text.dart @@ -9,8 +9,6 @@ final ov = getBindings(); class SpeechToText { final Pointer instance; - - SpeechToText(this.instance); static Future init(String modelPath, String device) async { diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart index 4af04e3..b1f8b54 100644 --- a/lib/pages/transcription/playground.dart +++ b/lib/pages/transcription/playground.dart @@ -118,18 +118,21 @@ class _PlaygroundState extends State with TickerProviderStateMixin{ alignment: Alignment.bottomCenter, children: [ Video(controller: controller), - Subtitles(transcription: inference.transcription, subtitleIndex: subtitleIndex), + Subtitles( + transcription: inference.transcription?.data, + subtitleIndex: subtitleIndex, + ), ] ), ), ), SizedBox( - width: 312, + width: 360, child: GridContainer( color: backgroundColor.of(theme), child: Transcription( onSeek: player.seek, - transcription: inference.transcription + transcription: inference.transcription, ), ), ) diff --git a/lib/pages/transcription/providers/speech_inference_provider.dart 
b/lib/pages/transcription/providers/speech_inference_provider.dart index 2302b34..8e574f2 100644 --- a/lib/pages/transcription/providers/speech_inference_provider.dart +++ b/lib/pages/transcription/providers/speech_inference_provider.dart @@ -20,8 +20,11 @@ class SpeechInferenceProvider extends ChangeNotifier { bool get videoLoaded => _videoPath != null; - DynamicRangeLoading>? _transcription; - Map>? get transcription => _transcription?.data; + DynamicRangeLoading>? transcription; + + bool get transcriptionComplete { + return transcription?.complete ?? false; + } String _language = ""; @@ -47,7 +50,7 @@ class SpeechInferenceProvider extends ChangeNotifier { } void skipTo(int index) { - _transcription!.skipTo(index); + transcription!.skipTo(index); } Future loadVideo(String path) async { @@ -55,20 +58,20 @@ class SpeechInferenceProvider extends ChangeNotifier { _videoPath = path; final duration = await _inference!.loadVideo(path); final sections = (duration / transcriptionPeriod).ceil(); - _transcription = DynamicRangeLoading>(Section(0, sections)); + transcription = DynamicRangeLoading>(Section(0, sections)); notifyListeners(); } Future startTranscribing() async { - if (_transcription == null) { + if (transcription == null) { throw Exception("Can't transcribe before loading video"); } - while (!_transcription!.complete) { - if (_transcription == null) { + while (!transcription!.complete) { + if (transcription == null) { return; } - await _transcription!.process((int i) { + await transcription!.process((int i) { return transcribe(i * transcriptionPeriod, transcriptionPeriod); }); if (hasListeners) { diff --git a/lib/pages/transcription/widgets/transcription.dart b/lib/pages/transcription/widgets/transcription.dart index f521fb8..9301aa1 100644 --- a/lib/pages/transcription/widgets/transcription.dart +++ b/lib/pages/transcription/widgets/transcription.dart @@ -1,10 +1,14 @@ import 'dart:async'; +import 'dart:io'; +import 'package:file_picker/file_picker.dart'; 
import 'package:fluent_ui/fluent_ui.dart'; import 'package:inference/interop/openvino_bindings.dart'; import 'package:inference/pages/transcription/utils/message.dart'; import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; +import 'package:inference/pages/transcription/utils/section.dart'; import 'package:inference/theme_fluent.dart'; +import 'package:inference/widgets/controls/search_bar.dart'; String formatDuration(int totalSeconds) { final duration = Duration(seconds: totalSeconds); @@ -19,9 +23,30 @@ String formatDuration(int totalSeconds) { class Transcription extends StatelessWidget { - final Map>? transcription; + final DynamicRangeLoading>? transcription; final Function(Duration)? onSeek; - const Transcription({super.key, this.transcription, this.onSeek}); + const Transcription({super.key, this.onSeek, this.transcription}); + + void saveTranscript() async { + final file = await FilePicker.platform.saveFile( + dialogTitle: "Please select an output file:", + fileName: "transcription.txt", + ); + if (file == null){ + return; + } + + String contents = ""; + final indices = transcription!.data.keys.toList()..sort(); + for (int i in indices) { + final part = transcription!.data[i] as TranscriptionModelResponse; + for (final chunk in part.chunks) { + contents += chunk.text; + } + } + + await File(file).writeAsString(contents); + } @override Widget build(BuildContext context) { @@ -29,19 +54,50 @@ class Transcription extends StatelessWidget { return Container(); } - final messages = Message.parse(transcription!, transcriptionPeriod); - - return SingleChildScrollView( - child: Padding( - padding: const EdgeInsets.symmetric(horizontal: 8), - child: Column( - crossAxisAlignment: CrossAxisAlignment.start, - children: [ - for (final message in messages) - TranscriptionMessage(message: message, onSeek: onSeek) - ], + final messages = Message.parse(transcription!.data, transcriptionPeriod); + + return Column( + children: [ + Padding( + 
padding: const EdgeInsets.symmetric(vertical: 25, horizontal: 14), + child: Row( + children: [ + SearchBar(onChange: (p) {}, placeholder: "Search in transcript",), + Padding( + padding: const EdgeInsets.only(left: 8.0), + child: Tooltip( + message: transcription!.complete + ? "Download transcript" + : "Transcribing...", + child: Button( + onPressed: transcription!.complete + ? () => saveTranscript() + : null, + child: const Padding( + padding: EdgeInsets.symmetric(vertical: 2), + child: Icon(FluentIcons.download), + ), + ), + ), + ) + ], + ), ), - ), + Expanded( + child: SingleChildScrollView( + child: Padding( + padding: const EdgeInsets.only(left: 10, right: 18), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + for (final message in messages) + TranscriptionMessage(message: message, onSeek: onSeek) + ], + ), + ), + ), + ), + ], ); } } @@ -74,7 +130,7 @@ class _TranscriptionMessageState extends State { widget.onSeek?.call(widget.message.position); }, child: Padding( - padding: const EdgeInsets.symmetric(vertical: 20), + padding: const EdgeInsets.symmetric(vertical: 20, horizontal: 4), child: Column( crossAxisAlignment: CrossAxisAlignment.start, children: [ diff --git a/lib/widgets/controls/search_bar.dart b/lib/widgets/controls/search_bar.dart index 7142406..7b3e6a7 100644 --- a/lib/widgets/controls/search_bar.dart +++ b/lib/widgets/controls/search_bar.dart @@ -50,4 +50,4 @@ class _SearchBarState extends State { ), ); } -} \ No newline at end of file +} From 960de3d378488fb6fda660692dce55b35e85850c Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 16:16:15 +0100 Subject: [PATCH 05/17] Implement search No tests and implementation is a bit ugly. But it works. 
--- lib/pages/transcription/playground.dart | 15 +- lib/pages/transcription/utils/section.dart | 3 +- .../transcription/widgets/paragraph.dart | 95 +++++++++++ .../transcription/widgets/transcription.dart | 155 ++++++++---------- 4 files changed, 177 insertions(+), 91 deletions(-) create mode 100644 lib/pages/transcription/widgets/paragraph.dart diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart index b1f8b54..e9b7672 100644 --- a/lib/pages/transcription/playground.dart +++ b/lib/pages/transcription/playground.dart @@ -6,6 +6,7 @@ import 'package:inference/pages/computer_vision/widgets/model_properties.dart'; import 'package:inference/pages/models/widgets/grid_container.dart'; import 'package:inference/pages/transcription/widgets/subtitles.dart'; import 'package:inference/pages/transcription/widgets/transcription.dart'; +import 'package:inference/pages/transcription/utils/message.dart'; import 'package:inference/project.dart'; import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; import 'package:inference/theme_fluent.dart'; @@ -130,9 +131,17 @@ class _PlaygroundState extends State with TickerProviderStateMixin{ width: 360, child: GridContainer( color: backgroundColor.of(theme), - child: Transcription( - onSeek: player.seek, - transcription: inference.transcription, + child: Builder( + builder: (context) { + if (inference.transcription == null) { + return Container(); + } + return Transcription( + onSeek: player.seek, + transcription: inference.transcription!, + messages: Message.parse(inference.transcription!.data, transcriptionPeriod), + ); + } ), ), ) diff --git a/lib/pages/transcription/utils/section.dart b/lib/pages/transcription/utils/section.dart index 27ede1d..5c731b1 100644 --- a/lib/pages/transcription/utils/section.dart +++ b/lib/pages/transcription/utils/section.dart @@ -10,9 +10,10 @@ void moveToEnd(List list, I item) { class DynamicRangeLoading { List
sections = []; + int? size; Map data = {}; - DynamicRangeLoading(Section section): sections = [section]; + DynamicRangeLoading(Section section): sections = [section], size = section.end; Section get activeSection => sections.first; diff --git a/lib/pages/transcription/widgets/paragraph.dart b/lib/pages/transcription/widgets/paragraph.dart new file mode 100644 index 0000000..1189580 --- /dev/null +++ b/lib/pages/transcription/widgets/paragraph.dart @@ -0,0 +1,95 @@ + +import 'package:fluent_ui/fluent_ui.dart'; +import 'package:inference/theme_fluent.dart'; +import '../utils/message.dart'; + +String formatDuration(int totalSeconds) { + final duration = Duration(seconds: totalSeconds); + final minutes = duration.inMinutes; + final seconds = totalSeconds % 60; + + final minutesString = '$minutes'.padLeft(2, '0'); + final secondsString = '$seconds'.padLeft(2, '0'); + return '$minutesString:$secondsString'; +} + +class Paragraph extends StatefulWidget { + final Function(Duration)? onSeek; + final Message message; + final String? 
highlightedText; + + const Paragraph({super.key, required this.message, this.onSeek, this.highlightedText}); + + @override + State createState() => _ParagraphState(); +} + +class _ParagraphState extends State { + bool hover = false; + + @override + Widget build(BuildContext context) { + final theme = FluentTheme.of(context); + List pieces = []; + if (widget.highlightedText != null) { + final pattern = RegExp(widget.highlightedText!, caseSensitive: false); + final sections = widget.message.message.split(pattern); + if (sections.isNotEmpty) { + pieces.add(TextSpan(text: sections.first)); + for (int i = 1; i < sections.length; i++) { + pieces.add( + TextSpan( + text: widget.highlightedText!, + style: TextStyle(backgroundColor: theme.accentColor), + ) + ); + pieces.add(TextSpan(text: sections[i])); + } + } + } else { + pieces.add(TextSpan(text: widget.message.message)); + } + return MouseRegion( + onEnter: (_) { + setState(() => hover = true); + }, + onExit: (_) { + setState(() => hover = false); + }, + child: GestureDetector( + onTap: () { + widget.onSeek?.call(widget.message.position); + }, + child: Padding( + padding: const EdgeInsets.symmetric(vertical: 20, horizontal: 4), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + Align( + alignment: Alignment.bottomRight, + child: Text(formatDuration(widget.message.position.inSeconds), + style: TextStyle( + fontSize: 9, + color: subtleTextColor.of(theme), + ) + ) + ), + Container( + decoration: BoxDecoration( + color: hover ? 
subtleTextColor.of(theme).withOpacity(0.3) : null, + borderRadius: const BorderRadius.all(Radius.circular(4)), + ), + padding: const EdgeInsets.symmetric(horizontal: 4, vertical: 2), + child: RichText( + text: TextSpan( + children: pieces + ) + ) + ), + ], + ), + ), + ), + ); + } +} diff --git a/lib/pages/transcription/widgets/transcription.dart b/lib/pages/transcription/widgets/transcription.dart index 9301aa1..8c040da 100644 --- a/lib/pages/transcription/widgets/transcription.dart +++ b/lib/pages/transcription/widgets/transcription.dart @@ -5,27 +5,26 @@ import 'package:file_picker/file_picker.dart'; import 'package:fluent_ui/fluent_ui.dart'; import 'package:inference/interop/openvino_bindings.dart'; import 'package:inference/pages/transcription/utils/message.dart'; -import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; import 'package:inference/pages/transcription/utils/section.dart'; -import 'package:inference/theme_fluent.dart'; +import 'package:inference/pages/transcription/widgets/paragraph.dart'; import 'package:inference/widgets/controls/search_bar.dart'; -String formatDuration(int totalSeconds) { - final duration = Duration(seconds: totalSeconds); - final minutes = duration.inMinutes; - final seconds = totalSeconds % 60; - final minutesString = '$minutes'.padLeft(2, '0'); - final secondsString = '$seconds'.padLeft(2, '0'); - return '$minutesString:$secondsString'; -} - - - -class Transcription extends StatelessWidget { +class Transcription extends StatefulWidget { final DynamicRangeLoading>? transcription; final Function(Duration)? 
onSeek; - const Transcription({super.key, this.onSeek, this.transcription}); + final List messages; + const Transcription({super.key, this.onSeek, this.transcription, required this.messages}); + + @override + State createState() => _TranscriptionState(); +} + +class _TranscriptionState extends State { + final List _paragraphKeys = []; + final ScrollController _scrollController = ScrollController(); + final GlobalKey scrollKey = GlobalKey(); + String? searchText; void saveTranscript() async { final file = await FilePicker.platform.saveFile( @@ -37,9 +36,9 @@ class Transcription extends StatelessWidget { } String contents = ""; - final indices = transcription!.data.keys.toList()..sort(); + final indices = widget.transcription!.data.keys.toList()..sort(); for (int i in indices) { - final part = transcription!.data[i] as TranscriptionModelResponse; + final part = widget.transcription!.data[i] as TranscriptionModelResponse; for (final chunk in part.chunks) { contents += chunk.text; } @@ -48,29 +47,55 @@ class Transcription extends StatelessWidget { await File(file).writeAsString(contents); } - @override - Widget build(BuildContext context) { - if (transcription == null) { - return Container(); - } + void search(String text) { + setState(() { + searchText = text; + }); + + final pattern = RegExp(text, caseSensitive: false); + int? 
index; + for (int i = 0; i < widget.messages.length; i++) { + if (widget.messages[i].message.contains(pattern)) { + index = i; + break; + } - final messages = Message.parse(transcription!.data, transcriptionPeriod); + } + if (index != null){ + final context = _paragraphKeys[index].currentContext; + + if (context != null) { + final renderBox = context.findRenderObject() as RenderBox?; + if (renderBox != null) { + final position = renderBox.localToGlobal(Offset.zero, ancestor: scrollKey.currentContext?.findRenderObject()); + final offset = _scrollController.offset + position.dy; + _scrollController.animateTo( + offset, + duration: const Duration(milliseconds: 500), + curve: Curves.easeInOut, + ); + } + } + } + } + @override + Widget build(BuildContext context) { return Column( children: [ Padding( padding: const EdgeInsets.symmetric(vertical: 25, horizontal: 14), child: Row( children: [ - SearchBar(onChange: (p) {}, placeholder: "Search in transcript",), + SearchBar(onChange: search, placeholder: "Search in transcript",), Padding( padding: const EdgeInsets.only(left: 8.0), child: Tooltip( - message: transcription!.complete + message: widget.transcription!.complete ? "Download transcript" : "Transcribing...", child: Button( - onPressed: transcription!.complete + onPressed: widget.transcription?.complete ?? false ? () => saveTranscript() : null, child: const Padding( @@ -85,14 +110,27 @@ class Transcription extends StatelessWidget { ), Expanded( child: SingleChildScrollView( + key: scrollKey, + controller: _scrollController, child: Padding( padding: const EdgeInsets.only(left: 10, right: 18), child: Column( crossAxisAlignment: CrossAxisAlignment.start, - children: [ - for (final message in messages) - TranscriptionMessage(message: message, onSeek: onSeek) - ], + children: List.generate(widget.messages.length, (index) { + //Adjusting state in render is ugly. 
But might just work + if (_paragraphKeys.length <= index) { + print("length: ${_paragraphKeys.length}, index: $index"); + _paragraphKeys.add(GlobalKey()); + } + + return Paragraph( + key: _paragraphKeys[index], + message: widget.messages[index], + highlightedText: searchText, + onSeek: widget.onSeek, + ); + + }), ), ), ), @@ -101,60 +139,3 @@ class Transcription extends StatelessWidget { ); } } - -class TranscriptionMessage extends StatefulWidget { - final Function(Duration)? onSeek; - final Message message; - - const TranscriptionMessage({super.key, required this.message, this.onSeek}); - - @override - State createState() => _TranscriptionMessageState(); -} - -class _TranscriptionMessageState extends State { - bool hover = false; - - @override - Widget build(BuildContext context) { - final theme = FluentTheme.of(context); - return MouseRegion( - onEnter: (_) { - setState(() => hover = true); - }, - onExit: (_) { - setState(() => hover = false); - }, - child: GestureDetector( - onTap: () { - widget.onSeek?.call(widget.message.position); - }, - child: Padding( - padding: const EdgeInsets.symmetric(vertical: 20, horizontal: 4), - child: Column( - crossAxisAlignment: CrossAxisAlignment.start, - children: [ - Align( - alignment: Alignment.bottomRight, - child: Text(formatDuration(widget.message.position.inSeconds), - style: TextStyle( - fontSize: 9, - color: subtleTextColor.of(theme), - ) - ) - ), - Container( - decoration: BoxDecoration( - color: hover ? 
subtleTextColor.of(theme).withOpacity(0.3) : null, - borderRadius: const BorderRadius.all(Radius.circular(4)), - ), - padding: const EdgeInsets.symmetric(horizontal: 4, vertical: 2), - child: Text(widget.message.message) - ), - ], - ), - ), - ), - ); - } -} From 56ec80d5509ebf65678e75baa71a3829058025e9 Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 16:28:02 +0100 Subject: [PATCH 06/17] Explain ugly state change on build --- lib/pages/transcription/widgets/transcription.dart | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pages/transcription/widgets/transcription.dart b/lib/pages/transcription/widgets/transcription.dart index 8c040da..b02fcdc 100644 --- a/lib/pages/transcription/widgets/transcription.dart +++ b/lib/pages/transcription/widgets/transcription.dart @@ -117,9 +117,9 @@ class _TranscriptionState extends State { child: Column( crossAxisAlignment: CrossAxisAlignment.start, children: List.generate(widget.messages.length, (index) { - //Adjusting state in render is ugly. But might just work + // Adjusting state in render is ugly. But works. + // This is done because we need a global key but the paragraphs are added as you go. 
if (_paragraphKeys.length <= index) { - print("length: ${_paragraphKeys.length}, index: $index"); _paragraphKeys.add(GlobalKey()); } From 2c5db4afbbc55cbe286ae841ac3facddaf49f643 Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 16:34:11 +0100 Subject: [PATCH 07/17] Hide performance metrics page for now --- lib/pages/transcription/transcription.dart | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/pages/transcription/transcription.dart b/lib/pages/transcription/transcription.dart index 14353d9..0ea6f14 100644 --- a/lib/pages/transcription/transcription.dart +++ b/lib/pages/transcription/transcription.dart @@ -80,11 +80,11 @@ class _TranscriptionPageState extends State { title: const Text("Playground"), body: Playground(project: widget.project), ), - PaneItem( - icon: const Icon(FluentIcons.project_collection), - title: const Text("Performance metrics"), - body: Container(), - ), + //PaneItem( + // icon: const Icon(FluentIcons.project_collection), + // title: const Text("Performance metrics"), + // body: Container(), + //), ], ) ), From d4195a3a966511c54e2b5d1909463ec1c3107388 Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 16:43:07 +0100 Subject: [PATCH 08/17] Fix old drop area from being used --- lib/pages/transcription/playground.dart | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart index e9b7672..5577170 100644 --- a/lib/pages/transcription/playground.dart +++ b/lib/pages/transcription/playground.dart @@ -10,7 +10,7 @@ import 'package:inference/pages/transcription/utils/message.dart'; import 'package:inference/project.dart'; import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; import 'package:inference/theme_fluent.dart'; -import 'package:inference/utils/drop_area.dart'; +import 'package:inference/widgets/controls/drop_area.dart'; import 
'package:inference/widgets/controls/no_outline_button.dart'; import 'package:inference/widgets/device_selector.dart'; import 'package:media_kit/media_kit.dart'; From f1084d6123cccf1605a0e234587f38f1119711af Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 17:03:39 +0100 Subject: [PATCH 09/17] Fix color for paragraph in transcription --- lib/pages/transcription/widgets/paragraph.dart | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/pages/transcription/widgets/paragraph.dart b/lib/pages/transcription/widgets/paragraph.dart index 1189580..c6ca4f1 100644 --- a/lib/pages/transcription/widgets/paragraph.dart +++ b/lib/pages/transcription/widgets/paragraph.dart @@ -82,6 +82,9 @@ class _ParagraphState extends State { padding: const EdgeInsets.symmetric(horizontal: 4, vertical: 2), child: RichText( text: TextSpan( + style: TextStyle( + color: theme.inactiveColor + ), children: pieces ) ) From 60e398665d6d39a08d889895401fd0a41074c82c Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 17:05:17 +0100 Subject: [PATCH 10/17] Add dispose for video player --- lib/pages/transcription/playground.dart | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart index 5577170..344d00f 100644 --- a/lib/pages/transcription/playground.dart +++ b/lib/pages/transcription/playground.dart @@ -65,6 +65,12 @@ class _PlaygroundState extends State with TickerProviderStateMixin{ initializeVideoAndListeners(file); } + @override + void dispose() { + player.dispose(); + super.dispose(); + } + @override Widget build(BuildContext context) { final theme = FluentTheme.of(context); From 72de8efb51dba110406af094612147c97fdf977f Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 17:10:27 +0100 Subject: [PATCH 11/17] Reimplement section tests and fix test for computer vision --- .../computer_vision/computer_vision.dart | 1 - .../model_properties_test.dart | 4 +- 
.../transcriptions/utils/section_test.dart | 99 +++++++++++++++++ test/section_test.dart | 100 ------------------ 4 files changed, 101 insertions(+), 103 deletions(-) create mode 100644 test/pages/transcriptions/utils/section_test.dart delete mode 100644 test/section_test.dart diff --git a/lib/pages/computer_vision/computer_vision.dart b/lib/pages/computer_vision/computer_vision.dart index 8c7522c..715a270 100644 --- a/lib/pages/computer_vision/computer_vision.dart +++ b/lib/pages/computer_vision/computer_vision.dart @@ -2,7 +2,6 @@ import 'package:fluent_ui/fluent_ui.dart'; import 'package:go_router/go_router.dart'; import 'package:inference/pages/computer_vision/batch_inference.dart'; import 'package:inference/pages/computer_vision/live_inference.dart'; -import 'package:inference/pages/models/widgets/grid_container.dart'; import 'package:inference/project.dart'; import 'package:inference/providers/image_inference_provider.dart'; import 'package:inference/providers/preference_provider.dart'; diff --git a/test/pages/computer_vision/model_properties_test.dart b/test/pages/computer_vision/model_properties_test.dart index 70d1e6e..d034774 100644 --- a/test/pages/computer_vision/model_properties_test.dart +++ b/test/pages/computer_vision/model_properties_test.dart @@ -22,8 +22,8 @@ Widget testWidget(ImageInferenceProvider provider) { ), ], child: FluentApp( - home: const Center( - child: ModelProperties() + home: Center( + child: ModelProperties(project: provider.project) ), ), ); diff --git a/test/pages/transcriptions/utils/section_test.dart b/test/pages/transcriptions/utils/section_test.dart new file mode 100644 index 0000000..3452a62 --- /dev/null +++ b/test/pages/transcriptions/utils/section_test.dart @@ -0,0 +1,99 @@ +import 'package:flutter_test/flutter_test.dart'; +import 'package:inference/pages/transcription/utils/section.dart'; + +void main() { + group("Section", () { + group("process", () { + test("process sets values in data", () async { + final state = 
DynamicRangeLoading(Section(0, 10)); + for (int j = 0; j < 10; j++) { + await state.process((i) async { + return j; + }); + + expect(state.data[j], j); + } + }); + + test("process out of bounds throws error", () async { + final state = DynamicRangeLoading(Section(0, 10)); + for (int j = 0; j < 10; j++) { + await state.process((i) async { + return j; + }); + } + + expect(() async { + await state.process((i) async { + return 1; + }); + }, throwsException); + }); + + test("process continues after skip is done", () async { + final state = DynamicRangeLoading(Section(0, 10)); + state.skipTo(8); + for (int j = 0; j < 2; j++) { + await state.process((i) async { + return j; + }); + } + expect(state.getNextIndex(), 0); + }); + + }); + + test('getNextIndex throws error when state is complete', () { + final state = DynamicRangeLoading(Section(0, 0)); + expect(() { + state.getNextIndex(); + },throwsException); + }); + + test('complete', () async { + final state = DynamicRangeLoading(Section(0, 10)); + for (int j = 0; j < 10; j++) { + expect(state.complete, false); + await state.process((i) async { + return j; + }); + } + expect(state.complete, true); + }); + + group("skip", () { + test("skips to specific index", () async { + final state = DynamicRangeLoading(Section(0, 10)); + state.skipTo(5); + expect(state.getNextIndex(), 5); + expect(state.activeSection.begin, 5); + expect(state.activeSection.end, 10); + }); + + test("skips to partially complete section will go to end of that section ", () async { + final state = DynamicRangeLoading(Section(0, 10)); + + for (int j = 0; j < 8; j++) { + await state.process((i) async { + return j; + }); + } + state.skipTo(5); + expect(state.getNextIndex(), 8); + }); + + test("skips to fully complete section will not shift next index", () async { + final state = DynamicRangeLoading(Section(0, 10)); + state.skipTo(5); + + for (int j = 0; j < 5; j++) { + await state.process((i) async { + return j; + }); + } + state.skipTo(5); + 
expect(state.getNextIndex(), 0); + }); + }); + }); +} diff --git a/test/section_test.dart b/test/section_test.dart deleted file mode 100644 index 7aebd68..0000000 --- a/test/section_test.dart +++ /dev/null @@ -1,100 +0,0 @@ -import 'package:flutter_test/flutter_test.dart'; - -void main() { - /* - group("Section", () { - group("process", () { - // test("process sets values in data", () async { - // final state = DynamicRangeLoading(Section(0, 10)); - // for (int j = 0; j < 10; j++) { - // await state.process((i) async { - // return j; - // }); - - // expect(state.data[j], j); - // } - // }); - - // test("process out of bounds throws error", () async { - // final state = DynamicRangeLoading(Section(0, 10)); - // for (int j = 0; j < 10; j++) { - // await state.process((i) async { - // return j; - // }); - // } - - // expect(() async { - // await state.process((i) async { - // return 1; - // }); - // }, throwsException); - // }); - - // test("process continues after skip is done", () async { - // final state = DynamicRangeLoading(Section(0, 10)); - // state.skipTo(8); - // for (int j = 0; j < 2; j++) { - // await state.process((i) async { - // return j; - // }); - // } - // expect(state.getNextIndex(), 0); - // }); - - // }); - - // test('getNextIndex throws error when state is complete', () { - // final state = DynamicRangeLoading(Section(0, 0)); - // expect(() { - // state.getNextIndex(); - // },throwsException); - // }); - - // test('complete', () async { - // final state = DynamicRangeLoading(Section(0, 10)); - // for (int j = 0; j < 10; j++) { - // expect(state.complete, false); - // await state.process((i) async { - // return j; - // }); - // } - // expect(state.complete, true); - // }); - - // group("skip", () { - // test("skips to specific index", () async { - // final state = DynamicRangeLoading(Section(0, 10)); - // state.skipTo(5); - // expect(state.getNextIndex(), 5); - // expect(state.activeSection.begin, 5); - // expect(state.activeSection.end, 10); - // 
}); - - // test("skips to partially complete section will go to end of that section ", () async { - // final state = DynamicRangeLoading(Section(0, 10)); - - // for (int j = 0; j < 8; j++) { - // await state.process((i) async { - // return j; - // }); - // } - // state.skipTo(5); - // expect(state.getNextIndex(), 8); - // }); - - // test("skips to fully complete section will not shift next index", () async { - // final state = DynamicRangeLoading(Section(0, 10)); - // state.skipTo(5); - - // for (int j = 0; j < 5; j++) { - // await state.process((i) async { - // return j; - // }); - // } - // state.skipTo(5); - // expect(state.getNextIndex(), 0); - // }); - }); - }); - */ -} From bb635ac6498cd707cdd80b3a10b8284595fba07c Mon Sep 17 00:00:00 2001 From: Ronald Hecker Date: Thu, 21 Nov 2024 14:51:39 +0100 Subject: [PATCH 12/17] Lock ffmpeg in windows to 6.1.1 --- .github/workflows/windows-build.yaml | 3 ++- openvino_bindings/README.md | 6 +++++- openvino_bindings/WORKSPACE | 24 ++++++++++++------------ openvino_bindings/third_party/.gitignore | 1 + openvino_bindings/third_party/vcpkg.json | 10 ++++++++++ 5 files changed, 30 insertions(+), 14 deletions(-) create mode 100644 openvino_bindings/third_party/.gitignore create mode 100644 openvino_bindings/third_party/vcpkg.json diff --git a/.github/workflows/windows-build.yaml b/.github/workflows/windows-build.yaml index e0beccd..8b47c6d 100644 --- a/.github/workflows/windows-build.yaml +++ b/.github/workflows/windows-build.yaml @@ -76,7 +76,8 @@ jobs: run: | git clone https://github.com/microsoft/vcpkg.git C:\vcpkg C:\vcpkg\bootstrap-vcpkg.bat - C:\vcpkg\vcpkg install ffmpeg + cd openvino_bindings/third_party + C:\vcpkg\vcpkg install shell: cmd # Step 10: Download and Install OpenVINO Runtime diff --git a/openvino_bindings/README.md b/openvino_bindings/README.md index d6f8c21..4715bef 100644 --- a/openvino_bindings/README.md +++ b/openvino_bindings/README.md @@ -95,7 +95,11 @@ A step by step guide can be found 
[here]('./docs/WINDOWS.md'). [Install OpenVINO Runtime 24.5.0]( https://docs.openvino.ai/2024/get-started/install-openvino.html?PACKAGE=OPENVINO_GENAI&VERSION=v_2024_4_0&OP_SYSTEM=WINDOWS&DISTRIBUTION=ARCHIVE) with GenAI flavor in `C:/Intel/openvino_24.5.0`. Build OpenCV in `C:/opencv/build`. -Install ffmpeg: `vcpkg install ffmpeg`. +Install ffmpeg: +```sh +cd openvino_bindings/third_party +vcpkg install +``` Install [mediapipe requirements](https://ai.google.dev/edge/mediapipe/framework/getting_started/install#installing_on_windows) and setup the environment variables. diff --git a/openvino_bindings/WORKSPACE b/openvino_bindings/WORKSPACE index 6a1707a..1238a9d 100644 --- a/openvino_bindings/WORKSPACE +++ b/openvino_bindings/WORKSPACE @@ -106,23 +106,23 @@ git_repository( tag = "v3.11.3", ) -#new_local_repository( -# name = "linux_ffmpeg", -# build_file = "//third_party/ffmpeg:linux.BUILD", -# path = "/usr" -#) -# +new_local_repository( + name = "linux_ffmpeg", + build_file = "//third_party/ffmpeg:linux.BUILD", + path = "/usr" +) + new_local_repository( name = "mac_ffmpeg", build_file = "//third_party/ffmpeg:mac.BUILD", path = "/opt/homebrew/opt/ffmpeg@6", ) -# -#new_local_repository( -# name = "windows_ffmpeg", -# build_file = "//third_party/ffmpeg:windows.BUILD", -# path = "C:/vcpkg/packages/ffmpeg_x64-windows", -#) + +new_local_repository( + name = "windows_ffmpeg", + build_file = "//third_party/ffmpeg:windows.BUILD", + path = "./third_party/vcpkg_installed/x64-windows", +) http_archive( name = "rules_pkg", diff --git a/openvino_bindings/third_party/.gitignore b/openvino_bindings/third_party/.gitignore new file mode 100644 index 0000000..8a1403e --- /dev/null +++ b/openvino_bindings/third_party/.gitignore @@ -0,0 +1 @@ +vcpkg_installed diff --git a/openvino_bindings/third_party/vcpkg.json b/openvino_bindings/third_party/vcpkg.json new file mode 100644 index 0000000..2497f47 --- /dev/null +++ b/openvino_bindings/third_party/vcpkg.json @@ -0,0 +1,10 @@ +{ + 
"name": "openvinotestdrivebindings", + "builtin-baseline": "c8582b4d83dbd36e1bebc08bf166b5eb807996b0", + "dependencies": [ + "ffmpeg" + ], + "overrides": [ + { "name": "ffmpeg", "version": "6.1.1" } + ] +} From bc223d4afb59a6e6b5fbd23241b2a2cfc997eb3c Mon Sep 17 00:00:00 2001 From: Ronald Hecker Date: Thu, 21 Nov 2024 15:00:27 +0100 Subject: [PATCH 13/17] fix windows build flutter error --- .github/workflows/windows-build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows-build.yaml b/.github/workflows/windows-build.yaml index 8b47c6d..70d7c3e 100644 --- a/.github/workflows/windows-build.yaml +++ b/.github/workflows/windows-build.yaml @@ -127,7 +127,7 @@ jobs: - uses: subosito/flutter-action@v2 with: channel: 'stable' - flutter-version: '3.24.0' + flutter-version: '3.24.5' - name: Install project dependencies run: flutter pub get - name: Generate intermediates From 3491c2a56dad2dba83ddd1985386dc786a2880b3 Mon Sep 17 00:00:00 2001 From: Ronald Hecker Date: Thu, 21 Nov 2024 15:17:05 +0100 Subject: [PATCH 14/17] fix ffmpeg vcpkg hopefully --- .github/workflows/windows-build.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/windows-build.yaml b/.github/workflows/windows-build.yaml index 70d7c3e..440b880 100644 --- a/.github/workflows/windows-build.yaml +++ b/.github/workflows/windows-build.yaml @@ -73,12 +73,12 @@ jobs: # Step 9: Install vcpkg and ffmpeg - name: Install vcpkg and ffmpeg + shell: powershell run: | - git clone https://github.com/microsoft/vcpkg.git C:\vcpkg + if (!(Test-Path "C:\vcpkg")) { git clone https://github.com/microsoft/vcpkg.git C:\vcpkg } C:\vcpkg\bootstrap-vcpkg.bat cd openvino_bindings/third_party C:\vcpkg\vcpkg install - shell: cmd # Step 10: Download and Install OpenVINO Runtime - name: Download and Install OpenVINO Runtime 24.5.0 From f58ea0cd71e66a93464493b7aff86fe9d0fa43ac Mon Sep 17 00:00:00 2001 From: Ronald Hecker Date: Fri, 22 Nov 2024 13:52:22 
+0100 Subject: [PATCH 15/17] Implement performance metrics page Added metrics for transcription page Fixes subtitles for light colorscheme --- .../transcription/performance_metrics.dart | 87 +++++++++++++++++++ lib/pages/transcription/playground.dart | 9 ++ .../providers/speech_inference_provider.dart | 16 +++- lib/pages/transcription/transcription.dart | 11 +-- lib/pages/transcription/utils/metrics.dart | 54 ++++++++++++ .../transcription/widgets/subtitles.dart | 3 +- lib/widgets/performance_tile.dart | 65 ++++++++++++++ 7 files changed, 238 insertions(+), 7 deletions(-) create mode 100644 lib/pages/transcription/performance_metrics.dart create mode 100644 lib/pages/transcription/utils/metrics.dart create mode 100644 lib/widgets/performance_tile.dart diff --git a/lib/pages/transcription/performance_metrics.dart b/lib/pages/transcription/performance_metrics.dart new file mode 100644 index 0000000..a12b554 --- /dev/null +++ b/lib/pages/transcription/performance_metrics.dart @@ -0,0 +1,87 @@ +import 'package:fluent_ui/fluent_ui.dart'; +import 'package:inference/pages/computer_vision/widgets/horizontal_rule.dart'; +import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; +import 'package:inference/widgets/performance_tile.dart'; +import 'package:intl/intl.dart'; +import 'package:provider/provider.dart'; + +class PerformanceMetrics extends StatelessWidget { + const PerformanceMetrics({super.key}); + + @override + Widget build(BuildContext context) { + return Consumer( + builder: (context, inference, child) { + final metrics = inference.metrics; + if (metrics == null) { + return Container(); + } + + Locale locale = Localizations.localeOf(context); + final nf = NumberFormat.decimalPatternDigits( + locale: locale.languageCode, decimalDigits: 0); + + return Padding( + padding: const EdgeInsets.symmetric(vertical: 80), + child: Center( + child: SizedBox( + width: 887, + child: Column( + children: [ + Row( + mainAxisAlignment: 
MainAxisAlignment.spaceEvenly, + children: [ + PerformanceTile( + title: "Time to first token (TTFT)", + value: nf.format(metrics.ttft), + unit: "ms", + tall: true, + ), + PerformanceTile( + title: "Time per output token (TPOT)", + value: nf.format(metrics.tpot), + unit: "ms", + tall: true, + ), + PerformanceTile( + title: "Generate total duration", + value: nf.format(metrics.generateTime), + unit: "ms", + tall: true, + ), + ], + ), + const Padding( + padding: EdgeInsets.symmetric(horizontal: 16.0, vertical: 16), + child: HorizontalRule(), + ), + Row( + mainAxisAlignment: MainAxisAlignment.spaceEvenly, + children: [ + PerformanceTile( + title: "Load time", + value: nf.format(metrics.loadTime), + unit: "ms", + ), + PerformanceTile( + title: "Detokenization duration", + value: nf.format(metrics.detokenizationTime), + unit: "ms", + ), + PerformanceTile( + title: "Throughput", + value: nf.format(metrics.throughput), + unit: "tokens/sec", + ), + ], + ), + ], + ), + ), + ), + ); + } + ); + } +} + diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart index 344d00f..5f3e03b 100644 --- a/lib/pages/transcription/playground.dart +++ b/lib/pages/transcription/playground.dart @@ -65,6 +65,15 @@ class _PlaygroundState extends State with TickerProviderStateMixin{ initializeVideoAndListeners(file); } + @override + void initState() { + super.initState(); + final inference = Provider.of(context, listen: false); + if (inference.videoPath != null) { + initializeVideoAndListeners(inference.videoPath!); + } + } + @override void dispose() { player.dispose(); diff --git a/lib/pages/transcription/providers/speech_inference_provider.dart b/lib/pages/transcription/providers/speech_inference_provider.dart index 8e574f2..fedbf72 100644 --- a/lib/pages/transcription/providers/speech_inference_provider.dart +++ b/lib/pages/transcription/providers/speech_inference_provider.dart @@ -3,9 +3,11 @@ import 'dart:async'; import 'package:flutter/material.dart'; 
import 'package:inference/interop/openvino_bindings.dart'; import 'package:inference/interop/speech_to_text.dart'; +import 'package:inference/pages/transcription/utils/metrics.dart'; import 'package:inference/pages/transcription/utils/section.dart'; import 'package:inference/project.dart'; + const transcriptionPeriod = 10; class SpeechInferenceProvider extends ChangeNotifier { @@ -21,6 +23,7 @@ class SpeechInferenceProvider extends ChangeNotifier { bool get videoLoaded => _videoPath != null; DynamicRangeLoading>? transcription; + DMetrics? metrics; bool get transcriptionComplete { return transcription?.complete ?? false; @@ -62,6 +65,15 @@ class SpeechInferenceProvider extends ChangeNotifier { notifyListeners(); } + void addMetrics(TranscriptionModelResponse response) { + if (metrics == null) { + metrics = DMetrics.fromCMetrics(response.metrics); + } else { + metrics!.addCMetrics(response.metrics); + } + notifyListeners(); + } + Future startTranscribing() async { if (transcription == null) { throw Exception("Can't transcribe before loading video"); @@ -72,7 +84,9 @@ class SpeechInferenceProvider extends ChangeNotifier { return; } await transcription!.process((int i) { - return transcribe(i * transcriptionPeriod, transcriptionPeriod); + final request = transcribe(i * transcriptionPeriod, transcriptionPeriod); + request.then(addMetrics); + return request; }); if (hasListeners) { notifyListeners(); diff --git a/lib/pages/transcription/transcription.dart b/lib/pages/transcription/transcription.dart index 0ea6f14..1d9fd00 100644 --- a/lib/pages/transcription/transcription.dart +++ b/lib/pages/transcription/transcription.dart @@ -3,6 +3,7 @@ import 'package:go_router/go_router.dart'; import 'package:inference/project.dart'; import 'package:inference/providers/preference_provider.dart'; import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; +import 'package:inference/pages/transcription/performance_metrics.dart'; import 
'package:inference/pages/transcription/playground.dart'; import 'package:provider/provider.dart'; @@ -80,11 +81,11 @@ class _TranscriptionPageState extends State { title: const Text("Playground"), body: Playground(project: widget.project), ), - //PaneItem( - // icon: const Icon(FluentIcons.project_collection), - // title: const Text("Performance metrics"), - // body: Container(), - //), + PaneItem( + icon: const Icon(FluentIcons.line_chart), + title: const Text("Performance metrics"), + body: const PerformanceMetrics(), + ), ], ) ), diff --git a/lib/pages/transcription/utils/metrics.dart b/lib/pages/transcription/utils/metrics.dart new file mode 100644 index 0000000..481c9f3 --- /dev/null +++ b/lib/pages/transcription/utils/metrics.dart @@ -0,0 +1,54 @@ +import 'package:inference/interop/generated_bindings.dart'; + +class DMetrics { + double loadTime; + double generateTime; + double tokenizationTime; + double detokenizationTime; + double ttft; + double tpot; + double throughput; + int numberOfGeneratedTokens; + int numberOfInputTokens; + + int n = 1; // number of added metrics + + DMetrics({ + required this.loadTime, + required this.generateTime, + required this.tokenizationTime, + required this.detokenizationTime, + required this.ttft, + required this.tpot, + required this.throughput, + required this.numberOfGeneratedTokens, + required this.numberOfInputTokens, + }); + + void addCMetrics(Metrics metrics) { + //loadTime = metrics.load_time; + generateTime += metrics.generate_time; + tokenizationTime += metrics.tokenization_time; + detokenizationTime += metrics.detokenization_time; + ttft = (ttft * (n / (n + 1))) + metrics.ttft / n; + tpot = (tpot * (n / (n + 1))) + metrics.tpot / n; + throughput = (throughput * (n / (n + 1))) + metrics.throughput / n; + numberOfGeneratedTokens += metrics.number_of_generated_tokens; + numberOfInputTokens += metrics.number_of_input_tokens; + n += 1; + } + + factory DMetrics.fromCMetrics(Metrics metrics) { + return DMetrics( + 
loadTime: metrics.load_time, + generateTime: metrics.generate_time, + tokenizationTime: metrics.tokenization_time, + detokenizationTime: metrics.detokenization_time, + ttft: metrics.ttft, + tpot: metrics.tpot, + throughput: metrics.throughput, + numberOfGeneratedTokens: metrics.number_of_generated_tokens, + numberOfInputTokens: metrics.number_of_input_tokens, + ); + } +} diff --git a/lib/pages/transcription/widgets/subtitles.dart b/lib/pages/transcription/widgets/subtitles.dart index 9971c9a..da17b0c 100644 --- a/lib/pages/transcription/widgets/subtitles.dart +++ b/lib/pages/transcription/widgets/subtitles.dart @@ -44,7 +44,8 @@ class Subtitles extends StatelessWidget { Text(text, textAlign: TextAlign.center, style: const TextStyle( - fontSize: fontSize + fontSize: fontSize, + color: Colors.white, ) ) ], diff --git a/lib/widgets/performance_tile.dart b/lib/widgets/performance_tile.dart new file mode 100644 index 0000000..6ab5dd0 --- /dev/null +++ b/lib/widgets/performance_tile.dart @@ -0,0 +1,65 @@ +import 'package:fluent_ui/fluent_ui.dart'; + +class PerformanceTile extends StatelessWidget { + final String title; + final String value; + final String unit; + final bool tall; + + const PerformanceTile({ + super.key, + required this.title, + required this.value, + required this.unit, + this.tall = false, + }); + + @override + Widget build(BuildContext context) { + final theme = FluentTheme.of(context); + return Padding( + padding: const EdgeInsets.all(8.0), + child: Acrylic( + elevation: 5, + shadowColor: Colors.black, + shape: RoundedRectangleBorder ( + borderRadius: BorderRadius.circular(4), + ), + child: SizedBox( + width: 268, + height: tall ? 
200 : 124, + child: Center( + child: Column( + mainAxisAlignment: MainAxisAlignment.center, + crossAxisAlignment: CrossAxisAlignment.center, + children: [ + Text( + title, + style: const TextStyle( + fontSize: 14, + ), + ), + RichText( + text: TextSpan( + style: TextStyle( + color: theme.inactiveColor, + ), + children: [ + TextSpan(text: value, + style: const TextStyle( + fontSize: 30, + ) + ), + TextSpan(text: " $unit"), + ] + ) + ), + ], + ) + ) + ), + ), + ); + } + +} From 6bddb727891a8b9fa28f217bbefc2d2564020b6c Mon Sep 17 00:00:00 2001 From: Ronald Hecker Date: Fri, 22 Nov 2024 13:58:09 +0100 Subject: [PATCH 16/17] Add model properties to performance metrics --- .../transcription/performance_metrics.dart | 147 ++++++++++-------- lib/pages/transcription/transcription.dart | 2 +- 2 files changed, 79 insertions(+), 70 deletions(-) diff --git a/lib/pages/transcription/performance_metrics.dart b/lib/pages/transcription/performance_metrics.dart index a12b554..e1795a1 100644 --- a/lib/pages/transcription/performance_metrics.dart +++ b/lib/pages/transcription/performance_metrics.dart @@ -1,87 +1,96 @@ import 'package:fluent_ui/fluent_ui.dart'; import 'package:inference/pages/computer_vision/widgets/horizontal_rule.dart'; +import 'package:inference/pages/computer_vision/widgets/model_properties.dart'; import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; +import 'package:inference/project.dart'; import 'package:inference/widgets/performance_tile.dart'; import 'package:intl/intl.dart'; import 'package:provider/provider.dart'; class PerformanceMetrics extends StatelessWidget { - const PerformanceMetrics({super.key}); + final Project project; + const PerformanceMetrics({super.key, required this.project}); @override Widget build(BuildContext context) { - return Consumer( - builder: (context, inference, child) { - final metrics = inference.metrics; - if (metrics == null) { - return Container(); - } + return Row( + children: [ + Expanded( + 
child: Consumer( + builder: (context, inference, child) { + final metrics = inference.metrics; + if (metrics == null) { + return Container(); + } - Locale locale = Localizations.localeOf(context); - final nf = NumberFormat.decimalPatternDigits( - locale: locale.languageCode, decimalDigits: 0); + Locale locale = Localizations.localeOf(context); + final nf = NumberFormat.decimalPatternDigits( + locale: locale.languageCode, decimalDigits: 0); - return Padding( - padding: const EdgeInsets.symmetric(vertical: 80), - child: Center( - child: SizedBox( - width: 887, - child: Column( - children: [ - Row( - mainAxisAlignment: MainAxisAlignment.spaceEvenly, - children: [ - PerformanceTile( - title: "Time to first token (TTFT)", - value: nf.format(metrics.ttft), - unit: "ms", - tall: true, - ), - PerformanceTile( - title: "Time per output token (TPOT)", - value: nf.format(metrics.tpot), - unit: "ms", - tall: true, - ), - PerformanceTile( - title: "Generate total duration", - value: nf.format(metrics.generateTime), - unit: "ms", - tall: true, - ), - ], + return Padding( + padding: const EdgeInsets.symmetric(vertical: 80), + child: Center( + child: SizedBox( + width: 887, + child: Column( + children: [ + Row( + mainAxisAlignment: MainAxisAlignment.spaceEvenly, + children: [ + PerformanceTile( + title: "Time to first token (TTFT)", + value: nf.format(metrics.ttft), + unit: "ms", + tall: true, + ), + PerformanceTile( + title: "Time per output token (TPOT)", + value: nf.format(metrics.tpot), + unit: "ms", + tall: true, + ), + PerformanceTile( + title: "Generate total duration", + value: nf.format(metrics.generateTime), + unit: "ms", + tall: true, + ), + ], + ), + const Padding( + padding: EdgeInsets.symmetric(horizontal: 16.0, vertical: 16), + child: HorizontalRule(), + ), + Row( + mainAxisAlignment: MainAxisAlignment.spaceEvenly, + children: [ + PerformanceTile( + title: "Load time", + value: nf.format(metrics.loadTime), + unit: "ms", + ), + PerformanceTile( + title: 
"Detokenization duration", + value: nf.format(metrics.detokenizationTime), + unit: "ms", + ), + PerformanceTile( + title: "Throughput", + value: nf.format(metrics.throughput), + unit: "tokens/sec", + ), + ], + ), + ], + ), ), - const Padding( - padding: EdgeInsets.symmetric(horizontal: 16.0, vertical: 16), - child: HorizontalRule(), - ), - Row( - mainAxisAlignment: MainAxisAlignment.spaceEvenly, - children: [ - PerformanceTile( - title: "Load time", - value: nf.format(metrics.loadTime), - unit: "ms", - ), - PerformanceTile( - title: "Detokenization duration", - value: nf.format(metrics.detokenizationTime), - unit: "ms", - ), - PerformanceTile( - title: "Throughput", - value: nf.format(metrics.throughput), - unit: "tokens/sec", - ), - ], - ), - ], - ), - ), + ), + ); + } ), - ); - } + ), + ModelProperties(project: project), + ], ); } } - diff --git a/lib/pages/transcription/transcription.dart b/lib/pages/transcription/transcription.dart index 1d9fd00..46f54af 100644 --- a/lib/pages/transcription/transcription.dart +++ b/lib/pages/transcription/transcription.dart @@ -84,7 +84,7 @@ class _TranscriptionPageState extends State { PaneItem( icon: const Icon(FluentIcons.line_chart), title: const Text("Performance metrics"), - body: const PerformanceMetrics(), + body: PerformanceMetrics(project: widget.project), ), ], ) From c9788eaceac6ba4dcf1ba01649e1b5f4c3f894ca Mon Sep 17 00:00:00 2001 From: Ronald Hecker Date: Fri, 22 Nov 2024 17:09:54 +0100 Subject: [PATCH 17/17] Fix transcription from continuening after dispose --- .../transcription/performance_metrics.dart | 137 +++++++++--------- .../providers/speech_inference_provider.dart | 16 +- 2 files changed, 84 insertions(+), 69 deletions(-) diff --git a/lib/pages/transcription/performance_metrics.dart b/lib/pages/transcription/performance_metrics.dart index e1795a1..0abf3e1 100644 --- a/lib/pages/transcription/performance_metrics.dart +++ b/lib/pages/transcription/performance_metrics.dart @@ -1,6 +1,7 @@ import 
'package:fluent_ui/fluent_ui.dart'; import 'package:inference/pages/computer_vision/widgets/horizontal_rule.dart'; import 'package:inference/pages/computer_vision/widgets/model_properties.dart'; +import 'package:inference/pages/models/widgets/grid_container.dart'; import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; import 'package:inference/project.dart'; import 'package:inference/widgets/performance_tile.dart'; @@ -16,77 +17,79 @@ class PerformanceMetrics extends StatelessWidget { return Row( children: [ Expanded( - child: Consumer( - builder: (context, inference, child) { - final metrics = inference.metrics; - if (metrics == null) { - return Container(); - } + child: GridContainer( + child: Consumer( + builder: (context, inference, child) { + final metrics = inference.metrics; + if (metrics == null) { + return Container(); + } - Locale locale = Localizations.localeOf(context); - final nf = NumberFormat.decimalPatternDigits( - locale: locale.languageCode, decimalDigits: 0); + Locale locale = Localizations.localeOf(context); + final nf = NumberFormat.decimalPatternDigits( + locale: locale.languageCode, decimalDigits: 0); - return Padding( - padding: const EdgeInsets.symmetric(vertical: 80), - child: Center( - child: SizedBox( - width: 887, - child: Column( - children: [ - Row( - mainAxisAlignment: MainAxisAlignment.spaceEvenly, - children: [ - PerformanceTile( - title: "Time to first token (TTFT)", - value: nf.format(metrics.ttft), - unit: "ms", - tall: true, - ), - PerformanceTile( - title: "Time per output token (TPOT)", - value: nf.format(metrics.tpot), - unit: "ms", - tall: true, - ), - PerformanceTile( - title: "Generate total duration", - value: nf.format(metrics.generateTime), - unit: "ms", - tall: true, - ), - ], - ), - const Padding( - padding: EdgeInsets.symmetric(horizontal: 16.0, vertical: 16), - child: HorizontalRule(), - ), - Row( - mainAxisAlignment: MainAxisAlignment.spaceEvenly, - children: [ - PerformanceTile( 
- title: "Load time", - value: nf.format(metrics.loadTime), - unit: "ms", - ), - PerformanceTile( - title: "Detokenization duration", - value: nf.format(metrics.detokenizationTime), - unit: "ms", - ), - PerformanceTile( - title: "Throughput", - value: nf.format(metrics.throughput), - unit: "tokens/sec", - ), - ], - ), - ], + return Padding( + padding: const EdgeInsets.symmetric(vertical: 80), + child: Center( + child: SizedBox( + width: 887, + child: Column( + children: [ + Row( + mainAxisAlignment: MainAxisAlignment.spaceEvenly, + children: [ + PerformanceTile( + title: "Time to first token (TTFT)", + value: nf.format(metrics.ttft), + unit: "ms", + tall: true, + ), + PerformanceTile( + title: "Time per output token (TPOT)", + value: nf.format(metrics.tpot), + unit: "ms", + tall: true, + ), + PerformanceTile( + title: "Generate total duration", + value: nf.format(metrics.generateTime), + unit: "ms", + tall: true, + ), + ], + ), + const Padding( + padding: EdgeInsets.symmetric(horizontal: 16.0, vertical: 16), + child: HorizontalRule(), + ), + Row( + mainAxisAlignment: MainAxisAlignment.spaceEvenly, + children: [ + PerformanceTile( + title: "Load time", + value: nf.format(metrics.loadTime), + unit: "ms", + ), + PerformanceTile( + title: "Detokenization duration", + value: nf.format(metrics.detokenizationTime), + unit: "ms", + ), + PerformanceTile( + title: "Throughput", + value: nf.format(metrics.throughput), + unit: "tokens/sec", + ), + ], + ), + ], + ), ), ), - ), - ); - } + ); + } + ), ), ), ModelProperties(project: project), diff --git a/lib/pages/transcription/providers/speech_inference_provider.dart b/lib/pages/transcription/providers/speech_inference_provider.dart index fedbf72..606b9e7 100644 --- a/lib/pages/transcription/providers/speech_inference_provider.dart +++ b/lib/pages/transcription/providers/speech_inference_provider.dart @@ -20,6 +20,8 @@ class SpeechInferenceProvider extends ChangeNotifier { String? _videoPath; String? 
get videoPath => _videoPath;
 
+  bool forceStop = false;
+
   bool get videoLoaded => _videoPath != null;
 
   DynamicRangeLoading>? transcription;
@@ -79,13 +81,17 @@
       throw Exception("Can't transcribe before loading video");
     }
 
-    while (!transcription!.complete) {
+    forceStop = false;
+
+    while ((!transcription!.complete) && !forceStop) {
       if (transcription == null) {
         return;
       }
       await transcription!.process((int i) {
         final request = transcribe(i * transcriptionPeriod, transcriptionPeriod);
-        request.then(addMetrics);
+        if (!forceStop) {
+          request.then(addMetrics);
+        }
         return request;
       });
       if (hasListeners) {
@@ -103,4 +109,10 @@
     return _project == project && _device == device;
   }
 
+  @override
+  void dispose() {
+    forceStop = true;
+    super.dispose();
+  }
+
 }