From a39903087a0ea996342c53133c4a954510afbde1 Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Tue, 19 Nov 2024 16:23:50 +0100 Subject: [PATCH 01/17] Reimplement part of speech to text Enabled the c++ part and added the base UI. Actual video and subtitles showing etc needs to be implemented --- lib/interop/generated_bindings.dart | 71 +++++++++++ lib/interop/speech_to_text.dart | 109 ++++++++-------- .../computer_vision/batch_inference.dart | 2 +- lib/pages/computer_vision/live_inference.dart | 2 +- .../widgets/model_properties.dart | 107 ++++++++-------- lib/pages/models/inference.dart | 3 +- lib/pages/transcription/playground.dart | 101 +++++++++++++++ .../providers/speech_inference_provider.dart | 88 +++++++++++++ lib/pages/transcription/transcription.dart | 120 ++++++++++++++++++ lib/pages/transcription/utils/section.dart | 98 ++++++++++++++ lib/utils/drop_area.dart | 84 ++++++------ macos/Runner.xcodeproj/project.pbxproj | 30 +++++ openvino_bindings/src/BUILD | 1 + openvino_bindings/src/bindings.cc | 86 ++++++------- openvino_bindings/src/bindings.h | 8 +- 15 files changed, 706 insertions(+), 204 deletions(-) create mode 100644 lib/pages/transcription/playground.dart create mode 100644 lib/pages/transcription/providers/speech_inference_provider.dart create mode 100644 lib/pages/transcription/transcription.dart create mode 100644 lib/pages/transcription/utils/section.dart diff --git a/lib/interop/generated_bindings.dart b/lib/interop/generated_bindings.dart index f37fd89..ea541a8 100644 --- a/lib/interop/generated_bindings.dart +++ b/lib/interop/generated_bindings.dart @@ -569,6 +569,77 @@ class OpenVINO { late final _graphRunnerStop = _graphRunnerStopPtr .asFunction Function(CGraphRunner)>(); + ffi.Pointer speechToTextOpen( + ffi.Pointer model_path, + ffi.Pointer device, + ) { + return _speechToTextOpen( + model_path, + device, + ); + } + + late final _speechToTextOpenPtr = _lookup< + ffi.NativeFunction< + ffi.Pointer Function(ffi.Pointer, + 
ffi.Pointer)>>('speechToTextOpen'); + late final _speechToTextOpen = _speechToTextOpenPtr.asFunction< + ffi.Pointer Function( + ffi.Pointer, ffi.Pointer)>(); + + ffi.Pointer speechToTextLoadVideo( + CSpeechToText instance, + ffi.Pointer video_path, + ) { + return _speechToTextLoadVideo( + instance, + video_path, + ); + } + + late final _speechToTextLoadVideoPtr = _lookup< + ffi.NativeFunction< + ffi.Pointer Function(CSpeechToText, + ffi.Pointer)>>('speechToTextLoadVideo'); + late final _speechToTextLoadVideo = _speechToTextLoadVideoPtr.asFunction< + ffi.Pointer Function(CSpeechToText, ffi.Pointer)>(); + + ffi.Pointer speechToTextVideoDuration( + CSpeechToText instance, + ) { + return _speechToTextVideoDuration( + instance, + ); + } + + late final _speechToTextVideoDurationPtr = _lookup< + ffi.NativeFunction Function(CSpeechToText)>>( + 'speechToTextVideoDuration'); + late final _speechToTextVideoDuration = _speechToTextVideoDurationPtr + .asFunction Function(CSpeechToText)>(); + + ffi.Pointer speechToTextTranscribe( + CSpeechToText instance, + int start, + int duration, + ffi.Pointer language, + ) { + return _speechToTextTranscribe( + instance, + start, + duration, + language, + ); + } + + late final _speechToTextTranscribePtr = _lookup< + ffi.NativeFunction< + ffi.Pointer Function(CSpeechToText, ffi.Int, + ffi.Int, ffi.Pointer)>>('speechToTextTranscribe'); + late final _speechToTextTranscribe = _speechToTextTranscribePtr.asFunction< + ffi.Pointer Function( + CSpeechToText, int, int, ffi.Pointer)>(); + ffi.Pointer getAvailableDevices() { return _getAvailableDevices(); } diff --git a/lib/interop/speech_to_text.dart b/lib/interop/speech_to_text.dart index c8635ae..b81ed02 100644 --- a/lib/interop/speech_to_text.dart +++ b/lib/interop/speech_to_text.dart @@ -14,67 +14,64 @@ class SpeechToText { SpeechToText(this.instance); static Future init(String modelPath, String device) async { - throw UnimplementedError(); - //final result = await Isolate.run(() { - // final 
modelPathPtr = modelPath.toNativeUtf8(); - // final devicePtr = device.toNativeUtf8(); - // final status = ov.speechToTextOpen(modelPathPtr, devicePtr); - // calloc.free(modelPathPtr); - // calloc.free(devicePtr); - - // return status; - //}); - - //print("${result.ref.status}, ${result.ref.message}"); - //if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { - // throw "SpeechToText open error: ${result.ref.status} ${result.ref.message.toDartString()}"; - //} - - //return SpeechToText(result); + final result = await Isolate.run(() { + final modelPathPtr = modelPath.toNativeUtf8(); + final devicePtr = device.toNativeUtf8(); + final status = ov.speechToTextOpen(modelPathPtr, devicePtr); + calloc.free(modelPathPtr); + calloc.free(devicePtr); + + return status; + }); + + print("${result.ref.status}, ${result.ref.message}"); + if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { + throw "SpeechToText open error: ${result.ref.status} ${result.ref.message.toDartString()}"; + } + + return SpeechToText(result); } Future loadVideo(String videoPath) async{ - throw UnimplementedError(); - //int instanceAddress = instance.ref.value.address; - //{ - // final result = await Isolate.run(() { - // final videoPathPtr = videoPath.toNativeUtf8(); - // final status = ov.speechToTextLoadVideo(Pointer.fromAddress(instanceAddress), videoPathPtr); - // calloc.free(videoPathPtr); - // return status; - // }); - - // if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { - // throw "SpeechToText LoadVideo error: ${result.ref.status} ${result.ref.message.toDartString()}"; - // } - //} - - //{ - // final result = await Isolate.run(() { - // final status = ov.speechToTextVideoDuration(Pointer.fromAddress(instanceAddress)); - // return status; - // }); - // if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { - // throw "SpeechToText VideoDuration error: ${result.ref.status} ${result.ref.message.toDartString()}"; - // } - // return 
result.ref.value; - //} + int instanceAddress = instance.ref.value.address; + { + final result = await Isolate.run(() { + final videoPathPtr = videoPath.toNativeUtf8(); + final status = ov.speechToTextLoadVideo(Pointer.fromAddress(instanceAddress), videoPathPtr); + calloc.free(videoPathPtr); + return status; + }); + + if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { + throw "SpeechToText LoadVideo error: ${result.ref.status} ${result.ref.message.toDartString()}"; + } + } + + { + final result = await Isolate.run(() { + final status = ov.speechToTextVideoDuration(Pointer.fromAddress(instanceAddress)); + return status; + }); + if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { + throw "SpeechToText VideoDuration error: ${result.ref.status} ${result.ref.message.toDartString()}"; + } + return result.ref.value; + } } Future transcribe(int start, int duration, String language) async{ - throw UnimplementedError(); - //int instanceAddress = instance.ref.value.address; - //final result = await Isolate.run(() { - // final languagePtr = language.toNativeUtf8(); - // final status = ov.speechToTextTranscribe(Pointer.fromAddress(instanceAddress), start, duration, languagePtr); - // calloc.free(languagePtr); - // return status; - //}); - - //if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { - // throw "SpeechToText LoadVideo error: ${result.ref.status} ${result.ref.message.toDartString()}"; - //} - - //return result.ref.value.toDartString(); + int instanceAddress = instance.ref.value.address; + final result = await Isolate.run(() { + final languagePtr = language.toNativeUtf8(); + final status = ov.speechToTextTranscribe(Pointer.fromAddress(instanceAddress), start, duration, languagePtr); + calloc.free(languagePtr); + return status; + }); + + if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) { + throw "SpeechToText LoadVideo error: ${result.ref.status} ${result.ref.message.toDartString()}"; + } + + return 
result.ref.value.toDartString(); } } diff --git a/lib/pages/computer_vision/batch_inference.dart b/lib/pages/computer_vision/batch_inference.dart index d2f34cf..53c74ff 100644 --- a/lib/pages/computer_vision/batch_inference.dart +++ b/lib/pages/computer_vision/batch_inference.dart @@ -99,7 +99,7 @@ class BatchInference extends StatelessWidget { ), ), ), - const ModelProperties(), + ModelProperties(project: batchInference.imageInference.project), ], ); } diff --git a/lib/pages/computer_vision/live_inference.dart b/lib/pages/computer_vision/live_inference.dart index 0b089bf..9c78f25 100644 --- a/lib/pages/computer_vision/live_inference.dart +++ b/lib/pages/computer_vision/live_inference.dart @@ -135,7 +135,7 @@ class _LiveInferenceState extends State { ], ), ), - const ModelProperties(), + ModelProperties(project: widget.project), ], ); } diff --git a/lib/pages/computer_vision/widgets/model_properties.dart b/lib/pages/computer_vision/widgets/model_properties.dart index d333243..5d7e932 100644 --- a/lib/pages/computer_vision/widgets/model_properties.dart +++ b/lib/pages/computer_vision/widgets/model_properties.dart @@ -1,70 +1,67 @@ import 'package:fluent_ui/fluent_ui.dart'; +import 'package:inference/project.dart'; import 'package:inference/theme_fluent.dart'; -import 'package:inference/utils.dart'; import 'package:inference/pages/models/widgets/grid_container.dart'; -import 'package:inference/providers/image_inference_provider.dart'; import 'package:intl/intl.dart'; -import 'package:provider/provider.dart'; +import 'package:inference/utils.dart'; class ModelProperties extends StatelessWidget { - const ModelProperties({super.key}); + final Project project; + const ModelProperties({super.key, required this.project}); @override Widget build(BuildContext context) { - return Consumer(builder: (context, inference, child) { - Locale locale = Localizations.localeOf(context); - final formatter = NumberFormat.percentPattern(locale.languageCode); + Locale locale = 
Localizations.localeOf(context); + final formatter = NumberFormat.percentPattern(locale.languageCode); - return SizedBox( - width: 280, - child: GridContainer( - padding: const EdgeInsets.symmetric(vertical: 18, horizontal: 24), - child: Column( - crossAxisAlignment: CrossAxisAlignment.start, - children: [ - const Text("Model parameters", style: TextStyle( - fontSize: 20, - )), - Container( - padding: const EdgeInsets.only(top: 16), - child: Column( - crossAxisAlignment: CrossAxisAlignment.start, - children: [ - ModelProperty( - title: "Model name", - value: inference.project.name, - ), - ModelProperty( - title: "Task", - value: inference.project.taskName(), - ), - ModelProperty( - title: "Architecture", - value: inference.project.architecture, - ), - ModelProperty( - title: "Size", - value: inference.project.size?.readableFileSize() ?? "", - ), - Builder( - builder: (context) { - if (inference.project.tasks.first.performance == null) { - return Container(); - } - return ModelProperty( - title: "Accuracy", - value: formatter.format(inference.project.tasks.first.performance!.score) - ); - } - ), - ], + return SizedBox( + width: 280, + child: GridContainer( + padding: const EdgeInsets.symmetric(vertical: 18, horizontal: 24), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + const Text("Model parameters", style: TextStyle( + fontSize: 20, + )), + Container( + padding: const EdgeInsets.only(top: 16), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + ModelProperty( + title: "Model name", + value: project.name, + ), + ModelProperty( + title: "Task", + value: project.taskName(), + ), + ModelProperty( + title: "Architecture", + value: project.architecture, + ), + ModelProperty( + title: "Size", + value: project.size?.readableFileSize() ?? 
"", + ), + Builder( + builder: (context) { + if (project.tasks.first.performance == null) { + return Container(); + } + return ModelProperty( + title: "Accuracy", + value: formatter.format(project.tasks.first.performance!.score) + ); + } ), - ) - ], + ], + ), ) - ), - ); - } + ], + ) + ), ); } } diff --git a/lib/pages/models/inference.dart b/lib/pages/models/inference.dart index 1445420..18b022a 100644 --- a/lib/pages/models/inference.dart +++ b/lib/pages/models/inference.dart @@ -1,5 +1,6 @@ import 'package:fluent_ui/fluent_ui.dart'; import 'package:inference/pages/computer_vision/computer_vision.dart'; +import 'package:inference/pages/transcription/transcription.dart'; import 'package:inference/project.dart'; class InferencePage extends StatelessWidget { @@ -14,7 +15,7 @@ class InferencePage extends StatelessWidget { case ProjectType.text: return Container(); case ProjectType.speech: - return Container(); + return TranscriptionPage(project); } } diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart new file mode 100644 index 0000000..28417b8 --- /dev/null +++ b/lib/pages/transcription/playground.dart @@ -0,0 +1,101 @@ +import 'package:file_picker/file_picker.dart'; +import 'package:fluent_ui/fluent_ui.dart'; +import 'package:inference/pages/computer_vision/widgets/model_properties.dart'; +import 'package:inference/pages/models/widgets/grid_container.dart'; +import 'package:inference/project.dart'; +import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; +import 'package:inference/theme_fluent.dart'; +import 'package:inference/utils/drop_area.dart'; +import 'package:inference/widgets/controls/no_outline_button.dart'; +import 'package:inference/widgets/device_selector.dart'; +//import 'package:media_kit/media_kit.dart'; +//import 'package:media_kit_video/media_kit_video.dart'; +import 'package:provider/provider.dart'; + +class Playground extends StatefulWidget { + final Project project; + 
const Playground({super.key, required this.project}); + + @override + State createState() => _PlaygroundState(); +} + +class _PlaygroundState extends State { + //late final player = Player(); + //late final controller = VideoController(player); + + void showUploadMenu() async { + FilePickerResult? result = await FilePicker.platform.pickFiles(type: FileType.video); + + if (result != null) { + uploadFile(result.files.single.path!); + } + } + + void uploadFile(String file) async { + final inference = Provider.of(context, listen: false); + await inference.loadVideo(file); + inference.startTranscribing(); + } + + @override + Widget build(BuildContext context) { + final theme = FluentTheme.of(context); + return Row( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + Expanded( + child: Column( + children: [ + SizedBox( + height: 64, + child: GridContainer( + child: Padding( + padding: const EdgeInsets.symmetric(horizontal: 16), + child: Row( + children: [ + NoOutlineButton( + onPressed: showUploadMenu, + child: Row( + children: [ + const Text("Choose video"), + const Padding( + padding: EdgeInsets.only(left: 8), + child: Icon(FluentIcons.chevron_down, size: 12), + ), + ], + ), + ), + const DeviceSelector(), + ], + ), + ), + ), + ), + Expanded( + child: GridContainer( + color: backgroundColor.of(theme), + child: Builder( + builder: (context) { + return DropArea( + type: "video", + showChild: false, + onUpload: (String file) { uploadFile(file); }, + extensions: const [], + child: Padding( + padding: const EdgeInsets.all(8.0), + child: Container(), + ), + ); + } + ), + ), + ) + ], + ), + ), + ModelProperties(project: widget.project), + ] + ); + } +} diff --git a/lib/pages/transcription/providers/speech_inference_provider.dart b/lib/pages/transcription/providers/speech_inference_provider.dart new file mode 100644 index 0000000..9f658fe --- /dev/null +++ b/lib/pages/transcription/providers/speech_inference_provider.dart @@ -0,0 +1,88 @@ +import 'dart:async'; + 
+import 'package:flutter/material.dart'; +import 'package:inference/interop/speech_to_text.dart'; +import 'package:inference/pages/transcription/utils/section.dart'; +import 'package:inference/project.dart'; + +const transcriptionPeriod = 10; + +class SpeechInferenceProvider extends ChangeNotifier { + Completer loaded = Completer(); + + + Project? _project; + String? _device; + + String? _videoPath; + String? get videoPath => _videoPath; + + bool get videoLoaded => _videoPath != null; + + DynamicRangeLoading>? _transcription; + Map>? get transcription => _transcription?.data; + + String _language = ""; + + String get language => _language; + set language(String val) { + _language = val; + notifyListeners(); + } + + SpeechToText? _inference; + + SpeechInferenceProvider(Project? project, String? device) { + _project = project; + _device = device; + + if (project != null && device != null) { + SpeechToText.init(project.storagePath, device).then((instance) { + _inference = instance; + loaded.complete(); + notifyListeners(); + }); + } + } + + void skipTo(int index) { + _transcription!.skipTo(index); + } + + Future loadVideo(String path) async { + await loaded.future; + _videoPath = path; + final duration = await _inference!.loadVideo(path); + final sections = (duration / transcriptionPeriod).ceil(); + _transcription = DynamicRangeLoading>(Section(0, sections)); + notifyListeners(); + } + + Future startTranscribing() async { + if (_transcription == null) { + throw Exception("Can't transcribe before loading video"); + } + + while (!_transcription!.complete) { + if (_transcription == null) { + return; + } + await _transcription!.process((int i) { + return transcribe(i * transcriptionPeriod, transcriptionPeriod); + }); + if (hasListeners) { + notifyListeners(); + } + } + } + + Future transcribe(int start, int duration) async { + await loaded.future; + return await _inference!.transcribe(start, duration, _language); + } + + bool sameProps(Project? project, String? 
device) { + return _project == project && _device == device; + } + +} diff --git a/lib/pages/transcription/transcription.dart b/lib/pages/transcription/transcription.dart new file mode 100644 index 0000000..14353d9 --- /dev/null +++ b/lib/pages/transcription/transcription.dart @@ -0,0 +1,120 @@ +import 'package:fluent_ui/fluent_ui.dart'; +import 'package:go_router/go_router.dart'; +import 'package:inference/project.dart'; +import 'package:inference/providers/preference_provider.dart'; +import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; +import 'package:inference/pages/transcription/playground.dart'; +import 'package:provider/provider.dart'; + +class TranscriptionPage extends StatefulWidget { + final Project project; + const TranscriptionPage(this.project, {super.key}); + + @override + State createState() => _TranscriptionPageState(); +} + +class _TranscriptionPageState extends State { + + + int selected = 0; + @override + Widget build(BuildContext context) { + final theme = FluentTheme.of(context); + final updatedTheme = theme.copyWith( + navigationPaneTheme: theme.navigationPaneTheme.merge(NavigationPaneThemeData( + backgroundColor: theme.scaffoldBackgroundColor, + )) + ); + return ChangeNotifierProxyProvider( + lazy: false, + create: (_) { + final device = Provider.of(context, listen: false).device; + return SpeechInferenceProvider(widget.project, device); + }, + update: (_, preferences, imageInferenceProvider) { + if (imageInferenceProvider != null && imageInferenceProvider.sameProps(widget.project, preferences.device)) { + return imageInferenceProvider; + } + return SpeechInferenceProvider(widget.project, preferences.device); + }, + child: Stack( + children: [ + FluentTheme( + data: updatedTheme, + child: NavigationView( + pane: NavigationPane( + size: const NavigationPaneSize(topHeight: 64), + header: Row( + children: [ + Padding( + padding: const EdgeInsets.only(left: 12.0), + child: ClipRRect( + borderRadius: 
BorderRadius.circular(4.0), + child: Container( + width: 40, + height: 40, + decoration: BoxDecoration( + image: DecorationImage( + image: widget.project.thumbnailImage(), + fit: BoxFit.cover), + ), + ), + ), + ), + Padding( + padding: const EdgeInsets.symmetric(horizontal: 16), + child: Text(widget.project.name, + style: const TextStyle(fontSize: 20, fontWeight: FontWeight.bold), + ), + ), + ], + ), + //customPane: CustomNavigationPane(), + selected: selected, + onChanged: (i) => setState(() {selected = i;}), + displayMode: PaneDisplayMode.top, + items: [ + PaneItem( + icon: const Icon(FluentIcons.processing), + title: const Text("Playground"), + body: Playground(project: widget.project), + ), + PaneItem( + icon: const Icon(FluentIcons.project_collection), + title: const Text("Performance metrics"), + body: Container(), + ), + ], + ) + ), + ), + SizedBox( + height: 64, + child: Padding( + padding: const EdgeInsets.symmetric(horizontal: 25), + child: Row( + mainAxisAlignment: MainAxisAlignment.end, + children: [ + Padding( + padding: const EdgeInsets.all(4), + child: OutlinedButton( + style: ButtonStyle( + shape:WidgetStatePropertyAll(RoundedRectangleBorder( + borderRadius: BorderRadius.circular(4.0), + side: const BorderSide(color: Color(0XFF545454)), + )), + ), + child: const Text("Close"), + onPressed: () => GoRouter.of(context).go("/models"), + ), + ), + ] + ), + ), + ) + ], + ) + ); + } +} diff --git a/lib/pages/transcription/utils/section.dart b/lib/pages/transcription/utils/section.dart new file mode 100644 index 0000000..27ede1d --- /dev/null +++ b/lib/pages/transcription/utils/section.dart @@ -0,0 +1,98 @@ +void moveToFront(List list, I item) { + list.remove(item); + list.insert(0, item); +} + +void moveToEnd(List list, I item) { + list.remove(item); + list.add(item); +} + +class DynamicRangeLoading { + List
sections = []; + Map data = {}; + + DynamicRangeLoading(Section section): sections = [section]; + + Section get activeSection => sections.first; + + // The incomplete sections will always be in front + bool get complete => activeSection.complete; + + void skipTo(int i) { + for (var section in sections) { + if (section.contains(i)) { + if (i > section.index) { + // Section has not progressed until the requested index + // Split the section and move the new section to the front + final newSection = section.split(i); + sections.insert(0, newSection); + } else { + // Section is further ahead than requested skipTo + // move section to front since that work has higher prio + if (!section.complete && section != activeSection) { + moveToFront(sections, section); + } + } + return; + } + } + + throw Exception("Out of range"); + } + + int getNextIndex() { + if (complete) { + throw Exception("Cannot get next index. All work is done"); + } + return activeSection.index; + } + + void pumpIndex() { + if (activeSection.pump()) { + //activeSection has ended + if (sections.length > 1) { + moveToEnd(sections,activeSection); + } + } + } + + Future process(Future Function(int) func) async{ + final index = getNextIndex(); + final val = await func(index); + data[index] = val; + pumpIndex(); + return val; + } + + void setData(I value) { + data[activeSection.index] = value; + activeSection.index += 1; + } +} + +class Section { + int begin; + int? end; + int index; + + Section(this.begin, this.end): index = begin; + + bool contains(int i) => begin <= i && (end == null ? true : i < end!); + + Section split(int i) { + final newSection = Section(i, end); + end = i; + return newSection; + } + + bool get complete => index == end; + + //returns false if there is still work to do in the section + bool pump() { + if (end == null || index < end!) 
{ + index += 1; + } + return complete; + } +} diff --git a/lib/utils/drop_area.dart b/lib/utils/drop_area.dart index 61bb2f8..dcdf761 100644 --- a/lib/utils/drop_area.dart +++ b/lib/utils/drop_area.dart @@ -50,51 +50,49 @@ class _DropAreaState extends State { @override Widget build(BuildContext context) { - return Expanded( - child: DropTarget( - onDragDone: (details) => handleDrop(details), - onDragExited: (val) => hideReleaseMessage(), - onDragEntered: (val) => showReleaseMessage(), - child: Container( - decoration: BoxDecoration( - borderRadius: BorderRadius.circular(4.0), - color: intelGray, - ), - child: Builder( - builder: (context) { - if (!_showReleaseMessage && widget.showChild) { - return widget.child!; - } - return Center( - child: SizedBox( - height: 310, - child: Column( - crossAxisAlignment: CrossAxisAlignment.center, - mainAxisAlignment: MainAxisAlignment.spaceBetween, - children: [ - SvgPicture.asset('images/drop.svg'), - ( _showReleaseMessage - ? const Text("Release to drop media") - : Text("Drop ${widget.type} here") - ), - ElevatedButton( - onPressed: () => showUploadMenu(), - child: const Text("Upload") - ), - Builder( - builder: (context) { - if (widget.extensions == null) { - return Container(); - } - return Text(widget.extensions!.join(", ")); + return DropTarget( + onDragDone: (details) => handleDrop(details), + onDragExited: (val) => hideReleaseMessage(), + onDragEntered: (val) => showReleaseMessage(), + child: Container( + decoration: BoxDecoration( + borderRadius: BorderRadius.circular(4.0), + color: intelGray, + ), + child: Builder( + builder: (context) { + if (!_showReleaseMessage && widget.showChild) { + return widget.child!; + } + return Center( + child: SizedBox( + height: 310, + child: Column( + crossAxisAlignment: CrossAxisAlignment.center, + mainAxisAlignment: MainAxisAlignment.spaceBetween, + children: [ + SvgPicture.asset('images/drop.svg'), + ( _showReleaseMessage + ? 
const Text("Release to drop media") + : Text("Drop ${widget.type} here") + ), + ElevatedButton( + onPressed: () => showUploadMenu(), + child: const Text("Upload") + ), + Builder( + builder: (context) { + if (widget.extensions == null) { + return Container(); } - ) - ], - ), + return Text(widget.extensions!.join(", ")); + } + ) + ], ), - ); - } - ), + ), + ); + } ), ), ); diff --git a/macos/Runner.xcodeproj/project.pbxproj b/macos/Runner.xcodeproj/project.pbxproj index 8e01daa..9d53cd0 100644 --- a/macos/Runner.xcodeproj/project.pbxproj +++ b/macos/Runner.xcodeproj/project.pbxproj @@ -39,6 +39,16 @@ 0C42C76A2CE386680079F72B /* libopenvino_tensorflow_lite_frontend.2450.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C42C7592CE386520079F72B /* libopenvino_tensorflow_lite_frontend.2450.dylib */; }; 0C42C76B2CE388D90079F72B /* libopenvino_c.2450.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C42C7522CE386520079F72B /* libopenvino_c.2450.dylib */; }; 0C42C76C2CE388DC0079F72B /* libopenvino.2450.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C42C75A2CE386520079F72B /* libopenvino.2450.dylib */; }; + 0C4E1F6C2CECC22800124339 /* libavformat.60.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 0C4E1F692CECC22800124339 /* libavformat.60.dylib */; }; + 0C4E1F6D2CECC22800124339 /* libavutil.58.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 0C4E1F6A2CECC22800124339 /* libavutil.58.dylib */; }; + 0C4E1F6E2CECC22800124339 /* libswresample.4.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 0C4E1F6B2CECC22800124339 /* libswresample.4.dylib */; }; + 0C4E1F6F2CECC22800124339 /* libavcodec.60.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 0C4E1F672CECC22800124339 /* libavcodec.60.dylib */; }; + 0C4E1F702CECC22800124339 /* libavdevice.60.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 0C4E1F682CECC22800124339 /* libavdevice.60.dylib */; }; + 0C4E1F712CECC24900124339 /* libswresample.4.dylib in Bundle 
Framework */ = {isa = PBXBuildFile; fileRef = 0C4E1F6B2CECC22800124339 /* libswresample.4.dylib */; }; + 0C4E1F722CECC25400124339 /* libavcodec.60.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C4E1F672CECC22800124339 /* libavcodec.60.dylib */; }; + 0C4E1F732CECC25400124339 /* libavdevice.60.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C4E1F682CECC22800124339 /* libavdevice.60.dylib */; }; + 0C4E1F742CECC25400124339 /* libavformat.60.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C4E1F692CECC22800124339 /* libavformat.60.dylib */; }; + 0C4E1F752CECC25400124339 /* libavutil.58.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C4E1F6A2CECC22800124339 /* libavutil.58.dylib */; }; 0C5D47382C6F2F9500307B37 /* libmacos_bindings.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 0C5D47372C6F2F9500307B37 /* libmacos_bindings.dylib */; settings = {ATTRIBUTES = (Weak, ); }; }; 0C5D47392C6F2FB200307B37 /* libmacos_bindings.dylib in Resources */ = {isa = PBXBuildFile; fileRef = 0C5D47372C6F2F9500307B37 /* libmacos_bindings.dylib */; }; 0C5D473A2C6F308000307B37 /* libmacos_bindings.dylib in Bundle Framework */ = {isa = PBXBuildFile; fileRef = 0C5D47372C6F2F9500307B37 /* libmacos_bindings.dylib */; settings = {ATTRIBUTES = (CodeSignOnCopy, ); }; }; @@ -117,15 +127,19 @@ dstSubfolderSpec = 10; files = ( 0C42C7672CE386680079F72B /* libopenvino_paddle_frontend.2450.dylib in Bundle Framework */, + 0C4E1F712CECC24900124339 /* libswresample.4.dylib in Bundle Framework */, 0C5D47B32C6F5C1300307B37 /* libopenvino_hetero_plugin.so in Bundle Framework */, + 0C4E1F752CECC25400124339 /* libavutil.58.dylib in Bundle Framework */, 0C42C76C2CE388DC0079F72B /* libopenvino.2450.dylib in Bundle Framework */, 0C42C7662CE386680079F72B /* libopenvino_onnx_frontend.2450.dylib in Bundle Framework */, 0C5D47B12C6F5C0A00307B37 /* libopenvino_auto_batch_plugin.so in Bundle Framework */, 0C5D47B22C6F5C0E00307B37 /* 
libopenvino_auto_plugin.so in Bundle Framework */, 0C5D473E2C6F35E500307B37 /* libblend2d.dylib in Bundle Framework */, + 0C4E1F732CECC25400124339 /* libavdevice.60.dylib in Bundle Framework */, 0C5D47782C6F398400307B37 /* libopencv_core.407.dylib in Bundle Framework */, 0C42C7642CE386680079F72B /* libopenvino_genai.2450.dylib in Bundle Framework */, 0C5D47B02C6F5C0200307B37 /* libopenvino_arm_cpu_plugin.so in Bundle Framework */, + 0C4E1F742CECC25400124339 /* libavformat.60.dylib in Bundle Framework */, 0C5D47802C6F398400307B37 /* libopencv_videoio.407.dylib in Bundle Framework */, 0C5D47792C6F398400307B37 /* libopencv_features2d.407.dylib in Bundle Framework */, 0C42C7682CE386680079F72B /* libopenvino_pytorch_frontend.2450.dylib in Bundle Framework */, @@ -139,6 +153,7 @@ 0C5D477F2C6F398400307B37 /* libopencv_video.407.dylib in Bundle Framework */, 0C5D47812C6F398400307B37 /* libopencv_ximgproc.407.dylib in Bundle Framework */, 0C5D473A2C6F308000307B37 /* libmacos_bindings.dylib in Bundle Framework */, + 0C4E1F722CECC25400124339 /* libavcodec.60.dylib in Bundle Framework */, 0C42C7692CE386680079F72B /* libopenvino_tensorflow_frontend.2450.dylib in Bundle Framework */, 0C5D47A52C6F3B7700307B37 /* libtbb.12.dylib in Bundle Framework */, 0C5D477C2C6F398400307B37 /* libopencv_imgcodecs.407.dylib in Bundle Framework */, @@ -161,6 +176,11 @@ 0C42C7582CE386520079F72B /* libopenvino_tensorflow_frontend.2450.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libopenvino_tensorflow_frontend.2450.dylib; path = ../bindings/libopenvino_tensorflow_frontend.2450.dylib; sourceTree = SOURCE_ROOT; }; 0C42C7592CE386520079F72B /* libopenvino_tensorflow_lite_frontend.2450.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libopenvino_tensorflow_lite_frontend.2450.dylib; path = ../bindings/libopenvino_tensorflow_lite_frontend.2450.dylib; sourceTree = SOURCE_ROOT; }; 0C42C75A2CE386520079F72B /* 
libopenvino.2450.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libopenvino.2450.dylib; path = ../bindings/libopenvino.2450.dylib; sourceTree = SOURCE_ROOT; }; + 0C4E1F672CECC22800124339 /* libavcodec.60.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libavcodec.60.dylib; path = ../bindings/libavcodec.60.dylib; sourceTree = SOURCE_ROOT; }; + 0C4E1F682CECC22800124339 /* libavdevice.60.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libavdevice.60.dylib; path = ../bindings/libavdevice.60.dylib; sourceTree = SOURCE_ROOT; }; + 0C4E1F692CECC22800124339 /* libavformat.60.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libavformat.60.dylib; path = ../bindings/libavformat.60.dylib; sourceTree = SOURCE_ROOT; }; + 0C4E1F6A2CECC22800124339 /* libavutil.58.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libavutil.58.dylib; path = ../bindings/libavutil.58.dylib; sourceTree = SOURCE_ROOT; }; + 0C4E1F6B2CECC22800124339 /* libswresample.4.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libswresample.4.dylib; path = ../bindings/libswresample.4.dylib; sourceTree = SOURCE_ROOT; }; 0C5D47372C6F2F9500307B37 /* libmacos_bindings.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libmacos_bindings.dylib; path = ../bindings/libmacos_bindings.dylib; sourceTree = ""; }; 0C5D473B2C6F357C00307B37 /* libblend2d.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libblend2d.dylib; path = ../bindings/libblend2d.dylib; sourceTree = ""; }; 0C5D47602C6F382800307B37 /* libopencv_calib3d.407.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libopencv_calib3d.407.dylib; path = ../bindings/libopencv_calib3d.407.dylib; sourceTree = ""; }; @@ -241,6 +261,11 
@@ 0C42C75D2CE386520079F72B /* libopenvino_paddle_frontend.2450.dylib in Frameworks */, 0C42C75E2CE386520079F72B /* libopenvino_onnx_frontend.2450.dylib in Frameworks */, 0C42C75F2CE386520079F72B /* libopenvino_c.2450.dylib in Frameworks */, + 0C4E1F6C2CECC22800124339 /* libavformat.60.dylib in Frameworks */, + 0C4E1F6D2CECC22800124339 /* libavutil.58.dylib in Frameworks */, + 0C4E1F6E2CECC22800124339 /* libswresample.4.dylib in Frameworks */, + 0C4E1F6F2CECC22800124339 /* libavcodec.60.dylib in Frameworks */, + 0C4E1F702CECC22800124339 /* libavdevice.60.dylib in Frameworks */, 0C42C7602CE386520079F72B /* libopenvino_genai.2450.dylib in Frameworks */, 0C42C7612CE386520079F72B /* libopenvino.2450.dylib in Frameworks */, 0C42C7622CE386520079F72B /* libopenvino_ir_frontend.2450.dylib in Frameworks */, @@ -361,6 +386,11 @@ 0C5D47642C6F397900307B37 /* libopencv_ximgproc.407.dylib */, 0C5D473B2C6F357C00307B37 /* libblend2d.dylib */, 0C5D47372C6F2F9500307B37 /* libmacos_bindings.dylib */, + 0C4E1F672CECC22800124339 /* libavcodec.60.dylib */, + 0C4E1F682CECC22800124339 /* libavdevice.60.dylib */, + 0C4E1F692CECC22800124339 /* libavformat.60.dylib */, + 0C4E1F6A2CECC22800124339 /* libavutil.58.dylib */, + 0C4E1F6B2CECC22800124339 /* libswresample.4.dylib */, 11E6C6B7198D7B3B20F4A75C /* Pods_Runner.framework */, CB5E7865DB70376BADAAEAE6 /* Pods_RunnerTests.framework */, ); diff --git a/openvino_bindings/src/BUILD b/openvino_bindings/src/BUILD index 13b1000..3bb179c 100644 --- a/openvino_bindings/src/BUILD +++ b/openvino_bindings/src/BUILD @@ -9,6 +9,7 @@ cc_library( "//src/utils:utils", "//src/image:image_inference", "//src/llm:llm_inference", + "//src/audio:speech_to_text", "//src/mediapipe:graph_runner", ], ) diff --git a/openvino_bindings/src/bindings.cc b/openvino_bindings/src/bindings.cc index fcefbec..04b0efa 100644 --- a/openvino_bindings/src/bindings.cc +++ b/openvino_bindings/src/bindings.cc @@ -4,7 +4,7 @@ #include #include -//#include "src/audio/speech_to_text.h" 
+#include "src/audio/speech_to_text.h" #include "src/image/image_inference.h" #include "src/mediapipe/graph_runner.h" #include "src/mediapipe/serialization/serialization_calculators.h" @@ -290,48 +290,48 @@ Status* graphRunnerStop(CGraphRunner instance) { } } -//StatusOrSpeechToText* speechToTextOpen(const char* model_path, const char* device) { -// try { -// auto instance = new SpeechToText(model_path, device); -// return new StatusOrSpeechToText{OkStatus, "", instance}; -// } catch (...) { -// auto except = handle_exceptions(); -// return new StatusOrSpeechToText{except->status, except->message}; -// } -//} -// -//Status* speechToTextLoadVideo(CSpeechToText instance, const char* video_path) { -// try { -// auto object = reinterpret_cast(instance); -// object->load_video(video_path); -// return new Status{OkStatus, ""}; -// } catch (...) { -// return handle_exceptions(); -// } -//} -// -//StatusOrInt* speechToTextVideoDuration(CSpeechToText instance) { -// try { -// auto object = reinterpret_cast(instance); -// object->video_duration(); -// // Deal with long in the future -// return new StatusOrInt{OkStatus, "", (int)object->video_duration()}; -// } catch (...) { -// return new StatusOrInt{OkStatus, ""}; -// } -//} -// -//StatusOrModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language) { -// try { -// auto object = reinterpret_cast(instance); -// auto result = object->transcribe(start, duration, language); -// std::string text = result; -// return new StatusOrModelResponse{OkStatus, "", convertToMetricsStruct(result.perf_metrics), strdup(text.c_str())}; -// } catch (...) { -// auto except = handle_exceptions(); -// return new StatusOrModelResponse{except->status, except->message}; -// } -//} +StatusOrSpeechToText* speechToTextOpen(const char* model_path, const char* device) { + try { + auto instance = new SpeechToText(model_path, device); + return new StatusOrSpeechToText{OkStatus, "", instance}; + } catch (...) 
{ + auto except = handle_exceptions(); + return new StatusOrSpeechToText{except->status, except->message}; + } +} + +Status* speechToTextLoadVideo(CSpeechToText instance, const char* video_path) { + try { + auto object = reinterpret_cast(instance); + object->load_video(video_path); + return new Status{OkStatus, ""}; + } catch (...) { + return handle_exceptions(); + } +} + +StatusOrInt* speechToTextVideoDuration(CSpeechToText instance) { + try { + auto object = reinterpret_cast(instance); + object->video_duration(); + // Deal with long in the future + return new StatusOrInt{OkStatus, "", (int)object->video_duration()}; + } catch (...) { + return new StatusOrInt{OkStatus, ""}; + } +} + +StatusOrModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language) { + try { + auto object = reinterpret_cast(instance); + auto result = object->transcribe(start, duration, language); + std::string text = result; + return new StatusOrModelResponse{OkStatus, "", convertToMetricsStruct(result.perf_metrics), strdup(text.c_str())}; + } catch (...) 
{ + auto except = handle_exceptions(); + return new StatusOrModelResponse{except->status, except->message}; + } +} //void report_rss() { // struct rusage r_usage; diff --git a/openvino_bindings/src/bindings.h b/openvino_bindings/src/bindings.h index e496c5a..4528916 100644 --- a/openvino_bindings/src/bindings.h +++ b/openvino_bindings/src/bindings.h @@ -123,10 +123,10 @@ EXPORT Status* graphRunnerQueueSerializationOutput(CGraphRunner instance, const EXPORT StatusOrString* graphRunnerGet(CGraphRunner instance); EXPORT Status* graphRunnerStop(CGraphRunner instance); -//EXPORT StatusOrSpeechToText* speechToTextOpen(const char* model_path, const char* device); -//EXPORT Status* speechToTextLoadVideo(CSpeechToText instance, const char* video_path); -//EXPORT StatusOrInt* speechToTextVideoDuration(CSpeechToText instance); -//EXPORT StatusOrModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language); +EXPORT StatusOrSpeechToText* speechToTextOpen(const char* model_path, const char* device); +EXPORT Status* speechToTextLoadVideo(CSpeechToText instance, const char* video_path); +EXPORT StatusOrInt* speechToTextVideoDuration(CSpeechToText instance); +EXPORT StatusOrModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language); EXPORT StatusOrDevices* getAvailableDevices(); Status* handle_exceptions(); From e32515493b7e55a64d7034b6a384f2c7e7e05a19 Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Tue, 19 Nov 2024 16:40:29 +0100 Subject: [PATCH 02/17] Working video and subtitles --- lib/main.dart | 2 + lib/pages/transcription/playground.dart | 79 +++++-- .../transcription/widgets/subtitles.dart | 59 +++++ linux/flutter/generated_plugin_registrant.cc | 8 + linux/flutter/generated_plugins.cmake | 3 + macos/Flutter/GeneratedPluginRegistrant.swift | 10 + macos/Podfile.lock | 36 ++++ pubspec.lock | 204 +++++++++++++++++- pubspec.yaml | 3 + .../flutter/generated_plugin_registrant.cc | 
9 + windows/flutter/generated_plugins.cmake | 4 + 11 files changed, 392 insertions(+), 25 deletions(-) create mode 100644 lib/pages/transcription/widgets/subtitles.dart diff --git a/lib/main.dart b/lib/main.dart index 9f019f0..fe0ba36 100644 --- a/lib/main.dart +++ b/lib/main.dart @@ -6,6 +6,7 @@ import 'package:inference/theme_fluent.dart'; import 'package:inference/providers/preference_provider.dart'; import 'package:inference/providers/project_provider.dart'; import 'package:inference/public_models.dart'; +import 'package:media_kit/media_kit.dart'; import 'package:provider/provider.dart'; @@ -25,6 +26,7 @@ void testConnection() async { } void main() { + MediaKit.ensureInitialized(); testConnection(); runApp(const App()); } diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart index 28417b8..0b2d74c 100644 --- a/lib/pages/transcription/playground.dart +++ b/lib/pages/transcription/playground.dart @@ -1,15 +1,19 @@ +import 'dart:async'; + import 'package:file_picker/file_picker.dart'; import 'package:fluent_ui/fluent_ui.dart'; import 'package:inference/pages/computer_vision/widgets/model_properties.dart'; import 'package:inference/pages/models/widgets/grid_container.dart'; +import 'package:inference/pages/transcription/widgets/subtitles.dart'; import 'package:inference/project.dart'; import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; import 'package:inference/theme_fluent.dart'; import 'package:inference/utils/drop_area.dart'; import 'package:inference/widgets/controls/no_outline_button.dart'; import 'package:inference/widgets/device_selector.dart'; -//import 'package:media_kit/media_kit.dart'; -//import 'package:media_kit_video/media_kit_video.dart'; +import 'package:intl/date_symbol_data_local.dart'; +import 'package:media_kit/media_kit.dart'; +import 'package:media_kit_video/media_kit_video.dart'; import 'package:provider/provider.dart'; class Playground extends StatefulWidget { @@ 
-20,9 +24,12 @@ class Playground extends StatefulWidget { State createState() => _PlaygroundState(); } -class _PlaygroundState extends State { - //late final player = Player(); - //late final controller = VideoController(player); +class _PlaygroundState extends State with TickerProviderStateMixin{ + final player = Player(); + late final controller = VideoController(player); + int subtitleIndex = 0; + StreamSubscription? listener; + void showUploadMenu() async { FilePickerResult? result = await FilePicker.platform.pickFiles(type: FileType.video); @@ -32,10 +39,29 @@ class _PlaygroundState extends State { } } + void positionListener(Duration position) { + int index = (position.inSeconds / transcriptionPeriod).floor(); + if (index != subtitleIndex) { + final inference = Provider.of(context, listen: false); + inference.skipTo(index); + setState(() { + subtitleIndex = index; + }); + } + } + + void initializeVideoAndListeners(String source) async { + await listener?.cancel(); + player.open(Media(source)); + player.setVolume(0); // TODO: Disable this for release. 
This is for our sanity + listener = player.stream.position.listen(positionListener); + } + void uploadFile(String file) async { final inference = Provider.of(context, listen: false); await inference.loadVideo(file); inference.startTranscribing(); + initializeVideoAndListeners(file); } @override @@ -72,24 +98,31 @@ class _PlaygroundState extends State { ), ), ), - Expanded( - child: GridContainer( - color: backgroundColor.of(theme), - child: Builder( - builder: (context) { - return DropArea( - type: "video", - showChild: false, - onUpload: (String file) { uploadFile(file); }, - extensions: const [], - child: Padding( - padding: const EdgeInsets.all(8.0), - child: Container(), - ), - ); - } - ), - ), + Consumer( + builder: (context, inference, child) { + return Expanded( + child: GridContainer( + color: backgroundColor.of(theme), + child: Builder( + builder: (context) { + return DropArea( + type: "video", + showChild: inference.videoLoaded, + onUpload: (String file) { uploadFile(file); }, + extensions: const [], + child: Stack( + alignment: Alignment.bottomCenter, + children: [ + Video(controller: controller), + Subtitles(transcription: inference.transcription, subtitleIndex: subtitleIndex), + ] + ), + ); + } + ), + ), + ); + } ) ], ), diff --git a/lib/pages/transcription/widgets/subtitles.dart b/lib/pages/transcription/widgets/subtitles.dart new file mode 100644 index 0000000..21c609b --- /dev/null +++ b/lib/pages/transcription/widgets/subtitles.dart @@ -0,0 +1,59 @@ +import 'dart:async'; + +import 'package:fluent_ui/fluent_ui.dart'; + +class Subtitles extends StatelessWidget { + const Subtitles({ + super.key, + required this.transcription, + required this.subtitleIndex, + }); + + final Map>? 
transcription; + final int subtitleIndex; + + static const double fontSize = 18; + + @override + Widget build(BuildContext context) { + return Padding( + padding: const EdgeInsets.only(left: 8, right: 8, bottom: 60), + child: SizedBox( + height: 100, + child: Builder( + builder: (context) { + if (transcription == null ) { + return Container(); + } + if (transcription![subtitleIndex] is String) { + return Stack( + alignment: Alignment.bottomCenter, + children: [ + Text( + transcription![subtitleIndex] as String, + textAlign: TextAlign.center, + style: TextStyle( + fontSize: fontSize, + foreground: Paint() + ..style = PaintingStyle.stroke + ..strokeWidth = 2 + ..color = Colors.black, + ) + ), + Text( + transcription![subtitleIndex] as String, + textAlign: TextAlign.center, + style: const TextStyle( + fontSize: fontSize + ) + ) + ], + ); + } + return Container(); + } + ), + ), + ); + } +} diff --git a/linux/flutter/generated_plugin_registrant.cc b/linux/flutter/generated_plugin_registrant.cc index 8e89f01..17066f6 100644 --- a/linux/flutter/generated_plugin_registrant.cc +++ b/linux/flutter/generated_plugin_registrant.cc @@ -8,6 +8,8 @@ #include #include +#include +#include #include void fl_register_plugins(FlPluginRegistry* registry) { @@ -17,6 +19,12 @@ void fl_register_plugins(FlPluginRegistry* registry) { g_autoptr(FlPluginRegistrar) flutter_acrylic_registrar = fl_plugin_registry_get_registrar_for_plugin(registry, "FlutterAcrylicPlugin"); flutter_acrylic_plugin_register_with_registrar(flutter_acrylic_registrar); + g_autoptr(FlPluginRegistrar) media_kit_libs_linux_registrar = + fl_plugin_registry_get_registrar_for_plugin(registry, "MediaKitLibsLinuxPlugin"); + media_kit_libs_linux_plugin_register_with_registrar(media_kit_libs_linux_registrar); + g_autoptr(FlPluginRegistrar) media_kit_video_registrar = + fl_plugin_registry_get_registrar_for_plugin(registry, "MediaKitVideoPlugin"); + media_kit_video_plugin_register_with_registrar(media_kit_video_registrar); 
g_autoptr(FlPluginRegistrar) system_theme_registrar = fl_plugin_registry_get_registrar_for_plugin(registry, "SystemThemePlugin"); system_theme_plugin_register_with_registrar(system_theme_registrar); diff --git a/linux/flutter/generated_plugins.cmake b/linux/flutter/generated_plugins.cmake index cc87f3a..386a1eb 100644 --- a/linux/flutter/generated_plugins.cmake +++ b/linux/flutter/generated_plugins.cmake @@ -5,10 +5,13 @@ list(APPEND FLUTTER_PLUGIN_LIST desktop_drop flutter_acrylic + media_kit_libs_linux + media_kit_video system_theme ) list(APPEND FLUTTER_FFI_PLUGIN_LIST + media_kit_native_event_loop ) set(PLUGIN_BUNDLED_LIBRARIES) diff --git a/macos/Flutter/GeneratedPluginRegistrant.swift b/macos/Flutter/GeneratedPluginRegistrant.swift index dc08871..c719943 100644 --- a/macos/Flutter/GeneratedPluginRegistrant.swift +++ b/macos/Flutter/GeneratedPluginRegistrant.swift @@ -7,12 +7,22 @@ import Foundation import desktop_drop import macos_window_utils +import media_kit_libs_macos_video +import media_kit_video +import package_info_plus import path_provider_foundation +import screen_brightness_macos import system_theme +import wakelock_plus func RegisterGeneratedPlugins(registry: FlutterPluginRegistry) { DesktopDropPlugin.register(with: registry.registrar(forPlugin: "DesktopDropPlugin")) MacOSWindowUtilsPlugin.register(with: registry.registrar(forPlugin: "MacOSWindowUtilsPlugin")) + MediaKitLibsMacosVideoPlugin.register(with: registry.registrar(forPlugin: "MediaKitLibsMacosVideoPlugin")) + MediaKitVideoPlugin.register(with: registry.registrar(forPlugin: "MediaKitVideoPlugin")) + FPPPackageInfoPlusPlugin.register(with: registry.registrar(forPlugin: "FPPPackageInfoPlusPlugin")) PathProviderPlugin.register(with: registry.registrar(forPlugin: "PathProviderPlugin")) + ScreenBrightnessMacosPlugin.register(with: registry.registrar(forPlugin: "ScreenBrightnessMacosPlugin")) SystemThemePlugin.register(with: registry.registrar(forPlugin: "SystemThemePlugin")) + 
WakelockPlusMacosPlugin.register(with: registry.registrar(forPlugin: "WakelockPlusMacosPlugin")) } diff --git a/macos/Podfile.lock b/macos/Podfile.lock index 5d8310b..c810432 100644 --- a/macos/Podfile.lock +++ b/macos/Podfile.lock @@ -4,18 +4,36 @@ PODS: - FlutterMacOS (1.0.0) - macos_window_utils (1.0.0): - FlutterMacOS + - media_kit_libs_macos_video (1.0.4): + - FlutterMacOS + - media_kit_native_event_loop (1.0.0): + - FlutterMacOS + - media_kit_video (0.0.1): + - FlutterMacOS + - package_info_plus (0.0.1): + - FlutterMacOS - path_provider_foundation (0.0.1): - Flutter - FlutterMacOS + - screen_brightness_macos (0.1.0): + - FlutterMacOS - system_theme (0.0.1): - FlutterMacOS + - wakelock_plus (0.0.1): + - FlutterMacOS DEPENDENCIES: - desktop_drop (from `Flutter/ephemeral/.symlinks/plugins/desktop_drop/macos`) - FlutterMacOS (from `Flutter/ephemeral`) - macos_window_utils (from `Flutter/ephemeral/.symlinks/plugins/macos_window_utils/macos`) + - media_kit_libs_macos_video (from `Flutter/ephemeral/.symlinks/plugins/media_kit_libs_macos_video/macos`) + - media_kit_native_event_loop (from `Flutter/ephemeral/.symlinks/plugins/media_kit_native_event_loop/macos`) + - media_kit_video (from `Flutter/ephemeral/.symlinks/plugins/media_kit_video/macos`) + - package_info_plus (from `Flutter/ephemeral/.symlinks/plugins/package_info_plus/macos`) - path_provider_foundation (from `Flutter/ephemeral/.symlinks/plugins/path_provider_foundation/darwin`) + - screen_brightness_macos (from `Flutter/ephemeral/.symlinks/plugins/screen_brightness_macos/macos`) - system_theme (from `Flutter/ephemeral/.symlinks/plugins/system_theme/macos`) + - wakelock_plus (from `Flutter/ephemeral/.symlinks/plugins/wakelock_plus/macos`) EXTERNAL SOURCES: desktop_drop: @@ -24,17 +42,35 @@ EXTERNAL SOURCES: :path: Flutter/ephemeral macos_window_utils: :path: Flutter/ephemeral/.symlinks/plugins/macos_window_utils/macos + media_kit_libs_macos_video: + :path: 
Flutter/ephemeral/.symlinks/plugins/media_kit_libs_macos_video/macos + media_kit_native_event_loop: + :path: Flutter/ephemeral/.symlinks/plugins/media_kit_native_event_loop/macos + media_kit_video: + :path: Flutter/ephemeral/.symlinks/plugins/media_kit_video/macos + package_info_plus: + :path: Flutter/ephemeral/.symlinks/plugins/package_info_plus/macos path_provider_foundation: :path: Flutter/ephemeral/.symlinks/plugins/path_provider_foundation/darwin + screen_brightness_macos: + :path: Flutter/ephemeral/.symlinks/plugins/screen_brightness_macos/macos system_theme: :path: Flutter/ephemeral/.symlinks/plugins/system_theme/macos + wakelock_plus: + :path: Flutter/ephemeral/.symlinks/plugins/wakelock_plus/macos SPEC CHECKSUMS: desktop_drop: 69eeff437544aa619c8db7f4481b3a65f7696898 FlutterMacOS: 8f6f14fa908a6fb3fba0cd85dbd81ec4b251fb24 macos_window_utils: 933f91f64805e2eb91a5bd057cf97cd097276663 + media_kit_libs_macos_video: b3e2bbec2eef97c285f2b1baa7963c67c753fb82 + media_kit_native_event_loop: 81fd5b45192b72f8b5b69eaf5b540f45777eb8d5 + media_kit_video: c75b07f14d59706c775778e4dd47dd027de8d1e5 + package_info_plus: 12f1c5c2cfe8727ca46cbd0b26677728972d9a5b path_provider_foundation: 2b6b4c569c0fb62ec74538f866245ac84301af46 + screen_brightness_macos: 2d6d3af2165592d9a55ffcd95b7550970e41ebda system_theme: c7b9f6659a5caa26c9bc2284da096781e9a6fcbc + wakelock_plus: 4783562c9a43d209c458cb9b30692134af456269 PODFILE CHECKSUM: 16208599a12443d53889ba2270a4985981cfb204 diff --git a/pubspec.lock b/pubspec.lock index 9e93ab4..d1a1ee8 100644 --- a/pubspec.lock +++ b/pubspec.lock @@ -230,6 +230,14 @@ packages: url: "https://pub.dev" source: hosted version: "2.3.7" + dbus: + dependency: transitive + description: + name: dbus + sha256: "365c771ac3b0e58845f39ec6deebc76e3276aa9922b0cc60840712094d9047ac" + url: "https://pub.dev" + source: hosted + version: "0.7.10" desktop_drop: dependency: "direct main" description: @@ -477,10 +485,10 @@ packages: dependency: transitive description: name: js 
- sha256: c1b2e9b5ea78c45e1a0788d29606ba27dc5f71f019f32ca5140f61ef071838cf + sha256: f2c445dce49627136094980615a031419f7f3eb393237e4ecd97ac15dea343f3 url: "https://pub.dev" source: hosted - version: "0.7.1" + version: "0.6.7" json_annotation: dependency: transitive description: @@ -569,6 +577,78 @@ packages: url: "https://pub.dev" source: hosted version: "2.6.0" + media_kit: + dependency: "direct main" + description: + name: media_kit + sha256: "1f1deee148533d75129a6f38251ff8388e33ee05fc2d20a6a80e57d6051b7b62" + url: "https://pub.dev" + source: hosted + version: "1.1.11" + media_kit_libs_android_video: + dependency: transitive + description: + name: media_kit_libs_android_video + sha256: "9dd8012572e4aff47516e55f2597998f0a378e3d588d0fad0ca1f11a53ae090c" + url: "https://pub.dev" + source: hosted + version: "1.3.6" + media_kit_libs_ios_video: + dependency: transitive + description: + name: media_kit_libs_ios_video + sha256: b5382994eb37a4564c368386c154ad70ba0cc78dacdd3fb0cd9f30db6d837991 + url: "https://pub.dev" + source: hosted + version: "1.1.4" + media_kit_libs_linux: + dependency: transitive + description: + name: media_kit_libs_linux + sha256: e186891c31daa6bedab4d74dcdb4e8adfccc7d786bfed6ad81fe24a3b3010310 + url: "https://pub.dev" + source: hosted + version: "1.1.3" + media_kit_libs_macos_video: + dependency: transitive + description: + name: media_kit_libs_macos_video + sha256: f26aa1452b665df288e360393758f84b911f70ffb3878032e1aabba23aa1032d + url: "https://pub.dev" + source: hosted + version: "1.1.4" + media_kit_libs_video: + dependency: "direct main" + description: + name: media_kit_libs_video + sha256: "20bb4aefa8fece282b59580e1cd8528117297083a6640c98c2e98cfc96b93288" + url: "https://pub.dev" + source: hosted + version: "1.0.5" + media_kit_libs_windows_video: + dependency: transitive + description: + name: media_kit_libs_windows_video + sha256: "32654572167825c42c55466f5d08eee23ea11061c84aa91b09d0e0f69bdd0887" + url: "https://pub.dev" + source: hosted + 
version: "1.0.10" + media_kit_native_event_loop: + dependency: transitive + description: + name: media_kit_native_event_loop + sha256: "7d82e3b3e9ded5c35c3146c5ba1da3118d1dd8ac3435bac7f29f458181471b40" + url: "https://pub.dev" + source: hosted + version: "1.0.9" + media_kit_video: + dependency: "direct main" + description: + name: media_kit_video + sha256: "2cc3b966679963ba25a4ce5b771e532a521ebde7c6aa20e9802bec95d9916c8f" + url: "https://pub.dev" + source: hosted + version: "1.2.5" meta: dependency: transitive description: @@ -609,6 +689,22 @@ packages: url: "https://pub.dev" source: hosted version: "2.1.0" + package_info_plus: + dependency: transitive + description: + name: package_info_plus + sha256: da8d9ac8c4b1df253d1a328b7bf01ae77ef132833479ab40763334db13b91cce + url: "https://pub.dev" + source: hosted + version: "8.1.1" + package_info_plus_platform_interface: + dependency: transitive + description: + name: package_info_plus_platform_interface + sha256: ac1f4a4847f1ade8e6a87d1f39f5d7c67490738642e2542f559ec38c37489a66 + url: "https://pub.dev" + source: hosted + version: "3.0.1" path: dependency: "direct main" description: @@ -753,6 +849,62 @@ packages: url: "https://pub.dev" source: hosted version: "4.1.0" + safe_local_storage: + dependency: transitive + description: + name: safe_local_storage + sha256: ede4eb6cb7d88a116b3d3bf1df70790b9e2038bc37cb19112e381217c74d9440 + url: "https://pub.dev" + source: hosted + version: "1.0.2" + screen_brightness: + dependency: transitive + description: + name: screen_brightness + sha256: ed8da4a4511e79422fc1aa88138e920e4008cd312b72cdaa15ccb426c0faaedd + url: "https://pub.dev" + source: hosted + version: "0.2.2+1" + screen_brightness_android: + dependency: transitive + description: + name: screen_brightness_android + sha256: "3df10961e3a9e968a5e076fe27e7f4741fa8a1d3950bdeb48cf121ed529d0caf" + url: "https://pub.dev" + source: hosted + version: "0.1.0+2" + screen_brightness_ios: + dependency: transitive + description: + name: 
screen_brightness_ios + sha256: "99adc3ca5490b8294284aad5fcc87f061ad685050e03cf45d3d018fe398fd9a2" + url: "https://pub.dev" + source: hosted + version: "0.1.0" + screen_brightness_macos: + dependency: transitive + description: + name: screen_brightness_macos + sha256: "64b34e7e3f4900d7687c8e8fb514246845a73ecec05ab53483ed025bd4a899fd" + url: "https://pub.dev" + source: hosted + version: "0.1.0+1" + screen_brightness_platform_interface: + dependency: transitive + description: + name: screen_brightness_platform_interface + sha256: b211d07f0c96637a15fb06f6168617e18030d5d74ad03795dd8547a52717c171 + url: "https://pub.dev" + source: hosted + version: "0.1.0" + screen_brightness_windows: + dependency: transitive + description: + name: screen_brightness_windows + sha256: "9261bf33d0fc2707d8cf16339ce25768100a65e70af0fcabaf032fc12408ba86" + url: "https://pub.dev" + source: hosted + version: "0.1.3" scroll_pos: dependency: transitive description: @@ -846,6 +998,14 @@ packages: url: "https://pub.dev" source: hosted version: "0.3.1" + synchronized: + dependency: transitive + description: + name: synchronized + sha256: "69fe30f3a8b04a0be0c15ae6490fc859a78ef4c43ae2dd5e8a623d45bfcf9225" + url: "https://pub.dev" + source: hosted + version: "3.3.0+3" system_theme: dependency: "direct main" description: @@ -894,6 +1054,22 @@ packages: url: "https://pub.dev" source: hosted version: "1.4.0" + universal_platform: + dependency: transitive + description: + name: universal_platform + sha256: "64e16458a0ea9b99260ceb5467a214c1f298d647c659af1bff6d3bf82536b1ec" + url: "https://pub.dev" + source: hosted + version: "1.1.0" + uri_parser: + dependency: transitive + description: + name: uri_parser + sha256: "6543c9fd86d2862fac55d800a43e67c0dcd1a41677cb69c2f8edfe73bbcf1835" + url: "https://pub.dev" + source: hosted + version: "2.0.2" uuid: dependency: "direct main" description: @@ -942,6 +1118,30 @@ packages: url: "https://pub.dev" source: hosted version: "14.2.5" + volume_controller: + dependency: 
transitive + description: + name: volume_controller + sha256: c71d4c62631305df63b72da79089e078af2659649301807fa746088f365cb48e + url: "https://pub.dev" + source: hosted + version: "2.0.8" + wakelock_plus: + dependency: transitive + description: + name: wakelock_plus + sha256: bf4ee6f17a2fa373ed3753ad0e602b7603f8c75af006d5b9bdade263928c0484 + url: "https://pub.dev" + source: hosted + version: "1.2.8" + wakelock_plus_platform_interface: + dependency: transitive + description: + name: wakelock_plus_platform_interface + sha256: "422d1cdbb448079a8a62a5a770b69baa489f8f7ca21aef47800c726d404f9d16" + url: "https://pub.dev" + source: hosted + version: "1.2.1" watcher: dependency: transitive description: diff --git a/pubspec.yaml b/pubspec.yaml index 5e35670..c3233b8 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -56,6 +56,9 @@ dependencies: fluent_ui: ^4.9.2 system_theme: ^3.1.2 flutter_acrylic: ^1.1.4 + media_kit: ^1.1.11 # Primary package. + media_kit_video: ^1.2.5 # For video rendering. + media_kit_libs_video: ^1.0.5 # Native video dependencies. 
dev_dependencies: flutter_test: diff --git a/windows/flutter/generated_plugin_registrant.cc b/windows/flutter/generated_plugin_registrant.cc index 909a92e..054d5c6 100644 --- a/windows/flutter/generated_plugin_registrant.cc +++ b/windows/flutter/generated_plugin_registrant.cc @@ -8,6 +8,9 @@ #include #include +#include +#include +#include #include void RegisterPlugins(flutter::PluginRegistry* registry) { @@ -15,6 +18,12 @@ void RegisterPlugins(flutter::PluginRegistry* registry) { registry->GetRegistrarForPlugin("DesktopDropPlugin")); FlutterAcrylicPluginRegisterWithRegistrar( registry->GetRegistrarForPlugin("FlutterAcrylicPlugin")); + MediaKitLibsWindowsVideoPluginCApiRegisterWithRegistrar( + registry->GetRegistrarForPlugin("MediaKitLibsWindowsVideoPluginCApi")); + MediaKitVideoPluginCApiRegisterWithRegistrar( + registry->GetRegistrarForPlugin("MediaKitVideoPluginCApi")); + ScreenBrightnessWindowsPluginRegisterWithRegistrar( + registry->GetRegistrarForPlugin("ScreenBrightnessWindowsPlugin")); SystemThemePluginRegisterWithRegistrar( registry->GetRegistrarForPlugin("SystemThemePlugin")); } diff --git a/windows/flutter/generated_plugins.cmake b/windows/flutter/generated_plugins.cmake index 1f4b61f..3c6f76d 100644 --- a/windows/flutter/generated_plugins.cmake +++ b/windows/flutter/generated_plugins.cmake @@ -5,10 +5,14 @@ list(APPEND FLUTTER_PLUGIN_LIST desktop_drop flutter_acrylic + media_kit_libs_windows_video + media_kit_video + screen_brightness_windows system_theme ) list(APPEND FLUTTER_FFI_PLUGIN_LIST + media_kit_native_event_loop ) set(PLUGIN_BUNDLED_LIBRARIES) From df3b0903b1e192d76489d762adb5375305d24f0b Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 13:29:07 +0100 Subject: [PATCH 03/17] Implement speech to text using chunks Gen AI allows you to output chunks when return_timestamps is true. The chunks are closer to actual sentences and the timestamps are better. I parse these chunks to get to combine them into better sentences. 
--- lib/interop/generated_bindings.dart | 68 +++++++++++- lib/interop/openvino_bindings.dart | 14 +++ lib/interop/speech_to_text.dart | 16 ++- lib/pages/transcription/playground.dart | 58 ++++++---- .../providers/speech_inference_provider.dart | 9 +- lib/pages/transcription/utils/message.dart | 38 +++++++ .../transcription/widgets/subtitles.dart | 12 +- .../transcription/widgets/transcription.dart | 104 ++++++++++++++++++ openvino_bindings/README.md | 2 +- openvino_bindings/WORKSPACE | 2 +- openvino_bindings/src/audio/speech_to_text.cc | 3 +- openvino_bindings/src/audio/speech_to_text.h | 3 +- openvino_bindings/src/bindings.cc | 29 ++++- openvino_bindings/src/bindings.h | 18 ++- 14 files changed, 330 insertions(+), 46 deletions(-) create mode 100644 lib/pages/transcription/utils/message.dart create mode 100644 lib/pages/transcription/widgets/transcription.dart diff --git a/lib/interop/generated_bindings.dart b/lib/interop/generated_bindings.dart index ea541a8..5699581 100644 --- a/lib/interop/generated_bindings.dart +++ b/lib/interop/generated_bindings.dart @@ -92,6 +92,37 @@ class OpenVINO { late final _freeStatusOrSpeechToText = _freeStatusOrSpeechToTextPtr .asFunction)>(); + void freeStatusOrModelResponse( + ffi.Pointer status, + ) { + return _freeStatusOrModelResponse( + status, + ); + } + + late final _freeStatusOrModelResponsePtr = _lookup< + ffi.NativeFunction< + ffi.Void Function(ffi.Pointer)>>( + 'freeStatusOrModelResponse'); + late final _freeStatusOrModelResponse = _freeStatusOrModelResponsePtr + .asFunction)>(); + + void freeStatusOrWhisperModelResponse( + ffi.Pointer status, + ) { + return _freeStatusOrWhisperModelResponse( + status, + ); + } + + late final _freeStatusOrWhisperModelResponsePtr = _lookup< + ffi.NativeFunction< + ffi.Void Function(ffi.Pointer)>>( + 'freeStatusOrWhisperModelResponse'); + late final _freeStatusOrWhisperModelResponse = + _freeStatusOrWhisperModelResponsePtr.asFunction< + void Function(ffi.Pointer)>(); + void 
freeStatusOrDevices( ffi.Pointer status, ) { @@ -618,7 +649,7 @@ class OpenVINO { late final _speechToTextVideoDuration = _speechToTextVideoDurationPtr .asFunction Function(CSpeechToText)>(); - ffi.Pointer speechToTextTranscribe( + ffi.Pointer speechToTextTranscribe( CSpeechToText instance, int start, int duration, @@ -634,10 +665,13 @@ class OpenVINO { late final _speechToTextTranscribePtr = _lookup< ffi.NativeFunction< - ffi.Pointer Function(CSpeechToText, ffi.Int, - ffi.Int, ffi.Pointer)>>('speechToTextTranscribe'); + ffi.Pointer Function( + CSpeechToText, + ffi.Int, + ffi.Int, + ffi.Pointer)>>('speechToTextTranscribe'); late final _speechToTextTranscribe = _speechToTextTranscribePtr.asFunction< - ffi.Pointer Function( + ffi.Pointer Function( CSpeechToText, int, int, ffi.Pointer)>(); ffi.Pointer getAvailableDevices() { @@ -744,6 +778,16 @@ final class Device extends ffi.Struct { external ffi.Pointer name; } +final class TranscriptionChunk extends ffi.Struct { + @ffi.Float() + external double start_ts; + + @ffi.Float() + external double end_ts; + + external ffi.Pointer text; +} + final class Status extends ffi.Struct { @ffi.Int() external int status; @@ -835,6 +879,22 @@ final class StatusOrModelResponse extends ffi.Struct { external ffi.Pointer value; } +final class StatusOrWhisperModelResponse extends ffi.Struct { + @ffi.Int() + external int status; + + external ffi.Pointer message; + + external Metrics metrics; + + external ffi.Pointer value; + + @ffi.Int() + external int size; + + external ffi.Pointer text; +} + final class StatusOrDevices extends ffi.Struct { @ffi.Int() external int status; diff --git a/lib/interop/openvino_bindings.dart b/lib/interop/openvino_bindings.dart index e11cc93..defe977 100644 --- a/lib/interop/openvino_bindings.dart +++ b/lib/interop/openvino_bindings.dart @@ -18,6 +18,20 @@ class SerializationOutput { } +class Chunk { + final double start; + final double end; + final String text; + const Chunk(this.start, this.end, this.text); +} 
+ +class TranscriptionModelResponse { + final List chunks; + final Metrics metrics; + final String text; + const TranscriptionModelResponse(this.chunks, this.metrics, this.text); +} + class ModelResponse { final String content; final Metrics metrics; diff --git a/lib/interop/speech_to_text.dart b/lib/interop/speech_to_text.dart index b81ed02..1e07f17 100644 --- a/lib/interop/speech_to_text.dart +++ b/lib/interop/speech_to_text.dart @@ -59,7 +59,7 @@ class SpeechToText { } } - Future transcribe(int start, int duration, String language) async{ + Future transcribe(int start, int duration, String language) async{ int instanceAddress = instance.ref.value.address; final result = await Isolate.run(() { final languagePtr = language.toNativeUtf8(); @@ -72,6 +72,18 @@ class SpeechToText { throw "SpeechToText LoadVideo error: ${result.ref.status} ${result.ref.message.toDartString()}"; } - return result.ref.value.toDartString(); + List chunks = []; + for (int i = 0; i < result.ref.size; i++) { + chunks.add(Chunk( + result.ref.value[i].start_ts, + result.ref.value[i].end_ts, + result.ref.value[i].text.toDartString() + )); + } + final metrics = result.ref.metrics; + final text = result.ref.text.toDartString(); + ov.freeStatusOrWhisperModelResponse(result); + + return TranscriptionModelResponse(chunks, metrics, text); } } diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart index 0b2d74c..4af04e3 100644 --- a/lib/pages/transcription/playground.dart +++ b/lib/pages/transcription/playground.dart @@ -5,13 +5,13 @@ import 'package:fluent_ui/fluent_ui.dart'; import 'package:inference/pages/computer_vision/widgets/model_properties.dart'; import 'package:inference/pages/models/widgets/grid_container.dart'; import 'package:inference/pages/transcription/widgets/subtitles.dart'; +import 'package:inference/pages/transcription/widgets/transcription.dart'; import 'package:inference/project.dart'; import 
'package:inference/pages/transcription/providers/speech_inference_provider.dart'; import 'package:inference/theme_fluent.dart'; import 'package:inference/utils/drop_area.dart'; import 'package:inference/widgets/controls/no_outline_button.dart'; import 'package:inference/widgets/device_selector.dart'; -import 'package:intl/date_symbol_data_local.dart'; import 'package:media_kit/media_kit.dart'; import 'package:media_kit_video/media_kit_video.dart'; import 'package:provider/provider.dart'; @@ -101,25 +101,42 @@ class _PlaygroundState extends State with TickerProviderStateMixin{ Consumer( builder: (context, inference, child) { return Expanded( - child: GridContainer( - color: backgroundColor.of(theme), - child: Builder( - builder: (context) { - return DropArea( - type: "video", - showChild: inference.videoLoaded, - onUpload: (String file) { uploadFile(file); }, - extensions: const [], - child: Stack( - alignment: Alignment.bottomCenter, - children: [ - Video(controller: controller), - Subtitles(transcription: inference.transcription, subtitleIndex: subtitleIndex), - ] - ), - ); - } - ), + child: Builder( + builder: (context) { + return DropArea( + type: "video", + showChild: inference.videoLoaded, + onUpload: (String file) { uploadFile(file); }, + extensions: const [], + child: Row( + crossAxisAlignment: CrossAxisAlignment.stretch, + children: [ + Expanded( + child: GridContainer( + color: backgroundColor.of(theme), + child: Stack( + alignment: Alignment.bottomCenter, + children: [ + Video(controller: controller), + Subtitles(transcription: inference.transcription, subtitleIndex: subtitleIndex), + ] + ), + ), + ), + SizedBox( + width: 312, + child: GridContainer( + color: backgroundColor.of(theme), + child: Transcription( + onSeek: player.seek, + transcription: inference.transcription + ), + ), + ) + ], + ), + ); + } ), ); } @@ -132,3 +149,4 @@ class _PlaygroundState extends State with TickerProviderStateMixin{ ); } } + diff --git 
a/lib/pages/transcription/providers/speech_inference_provider.dart b/lib/pages/transcription/providers/speech_inference_provider.dart index 9f658fe..2302b34 100644 --- a/lib/pages/transcription/providers/speech_inference_provider.dart +++ b/lib/pages/transcription/providers/speech_inference_provider.dart @@ -1,6 +1,7 @@ import 'dart:async'; import 'package:flutter/material.dart'; +import 'package:inference/interop/openvino_bindings.dart'; import 'package:inference/interop/speech_to_text.dart'; import 'package:inference/pages/transcription/utils/section.dart'; import 'package:inference/project.dart'; @@ -19,8 +20,8 @@ class SpeechInferenceProvider extends ChangeNotifier { bool get videoLoaded => _videoPath != null; - DynamicRangeLoading>? _transcription; - Map>? get transcription => _transcription?.data; + DynamicRangeLoading>? _transcription; + Map>? get transcription => _transcription?.data; String _language = ""; @@ -54,7 +55,7 @@ class SpeechInferenceProvider extends ChangeNotifier { _videoPath = path; final duration = await _inference!.loadVideo(path); final sections = (duration / transcriptionPeriod).ceil(); - _transcription = DynamicRangeLoading>(Section(0, sections)); + _transcription = DynamicRangeLoading>(Section(0, sections)); notifyListeners(); } @@ -76,7 +77,7 @@ class SpeechInferenceProvider extends ChangeNotifier { } } - Future transcribe(int start, int duration) async { + Future transcribe(int start, int duration) async { await loaded.future; return await _inference!.transcribe(start, duration, _language); } diff --git a/lib/pages/transcription/utils/message.dart b/lib/pages/transcription/utils/message.dart new file mode 100644 index 0000000..9568732 --- /dev/null +++ b/lib/pages/transcription/utils/message.dart @@ -0,0 +1,38 @@ +import 'dart:async'; + +import 'package:inference/interop/openvino_bindings.dart'; + +class Message { + String message; + final Duration position; + + Message(this.message, this.position); + + static List parse(Map> 
transcriptions, int indexDuration) { + final indices = transcriptions.keys.toList()..sort(); + if (indices.isEmpty) { + return []; + } + + List output = []; + + bool lastChunkIsOpenEnded = false; + + for (int i in indices) { + if (transcriptions[i] is Future) { + continue; + } + final part = transcriptions[i] as TranscriptionModelResponse; + for (final chunk in part.chunks) { + String text = chunk.text; + if (lastChunkIsOpenEnded) { + output.last.message += text; + } else { + output.add(Message(text.substring(1), Duration(seconds: chunk.start.toInt()))); + } + lastChunkIsOpenEnded = text[text.length - 1] != "."; + } + } + return output; + } +} diff --git a/lib/pages/transcription/widgets/subtitles.dart b/lib/pages/transcription/widgets/subtitles.dart index 21c609b..9971c9a 100644 --- a/lib/pages/transcription/widgets/subtitles.dart +++ b/lib/pages/transcription/widgets/subtitles.dart @@ -1,6 +1,7 @@ import 'dart:async'; import 'package:fluent_ui/fluent_ui.dart'; +import 'package:inference/interop/openvino_bindings.dart'; class Subtitles extends StatelessWidget { const Subtitles({ @@ -9,7 +10,7 @@ class Subtitles extends StatelessWidget { required this.subtitleIndex, }); - final Map>? transcription; + final Map>? 
transcription; final int subtitleIndex; static const double fontSize = 18; @@ -25,12 +26,12 @@ class Subtitles extends StatelessWidget { if (transcription == null ) { return Container(); } - if (transcription![subtitleIndex] is String) { + if (transcription![subtitleIndex] is TranscriptionModelResponse) { + final text = (transcription![subtitleIndex] as TranscriptionModelResponse).text; return Stack( alignment: Alignment.bottomCenter, children: [ - Text( - transcription![subtitleIndex] as String, + Text(text, textAlign: TextAlign.center, style: TextStyle( fontSize: fontSize, @@ -40,8 +41,7 @@ class Subtitles extends StatelessWidget { ..color = Colors.black, ) ), - Text( - transcription![subtitleIndex] as String, + Text(text, textAlign: TextAlign.center, style: const TextStyle( fontSize: fontSize diff --git a/lib/pages/transcription/widgets/transcription.dart b/lib/pages/transcription/widgets/transcription.dart new file mode 100644 index 0000000..f521fb8 --- /dev/null +++ b/lib/pages/transcription/widgets/transcription.dart @@ -0,0 +1,104 @@ +import 'dart:async'; + +import 'package:fluent_ui/fluent_ui.dart'; +import 'package:inference/interop/openvino_bindings.dart'; +import 'package:inference/pages/transcription/utils/message.dart'; +import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; +import 'package:inference/theme_fluent.dart'; + +String formatDuration(int totalSeconds) { + final duration = Duration(seconds: totalSeconds); + final minutes = duration.inMinutes; + final seconds = totalSeconds % 60; + + final minutesString = '$minutes'.padLeft(2, '0'); + final secondsString = '$seconds'.padLeft(2, '0'); + return '$minutesString:$secondsString'; +} + + + +class Transcription extends StatelessWidget { + final Map>? transcription; + final Function(Duration)? 
onSeek; + const Transcription({super.key, this.transcription, this.onSeek}); + + @override + Widget build(BuildContext context) { + if (transcription == null) { + return Container(); + } + + final messages = Message.parse(transcription!, transcriptionPeriod); + + return SingleChildScrollView( + child: Padding( + padding: const EdgeInsets.symmetric(horizontal: 8), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + for (final message in messages) + TranscriptionMessage(message: message, onSeek: onSeek) + ], + ), + ), + ); + } +} + +class TranscriptionMessage extends StatefulWidget { + final Function(Duration)? onSeek; + final Message message; + + const TranscriptionMessage({super.key, required this.message, this.onSeek}); + + @override + State createState() => _TranscriptionMessageState(); +} + +class _TranscriptionMessageState extends State { + bool hover = false; + + @override + Widget build(BuildContext context) { + final theme = FluentTheme.of(context); + return MouseRegion( + onEnter: (_) { + setState(() => hover = true); + }, + onExit: (_) { + setState(() => hover = false); + }, + child: GestureDetector( + onTap: () { + widget.onSeek?.call(widget.message.position); + }, + child: Padding( + padding: const EdgeInsets.symmetric(vertical: 20), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + Align( + alignment: Alignment.bottomRight, + child: Text(formatDuration(widget.message.position.inSeconds), + style: TextStyle( + fontSize: 9, + color: subtleTextColor.of(theme), + ) + ) + ), + Container( + decoration: BoxDecoration( + color: hover ? 
subtleTextColor.of(theme).withOpacity(0.3) : null, + borderRadius: const BorderRadius.all(Radius.circular(4)), + ), + padding: const EdgeInsets.symmetric(horizontal: 4, vertical: 2), + child: Text(widget.message.message) + ), + ], + ), + ), + ), + ); + } +} diff --git a/openvino_bindings/README.md b/openvino_bindings/README.md index 952b68b..d6f8c21 100644 --- a/openvino_bindings/README.md +++ b/openvino_bindings/README.md @@ -108,7 +108,7 @@ The DLLs (with dependencies) will be in `bazel-bin/windows_bindings.tar` [Install OpenVINO Runtime 24.5.0](https://docs.openvino.ai/2024/get-started/install-openvino.html?PACKAGE=OPENVINO_GENAI&VERSION=v_2024_4_0&OP_SYSTEM=MACOS&DISTRIBUTION=ARCHIVE) with GenAI flavor in `/opt/intel/openvino_24.5.0` and symlink to `/opt/intel/openvino`. Install OpenCV: `brew install opencv` -Install ffmpeg: `brew install ffmpeg@6` +Install ffmpeg: `brew install ffmpeg@6 && brew link ffmpeg@6` Run: `bazel build :macos_bindings` diff --git a/openvino_bindings/WORKSPACE b/openvino_bindings/WORKSPACE index 1740965..6a1707a 100644 --- a/openvino_bindings/WORKSPACE +++ b/openvino_bindings/WORKSPACE @@ -115,7 +115,7 @@ git_repository( new_local_repository( name = "mac_ffmpeg", build_file = "//third_party/ffmpeg:mac.BUILD", - path = "/opt/homebrew/Cellar/ffmpeg@6/6.1.2_3", + path = "/opt/homebrew/opt/ffmpeg@6", ) # #new_local_repository( diff --git a/openvino_bindings/src/audio/speech_to_text.cc b/openvino_bindings/src/audio/speech_to_text.cc index e39cb4d..4a2e101 100644 --- a/openvino_bindings/src/audio/speech_to_text.cc +++ b/openvino_bindings/src/audio/speech_to_text.cc @@ -8,7 +8,7 @@ void SpeechToText::load_video(std::string video_path) { audio_grabber = std::make_unique(video_path); } -ov::genai::DecodedResults SpeechToText::transcribe(int start, int duration, std::string language) { +ov::genai::WhisperDecodedResults SpeechToText::transcribe(int start, int duration, std::string language) { auto video_duration = audio_grabber->get_duration(); if 
(start > video_duration) { throw api_error(SpeechToTextChunkOutOfBounds); @@ -23,6 +23,7 @@ ov::genai::DecodedResults SpeechToText::transcribe(int start, int duration, std: if (data.empty()) { throw api_error(SpeechToTextChunkHasNoData); } + config.return_timestamps = true; config.max_new_tokens = 100; if (!language.empty()){ config.language = language; diff --git a/openvino_bindings/src/audio/speech_to_text.h b/openvino_bindings/src/audio/speech_to_text.h index c0c7c1e..f119ca7 100644 --- a/openvino_bindings/src/audio/speech_to_text.h +++ b/openvino_bindings/src/audio/speech_to_text.h @@ -1,6 +1,7 @@ #ifndef SPEECH_TO_TEXT_H_ #define SPEECH_TO_TEXT_H_ + #include #include "openvino/genai/whisper_pipeline.hpp" #include "audio_grabber.h" @@ -14,7 +15,7 @@ class SpeechToText { SpeechToText(std::string model_path, std::string device): pipe(model_path, device), config(model_path + "/generation_config.json") {} void load_video(std::string video_path); int64_t video_duration(); - ov::genai::DecodedResults transcribe(int start, int duration, std::string language); + ov::genai::WhisperDecodedResults transcribe(int start, int duration, std::string language); }; diff --git a/openvino_bindings/src/bindings.cc b/openvino_bindings/src/bindings.cc index 04b0efa..89ee853 100644 --- a/openvino_bindings/src/bindings.cc +++ b/openvino_bindings/src/bindings.cc @@ -39,6 +39,19 @@ void freeStatusOrImageInference(StatusOrString *status) { delete status; } +void freeStatusOrModelResponse(StatusOrModelResponse *status) { + //std::cout << "Freeing StatusOrModelResponse" << std::endl; + delete status; +} + +void freeStatusOrWhisperModelResponse(StatusOrWhisperModelResponse *status) { + if (status->status == StatusEnum::OkStatus) { + delete [] status->value; + status->value = NULL; // Prevent dangling pointers + } + delete status; +} + void freeStatusOrDevices(StatusOrDevices *status) { if (status->status == StatusEnum::OkStatus) { delete [] status->value; @@ -321,15 +334,21 @@ StatusOrInt*
speechToTextVideoDuration(CSpeechToText instance) { } } -StatusOrModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language) { +StatusOrWhisperModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language) { try { auto object = reinterpret_cast(instance); - auto result = object->transcribe(start, duration, language); - std::string text = result; - return new StatusOrModelResponse{OkStatus, "", convertToMetricsStruct(result.perf_metrics), strdup(text.c_str())}; + auto transcription_result = object->transcribe(start, duration, language); + auto chunks = transcription_result.chunks.value(); + std::string text = transcription_result; + TranscriptionChunk* result = new TranscriptionChunk[chunks.size()]; + for (int i = 0; i < chunks.size(); i++) { + auto r = chunks[i]; + result[i] = TranscriptionChunk{r.start_ts + start, r.end_ts + start, strdup(r.text.c_str())}; + } + return new StatusOrWhisperModelResponse{OkStatus, "", convertToMetricsStruct(transcription_result.perf_metrics), result, (int)chunks.size(), strdup(text.c_str())}; } catch (...) 
{ auto except = handle_exceptions(); - return new StatusOrModelResponse{except->status, except->message}; + return new StatusOrWhisperModelResponse{except->status, except->message}; } } diff --git a/openvino_bindings/src/bindings.h b/openvino_bindings/src/bindings.h index 4528916..1f27f62 100644 --- a/openvino_bindings/src/bindings.h +++ b/openvino_bindings/src/bindings.h @@ -26,6 +26,11 @@ typedef struct { const char* name; } Device; +typedef struct { + float start_ts; + float end_ts; + const char* text; +} TranscriptionChunk; typedef struct { enum StatusEnum status; @@ -81,6 +86,15 @@ typedef struct { const char* value; } StatusOrModelResponse; +typedef struct { + enum StatusEnum status; + const char* message; + Metrics metrics; + TranscriptionChunk* value; + int size; + const char* text; +} StatusOrWhisperModelResponse; + typedef struct { enum StatusEnum status; const char* message; @@ -96,6 +110,8 @@ EXPORT void freeStatusOrString(StatusOrString *status); EXPORT void freeStatusOrImageInference(StatusOrImageInference *status); EXPORT void freeStatusOrLLMInference(StatusOrLLMInference *status); EXPORT void freeStatusOrSpeechToText(StatusOrSpeechToText *status); +EXPORT void freeStatusOrModelResponse(StatusOrModelResponse *status); +EXPORT void freeStatusOrWhisperModelResponse(StatusOrWhisperModelResponse *status); EXPORT void freeStatusOrDevices(StatusOrDevices *status); EXPORT StatusOrImageInference* imageInferenceOpen(const char* model_path, const char* task, const char* device, const char* label_definitions_json); @@ -126,7 +142,7 @@ EXPORT Status* graphRunnerStop(CGraphRunner instance); EXPORT StatusOrSpeechToText* speechToTextOpen(const char* model_path, const char* device); EXPORT Status* speechToTextLoadVideo(CSpeechToText instance, const char* video_path); EXPORT StatusOrInt* speechToTextVideoDuration(CSpeechToText instance); -EXPORT StatusOrModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language); 
+EXPORT StatusOrWhisperModelResponse* speechToTextTranscribe(CSpeechToText instance, int start, int duration, const char* language); EXPORT StatusOrDevices* getAvailableDevices(); Status* handle_exceptions(); From c77fe809254be7db445a1f421428c494ea9a3d63 Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 14:16:42 +0100 Subject: [PATCH 04/17] Add download button and stub search bar to transcription --- lib/interop/speech_to_text.dart | 2 - lib/pages/transcription/playground.dart | 9 +- .../providers/speech_inference_provider.dart | 19 ++-- .../transcription/widgets/transcription.dart | 86 +++++++++++++++---- lib/widgets/controls/search_bar.dart | 2 +- 5 files changed, 89 insertions(+), 29 deletions(-) diff --git a/lib/interop/speech_to_text.dart b/lib/interop/speech_to_text.dart index 1e07f17..4f57cb2 100644 --- a/lib/interop/speech_to_text.dart +++ b/lib/interop/speech_to_text.dart @@ -9,8 +9,6 @@ final ov = getBindings(); class SpeechToText { final Pointer instance; - - SpeechToText(this.instance); static Future init(String modelPath, String device) async { diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart index 4af04e3..b1f8b54 100644 --- a/lib/pages/transcription/playground.dart +++ b/lib/pages/transcription/playground.dart @@ -118,18 +118,21 @@ class _PlaygroundState extends State with TickerProviderStateMixin{ alignment: Alignment.bottomCenter, children: [ Video(controller: controller), - Subtitles(transcription: inference.transcription, subtitleIndex: subtitleIndex), + Subtitles( + transcription: inference.transcription?.data, + subtitleIndex: subtitleIndex, + ), ] ), ), ), SizedBox( - width: 312, + width: 360, child: GridContainer( color: backgroundColor.of(theme), child: Transcription( onSeek: player.seek, - transcription: inference.transcription + transcription: inference.transcription, ), ), ) diff --git a/lib/pages/transcription/providers/speech_inference_provider.dart 
b/lib/pages/transcription/providers/speech_inference_provider.dart index 2302b34..8e574f2 100644 --- a/lib/pages/transcription/providers/speech_inference_provider.dart +++ b/lib/pages/transcription/providers/speech_inference_provider.dart @@ -20,8 +20,11 @@ class SpeechInferenceProvider extends ChangeNotifier { bool get videoLoaded => _videoPath != null; - DynamicRangeLoading>? _transcription; - Map>? get transcription => _transcription?.data; + DynamicRangeLoading>? transcription; + + bool get transcriptionComplete { + return transcription?.complete ?? false; + } String _language = ""; @@ -47,7 +50,7 @@ class SpeechInferenceProvider extends ChangeNotifier { } void skipTo(int index) { - _transcription!.skipTo(index); + transcription!.skipTo(index); } Future loadVideo(String path) async { @@ -55,20 +58,20 @@ class SpeechInferenceProvider extends ChangeNotifier { _videoPath = path; final duration = await _inference!.loadVideo(path); final sections = (duration / transcriptionPeriod).ceil(); - _transcription = DynamicRangeLoading>(Section(0, sections)); + transcription = DynamicRangeLoading>(Section(0, sections)); notifyListeners(); } Future startTranscribing() async { - if (_transcription == null) { + if (transcription == null) { throw Exception("Can't transcribe before loading video"); } - while (!_transcription!.complete) { - if (_transcription == null) { + while (!transcription!.complete) { + if (transcription == null) { return; } - await _transcription!.process((int i) { + await transcription!.process((int i) { return transcribe(i * transcriptionPeriod, transcriptionPeriod); }); if (hasListeners) { diff --git a/lib/pages/transcription/widgets/transcription.dart b/lib/pages/transcription/widgets/transcription.dart index f521fb8..9301aa1 100644 --- a/lib/pages/transcription/widgets/transcription.dart +++ b/lib/pages/transcription/widgets/transcription.dart @@ -1,10 +1,14 @@ import 'dart:async'; +import 'dart:io'; +import 'package:file_picker/file_picker.dart'; 
import 'package:fluent_ui/fluent_ui.dart'; import 'package:inference/interop/openvino_bindings.dart'; import 'package:inference/pages/transcription/utils/message.dart'; import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; +import 'package:inference/pages/transcription/utils/section.dart'; import 'package:inference/theme_fluent.dart'; +import 'package:inference/widgets/controls/search_bar.dart'; String formatDuration(int totalSeconds) { final duration = Duration(seconds: totalSeconds); @@ -19,9 +23,30 @@ String formatDuration(int totalSeconds) { class Transcription extends StatelessWidget { - final Map>? transcription; + final DynamicRangeLoading>? transcription; final Function(Duration)? onSeek; - const Transcription({super.key, this.transcription, this.onSeek}); + const Transcription({super.key, this.onSeek, this.transcription}); + + void saveTranscript() async { + final file = await FilePicker.platform.saveFile( + dialogTitle: "Please select an output file:", + fileName: "transcription.txt", + ); + if (file == null){ + return; + } + + String contents = ""; + final indices = transcription!.data.keys.toList()..sort(); + for (int i in indices) { + final part = transcription!.data[i] as TranscriptionModelResponse; + for (final chunk in part.chunks) { + contents += chunk.text; + } + } + + await File(file).writeAsString(contents); + } @override Widget build(BuildContext context) { @@ -29,19 +54,50 @@ class Transcription extends StatelessWidget { return Container(); } - final messages = Message.parse(transcription!, transcriptionPeriod); - - return SingleChildScrollView( - child: Padding( - padding: const EdgeInsets.symmetric(horizontal: 8), - child: Column( - crossAxisAlignment: CrossAxisAlignment.start, - children: [ - for (final message in messages) - TranscriptionMessage(message: message, onSeek: onSeek) - ], + final messages = Message.parse(transcription!.data, transcriptionPeriod); + + return Column( + children: [ + Padding( + 
padding: const EdgeInsets.symmetric(vertical: 25, horizontal: 14), + child: Row( + children: [ + SearchBar(onChange: (p) {}, placeholder: "Search in transcript",), + Padding( + padding: const EdgeInsets.only(left: 8.0), + child: Tooltip( + message: transcription!.complete + ? "Download transcript" + : "Transcribing...", + child: Button( + onPressed: transcription!.complete + ? () => saveTranscript() + : null, + child: const Padding( + padding: EdgeInsets.symmetric(vertical: 2), + child: Icon(FluentIcons.download), + ), + ), + ), + ) + ], + ), ), - ), + Expanded( + child: SingleChildScrollView( + child: Padding( + padding: const EdgeInsets.only(left: 10, right: 18), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + for (final message in messages) + TranscriptionMessage(message: message, onSeek: onSeek) + ], + ), + ), + ), + ), + ], ); } } @@ -74,7 +130,7 @@ class _TranscriptionMessageState extends State { widget.onSeek?.call(widget.message.position); }, child: Padding( - padding: const EdgeInsets.symmetric(vertical: 20), + padding: const EdgeInsets.symmetric(vertical: 20, horizontal: 4), child: Column( crossAxisAlignment: CrossAxisAlignment.start, children: [ diff --git a/lib/widgets/controls/search_bar.dart b/lib/widgets/controls/search_bar.dart index 7142406..7b3e6a7 100644 --- a/lib/widgets/controls/search_bar.dart +++ b/lib/widgets/controls/search_bar.dart @@ -50,4 +50,4 @@ class _SearchBarState extends State { ), ); } -} \ No newline at end of file +} From 960de3d378488fb6fda660692dce55b35e85850c Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 16:16:15 +0100 Subject: [PATCH 05/17] Implement search No tests and implementation is a bit ugly. But it works. 
--- lib/pages/transcription/playground.dart | 15 +- lib/pages/transcription/utils/section.dart | 3 +- .../transcription/widgets/paragraph.dart | 95 +++++++++++ .../transcription/widgets/transcription.dart | 155 ++++++++---------- 4 files changed, 177 insertions(+), 91 deletions(-) create mode 100644 lib/pages/transcription/widgets/paragraph.dart diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart index b1f8b54..e9b7672 100644 --- a/lib/pages/transcription/playground.dart +++ b/lib/pages/transcription/playground.dart @@ -6,6 +6,7 @@ import 'package:inference/pages/computer_vision/widgets/model_properties.dart'; import 'package:inference/pages/models/widgets/grid_container.dart'; import 'package:inference/pages/transcription/widgets/subtitles.dart'; import 'package:inference/pages/transcription/widgets/transcription.dart'; +import 'package:inference/pages/transcription/utils/message.dart'; import 'package:inference/project.dart'; import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; import 'package:inference/theme_fluent.dart'; @@ -130,9 +131,17 @@ class _PlaygroundState extends State with TickerProviderStateMixin{ width: 360, child: GridContainer( color: backgroundColor.of(theme), - child: Transcription( - onSeek: player.seek, - transcription: inference.transcription, + child: Builder( + builder: (context) { + if (inference.transcription == null) { + return Container(); + } + return Transcription( + onSeek: player.seek, + transcription: inference.transcription!, + messages: Message.parse(inference.transcription!.data, transcriptionPeriod), + ); + } ), ), ) diff --git a/lib/pages/transcription/utils/section.dart b/lib/pages/transcription/utils/section.dart index 27ede1d..5c731b1 100644 --- a/lib/pages/transcription/utils/section.dart +++ b/lib/pages/transcription/utils/section.dart @@ -10,9 +10,10 @@ void moveToEnd(List list, I item) { class DynamicRangeLoading { List
sections = []; + int? size; Map data = {}; - DynamicRangeLoading(Section section): sections = [section]; + DynamicRangeLoading(Section section): sections = [section], size = section.end; Section get activeSection => sections.first; diff --git a/lib/pages/transcription/widgets/paragraph.dart b/lib/pages/transcription/widgets/paragraph.dart new file mode 100644 index 0000000..1189580 --- /dev/null +++ b/lib/pages/transcription/widgets/paragraph.dart @@ -0,0 +1,95 @@ + +import 'package:fluent_ui/fluent_ui.dart'; +import 'package:inference/theme_fluent.dart'; +import '../utils/message.dart'; + +String formatDuration(int totalSeconds) { + final duration = Duration(seconds: totalSeconds); + final minutes = duration.inMinutes; + final seconds = totalSeconds % 60; + + final minutesString = '$minutes'.padLeft(2, '0'); + final secondsString = '$seconds'.padLeft(2, '0'); + return '$minutesString:$secondsString'; +} + +class Paragraph extends StatefulWidget { + final Function(Duration)? onSeek; + final Message message; + final String? 
highlightedText; + + const Paragraph({super.key, required this.message, this.onSeek, this.highlightedText}); + + @override + State createState() => _ParagraphState(); +} + +class _ParagraphState extends State { + bool hover = false; + + @override + Widget build(BuildContext context) { + final theme = FluentTheme.of(context); + List pieces = []; + if (widget.highlightedText != null) { + final pattern = RegExp(widget.highlightedText!, caseSensitive: false); + final sections = widget.message.message.split(pattern); + if (sections.isNotEmpty) { + pieces.add(TextSpan(text: sections.first)); + for (int i = 1; i < sections.length; i++) { + pieces.add( + TextSpan( + text: widget.highlightedText!, + style: TextStyle(backgroundColor: theme.accentColor), + ) + ); + pieces.add(TextSpan(text: sections[i])); + } + } + } else { + pieces.add(TextSpan(text: widget.message.message)); + } + return MouseRegion( + onEnter: (_) { + setState(() => hover = true); + }, + onExit: (_) { + setState(() => hover = false); + }, + child: GestureDetector( + onTap: () { + widget.onSeek?.call(widget.message.position); + }, + child: Padding( + padding: const EdgeInsets.symmetric(vertical: 20, horizontal: 4), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + Align( + alignment: Alignment.bottomRight, + child: Text(formatDuration(widget.message.position.inSeconds), + style: TextStyle( + fontSize: 9, + color: subtleTextColor.of(theme), + ) + ) + ), + Container( + decoration: BoxDecoration( + color: hover ? 
subtleTextColor.of(theme).withOpacity(0.3) : null, + borderRadius: const BorderRadius.all(Radius.circular(4)), + ), + padding: const EdgeInsets.symmetric(horizontal: 4, vertical: 2), + child: RichText( + text: TextSpan( + children: pieces + ) + ) + ), + ], + ), + ), + ), + ); + } +} diff --git a/lib/pages/transcription/widgets/transcription.dart b/lib/pages/transcription/widgets/transcription.dart index 9301aa1..8c040da 100644 --- a/lib/pages/transcription/widgets/transcription.dart +++ b/lib/pages/transcription/widgets/transcription.dart @@ -5,27 +5,26 @@ import 'package:file_picker/file_picker.dart'; import 'package:fluent_ui/fluent_ui.dart'; import 'package:inference/interop/openvino_bindings.dart'; import 'package:inference/pages/transcription/utils/message.dart'; -import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; import 'package:inference/pages/transcription/utils/section.dart'; -import 'package:inference/theme_fluent.dart'; +import 'package:inference/pages/transcription/widgets/paragraph.dart'; import 'package:inference/widgets/controls/search_bar.dart'; -String formatDuration(int totalSeconds) { - final duration = Duration(seconds: totalSeconds); - final minutes = duration.inMinutes; - final seconds = totalSeconds % 60; - final minutesString = '$minutes'.padLeft(2, '0'); - final secondsString = '$seconds'.padLeft(2, '0'); - return '$minutesString:$secondsString'; -} - - - -class Transcription extends StatelessWidget { +class Transcription extends StatefulWidget { final DynamicRangeLoading>? transcription; final Function(Duration)? 
onSeek; - const Transcription({super.key, this.onSeek, this.transcription}); + final List messages; + const Transcription({super.key, this.onSeek, this.transcription, required this.messages}); + + @override + State createState() => _TranscriptionState(); +} + +class _TranscriptionState extends State { + final List _paragraphKeys = []; + final ScrollController _scrollController = ScrollController(); + final GlobalKey scrollKey = GlobalKey(); + String? searchText; void saveTranscript() async { final file = await FilePicker.platform.saveFile( @@ -37,9 +36,9 @@ class Transcription extends StatelessWidget { } String contents = ""; - final indices = transcription!.data.keys.toList()..sort(); + final indices = widget.transcription!.data.keys.toList()..sort(); for (int i in indices) { - final part = transcription!.data[i] as TranscriptionModelResponse; + final part = widget.transcription!.data[i] as TranscriptionModelResponse; for (final chunk in part.chunks) { contents += chunk.text; } @@ -48,29 +47,55 @@ class Transcription extends StatelessWidget { await File(file).writeAsString(contents); } - @override - Widget build(BuildContext context) { - if (transcription == null) { - return Container(); - } + void search(String text) { + setState(() { + searchText = text; + }); + + final pattern = RegExp(text, caseSensitive: false); + int? 
index; + for (int i = 0; i < widget.messages.length; i++) { + if (widget.messages[i].message.contains(pattern)) { + index = i; + break; + } - final messages = Message.parse(transcription!.data, transcriptionPeriod); + } + if (index != null){ + final context = _paragraphKeys[index].currentContext; + + if (context != null) { + final renderBox = context.findRenderObject() as RenderBox?; + if (renderBox != null) { + final position = renderBox.localToGlobal(Offset.zero, ancestor: scrollKey.currentContext?.findRenderObject()); + final offset = _scrollController.offset + position.dy; + _scrollController.animateTo( + offset, + duration: const Duration(milliseconds: 500), + curve: Curves.easeInOut, + ); + } + } + } + } + @override + Widget build(BuildContext context) { return Column( children: [ Padding( padding: const EdgeInsets.symmetric(vertical: 25, horizontal: 14), child: Row( children: [ - SearchBar(onChange: (p) {}, placeholder: "Search in transcript",), + SearchBar(onChange: search, placeholder: "Search in transcript",), Padding( padding: const EdgeInsets.only(left: 8.0), child: Tooltip( - message: transcription!.complete + message: widget.transcription!.complete ? "Download transcript" : "Transcribing...", child: Button( - onPressed: transcription!.complete + onPressed: widget.transcription?.complete ?? false ? () => saveTranscript() : null, child: const Padding( @@ -85,14 +110,27 @@ class Transcription extends StatelessWidget { ), Expanded( child: SingleChildScrollView( + key: scrollKey, + controller: _scrollController, child: Padding( padding: const EdgeInsets.only(left: 10, right: 18), child: Column( crossAxisAlignment: CrossAxisAlignment.start, - children: [ - for (final message in messages) - TranscriptionMessage(message: message, onSeek: onSeek) - ], + children: List.generate(widget.messages.length, (index) { + //Adjusting state in render is ugly. 
But might just work + if (_paragraphKeys.length <= index) { + print("length: ${_paragraphKeys.length}, index: $index"); + _paragraphKeys.add(GlobalKey()); + } + + return Paragraph( + key: _paragraphKeys[index], + message: widget.messages[index], + highlightedText: searchText, + onSeek: widget.onSeek, + ); + + }), ), ), ), @@ -101,60 +139,3 @@ class Transcription extends StatelessWidget { ); } } - -class TranscriptionMessage extends StatefulWidget { - final Function(Duration)? onSeek; - final Message message; - - const TranscriptionMessage({super.key, required this.message, this.onSeek}); - - @override - State createState() => _TranscriptionMessageState(); -} - -class _TranscriptionMessageState extends State { - bool hover = false; - - @override - Widget build(BuildContext context) { - final theme = FluentTheme.of(context); - return MouseRegion( - onEnter: (_) { - setState(() => hover = true); - }, - onExit: (_) { - setState(() => hover = false); - }, - child: GestureDetector( - onTap: () { - widget.onSeek?.call(widget.message.position); - }, - child: Padding( - padding: const EdgeInsets.symmetric(vertical: 20, horizontal: 4), - child: Column( - crossAxisAlignment: CrossAxisAlignment.start, - children: [ - Align( - alignment: Alignment.bottomRight, - child: Text(formatDuration(widget.message.position.inSeconds), - style: TextStyle( - fontSize: 9, - color: subtleTextColor.of(theme), - ) - ) - ), - Container( - decoration: BoxDecoration( - color: hover ? 
subtleTextColor.of(theme).withOpacity(0.3) : null, - borderRadius: const BorderRadius.all(Radius.circular(4)), - ), - padding: const EdgeInsets.symmetric(horizontal: 4, vertical: 2), - child: Text(widget.message.message) - ), - ], - ), - ), - ), - ); - } -} From 56ec80d5509ebf65678e75baa71a3829058025e9 Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 16:28:02 +0100 Subject: [PATCH 06/17] Explain ugly state change on build --- lib/pages/transcription/widgets/transcription.dart | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pages/transcription/widgets/transcription.dart b/lib/pages/transcription/widgets/transcription.dart index 8c040da..b02fcdc 100644 --- a/lib/pages/transcription/widgets/transcription.dart +++ b/lib/pages/transcription/widgets/transcription.dart @@ -117,9 +117,9 @@ class _TranscriptionState extends State { child: Column( crossAxisAlignment: CrossAxisAlignment.start, children: List.generate(widget.messages.length, (index) { - //Adjusting state in render is ugly. But might just work + // Adjusting state in render is ugly. But works. + // This is done because we need a global key but the paragraphs are added as you go. 
if (_paragraphKeys.length <= index) { - print("length: ${_paragraphKeys.length}, index: $index"); _paragraphKeys.add(GlobalKey()); } From 2c5db4afbbc55cbe286ae841ac3facddaf49f643 Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 16:34:11 +0100 Subject: [PATCH 07/17] Hide performance metrics page for now --- lib/pages/transcription/transcription.dart | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/pages/transcription/transcription.dart b/lib/pages/transcription/transcription.dart index 14353d9..0ea6f14 100644 --- a/lib/pages/transcription/transcription.dart +++ b/lib/pages/transcription/transcription.dart @@ -80,11 +80,11 @@ class _TranscriptionPageState extends State { title: const Text("Playground"), body: Playground(project: widget.project), ), - PaneItem( - icon: const Icon(FluentIcons.project_collection), - title: const Text("Performance metrics"), - body: Container(), - ), + //PaneItem( + // icon: const Icon(FluentIcons.project_collection), + // title: const Text("Performance metrics"), + // body: Container(), + //), ], ) ), From d4195a3a966511c54e2b5d1909463ec1c3107388 Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 16:43:07 +0100 Subject: [PATCH 08/17] Fix old drop area from being used --- lib/pages/transcription/playground.dart | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart index e9b7672..5577170 100644 --- a/lib/pages/transcription/playground.dart +++ b/lib/pages/transcription/playground.dart @@ -10,7 +10,7 @@ import 'package:inference/pages/transcription/utils/message.dart'; import 'package:inference/project.dart'; import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; import 'package:inference/theme_fluent.dart'; -import 'package:inference/utils/drop_area.dart'; +import 'package:inference/widgets/controls/drop_area.dart'; import 
'package:inference/widgets/controls/no_outline_button.dart'; import 'package:inference/widgets/device_selector.dart'; import 'package:media_kit/media_kit.dart'; From f1084d6123cccf1605a0e234587f38f1119711af Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 17:03:39 +0100 Subject: [PATCH 09/17] Fix color for paragraph in transcription --- lib/pages/transcription/widgets/paragraph.dart | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/pages/transcription/widgets/paragraph.dart b/lib/pages/transcription/widgets/paragraph.dart index 1189580..c6ca4f1 100644 --- a/lib/pages/transcription/widgets/paragraph.dart +++ b/lib/pages/transcription/widgets/paragraph.dart @@ -82,6 +82,9 @@ class _ParagraphState extends State { padding: const EdgeInsets.symmetric(horizontal: 4, vertical: 2), child: RichText( text: TextSpan( + style: TextStyle( + color: theme.inactiveColor + ), children: pieces ) ) From 60e398665d6d39a08d889895401fd0a41074c82c Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 17:05:17 +0100 Subject: [PATCH 10/17] Add dispose for video player --- lib/pages/transcription/playground.dart | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart index 5577170..344d00f 100644 --- a/lib/pages/transcription/playground.dart +++ b/lib/pages/transcription/playground.dart @@ -65,6 +65,12 @@ class _PlaygroundState extends State with TickerProviderStateMixin{ initializeVideoAndListeners(file); } + @override + void dispose() { + player.dispose(); + super.dispose(); + } + @override Widget build(BuildContext context) { final theme = FluentTheme.of(context); From 72de8efb51dba110406af094612147c97fdf977f Mon Sep 17 00:00:00 2001 From: "Hecker, Ronald" Date: Wed, 20 Nov 2024 17:10:27 +0100 Subject: [PATCH 11/17] Reimplement section tests and fix test for computer vision --- .../computer_vision/computer_vision.dart | 1 - .../model_properties_test.dart | 4 +- 
.../transcriptions/utils/section_test.dart | 99 +++++++++++++++++ test/section_test.dart | 100 ------------------ 4 files changed, 101 insertions(+), 103 deletions(-) create mode 100644 test/pages/transcriptions/utils/section_test.dart delete mode 100644 test/section_test.dart diff --git a/lib/pages/computer_vision/computer_vision.dart b/lib/pages/computer_vision/computer_vision.dart index 8c7522c..715a270 100644 --- a/lib/pages/computer_vision/computer_vision.dart +++ b/lib/pages/computer_vision/computer_vision.dart @@ -2,7 +2,6 @@ import 'package:fluent_ui/fluent_ui.dart'; import 'package:go_router/go_router.dart'; import 'package:inference/pages/computer_vision/batch_inference.dart'; import 'package:inference/pages/computer_vision/live_inference.dart'; -import 'package:inference/pages/models/widgets/grid_container.dart'; import 'package:inference/project.dart'; import 'package:inference/providers/image_inference_provider.dart'; import 'package:inference/providers/preference_provider.dart'; diff --git a/test/pages/computer_vision/model_properties_test.dart b/test/pages/computer_vision/model_properties_test.dart index 70d1e6e..d034774 100644 --- a/test/pages/computer_vision/model_properties_test.dart +++ b/test/pages/computer_vision/model_properties_test.dart @@ -22,8 +22,8 @@ Widget testWidget(ImageInferenceProvider provider) { ), ], child: FluentApp( - home: const Center( - child: ModelProperties() + home: Center( + child: ModelProperties(project: provider.project) ), ), ); diff --git a/test/pages/transcriptions/utils/section_test.dart b/test/pages/transcriptions/utils/section_test.dart new file mode 100644 index 0000000..3452a62 --- /dev/null +++ b/test/pages/transcriptions/utils/section_test.dart @@ -0,0 +1,99 @@ +import 'package:flutter_test/flutter_test.dart'; +import 'package:inference/pages/transcription/utils/section.dart'; + +void main() { + group("Section", () { + group("process", () { + test("process sets values in data", () async { + final state = 
DynamicRangeLoading(Section(0, 10)); + for (int j = 0; j < 10; j++) { + await state.process((i) async { + return j; + }); + + expect(state.data[j], j); + } + }); + + test("process out of bounds throws error", () async { + final state = DynamicRangeLoading(Section(0, 10)); + for (int j = 0; j < 10; j++) { + await state.process((i) async { + return j; + }); + } + + expect(() async { + await state.process((i) async { + return 1; + }); + }, throwsException); + }); + + test("process continues after skip is done", () async { + final state = DynamicRangeLoading(Section(0, 10)); + state.skipTo(8); + for (int j = 0; j < 2; j++) { + await state.process((i) async { + return j; + }); + } + expect(state.getNextIndex(), 0); + }); + + }); + + test('getNextIndex throws error when state is complete', () { + final state = DynamicRangeLoading(Section(0, 0)); + expect(() { + state.getNextIndex(); + },throwsException); + }); + + test('complete', () async { + final state = DynamicRangeLoading(Section(0, 10)); + for (int j = 0; j < 10; j++) { + expect(state.complete, false); + await state.process((i) async { + return j; + }); + } + expect(state.complete, true); + }); + + group("skip", () { + test("skips to specific index", () async { + final state = DynamicRangeLoading(Section(0, 10)); + state.skipTo(5); + expect(state.getNextIndex(), 5); + expect(state.activeSection.begin, 5); + expect(state.activeSection.end, 10); + }); + + test("skips to partially complete section will go to end of that section ", () async { + final state = DynamicRangeLoading(Section(0, 10)); + + for (int j = 0; j < 8; j++) { + await state.process((i) async { + return j; + }); + } + state.skipTo(5); + expect(state.getNextIndex(), 8); + }); + + test("skips to fully complete section will not shift next index", () async { + final state = DynamicRangeLoading(Section(0, 10)); + state.skipTo(5); + + for (int j = 0; j < 5; j++) { + await state.process((i) async { + return j; + }); + } + state.skipTo(5); + 
expect(state.getNextIndex(), 0); + }); + }); + }); +} diff --git a/test/section_test.dart b/test/section_test.dart deleted file mode 100644 index 7aebd68..0000000 --- a/test/section_test.dart +++ /dev/null @@ -1,100 +0,0 @@ -import 'package:flutter_test/flutter_test.dart'; - -void main() { - /* - group("Section", () { - group("process", () { - // test("process sets values in data", () async { - // final state = DynamicRangeLoading(Section(0, 10)); - // for (int j = 0; j < 10; j++) { - // await state.process((i) async { - // return j; - // }); - - // expect(state.data[j], j); - // } - // }); - - // test("process out of bounds throws error", () async { - // final state = DynamicRangeLoading(Section(0, 10)); - // for (int j = 0; j < 10; j++) { - // await state.process((i) async { - // return j; - // }); - // } - - // expect(() async { - // await state.process((i) async { - // return 1; - // }); - // }, throwsException); - // }); - - // test("process continues after skip is done", () async { - // final state = DynamicRangeLoading(Section(0, 10)); - // state.skipTo(8); - // for (int j = 0; j < 2; j++) { - // await state.process((i) async { - // return j; - // }); - // } - // expect(state.getNextIndex(), 0); - // }); - - // }); - - // test('getNextIndex throws error when state is complete', () { - // final state = DynamicRangeLoading(Section(0, 0)); - // expect(() { - // state.getNextIndex(); - // },throwsException); - // }); - - // test('complete', () async { - // final state = DynamicRangeLoading(Section(0, 10)); - // for (int j = 0; j < 10; j++) { - // expect(state.complete, false); - // await state.process((i) async { - // return j; - // }); - // } - // expect(state.complete, true); - // }); - - // group("skip", () { - // test("skips to specific index", () async { - // final state = DynamicRangeLoading(Section(0, 10)); - // state.skipTo(5); - // expect(state.getNextIndex(), 5); - // expect(state.activeSection.begin, 5); - // expect(state.activeSection.end, 10); - // 
}); - - // test("skips to partially complete section will go to end of that section ", () async { - // final state = DynamicRangeLoading(Section(0, 10)); - - // for (int j = 0; j < 8; j++) { - // await state.process((i) async { - // return j; - // }); - // } - // state.skipTo(5); - // expect(state.getNextIndex(), 8); - // }); - - // test("skips to fully complete section will not shift next index", () async { - // final state = DynamicRangeLoading(Section(0, 10)); - // state.skipTo(5); - - // for (int j = 0; j < 5; j++) { - // await state.process((i) async { - // return j; - // }); - // } - // state.skipTo(5); - // expect(state.getNextIndex(), 0); - // }); - }); - }); - */ -} From bb635ac6498cd707cdd80b3a10b8284595fba07c Mon Sep 17 00:00:00 2001 From: Ronald Hecker Date: Thu, 21 Nov 2024 14:51:39 +0100 Subject: [PATCH 12/17] Lock ffmpeg in windows to 6.1.1 --- .github/workflows/windows-build.yaml | 3 ++- openvino_bindings/README.md | 6 +++++- openvino_bindings/WORKSPACE | 24 ++++++++++++------------ openvino_bindings/third_party/.gitignore | 1 + openvino_bindings/third_party/vcpkg.json | 10 ++++++++++ 5 files changed, 30 insertions(+), 14 deletions(-) create mode 100644 openvino_bindings/third_party/.gitignore create mode 100644 openvino_bindings/third_party/vcpkg.json diff --git a/.github/workflows/windows-build.yaml b/.github/workflows/windows-build.yaml index e0beccd..8b47c6d 100644 --- a/.github/workflows/windows-build.yaml +++ b/.github/workflows/windows-build.yaml @@ -76,7 +76,8 @@ jobs: run: | git clone https://github.com/microsoft/vcpkg.git C:\vcpkg C:\vcpkg\bootstrap-vcpkg.bat - C:\vcpkg\vcpkg install ffmpeg + cd openvino_bindings/third_party + C:\vcpkg\vcpkg install shell: cmd # Step 10: Download and Install OpenVINO Runtime diff --git a/openvino_bindings/README.md b/openvino_bindings/README.md index d6f8c21..4715bef 100644 --- a/openvino_bindings/README.md +++ b/openvino_bindings/README.md @@ -95,7 +95,11 @@ A step by step guide can be found 
[here]('./docs/WINDOWS.md'). [Install OpenVINO Runtime 24.5.0]( https://docs.openvino.ai/2024/get-started/install-openvino.html?PACKAGE=OPENVINO_GENAI&VERSION=v_2024_4_0&OP_SYSTEM=WINDOWS&DISTRIBUTION=ARCHIVE) with GenAI flavor in `C:/Intel/openvino_24.5.0`. Build OpenCV in `C:/opencv/build`. -Install ffmpeg: `vcpkg install ffmpeg`. +Install ffmpeg: +```sh +cd openvino_bindings/third_party +vcpkg install +``` Install [mediapipe requirements](https://ai.google.dev/edge/mediapipe/framework/getting_started/install#installing_on_windows) and setup the environment variables. diff --git a/openvino_bindings/WORKSPACE b/openvino_bindings/WORKSPACE index 6a1707a..1238a9d 100644 --- a/openvino_bindings/WORKSPACE +++ b/openvino_bindings/WORKSPACE @@ -106,23 +106,23 @@ git_repository( tag = "v3.11.3", ) -#new_local_repository( -# name = "linux_ffmpeg", -# build_file = "//third_party/ffmpeg:linux.BUILD", -# path = "/usr" -#) -# +new_local_repository( + name = "linux_ffmpeg", + build_file = "//third_party/ffmpeg:linux.BUILD", + path = "/usr" +) + new_local_repository( name = "mac_ffmpeg", build_file = "//third_party/ffmpeg:mac.BUILD", path = "/opt/homebrew/opt/ffmpeg@6", ) -# -#new_local_repository( -# name = "windows_ffmpeg", -# build_file = "//third_party/ffmpeg:windows.BUILD", -# path = "C:/vcpkg/packages/ffmpeg_x64-windows", -#) + +new_local_repository( + name = "windows_ffmpeg", + build_file = "//third_party/ffmpeg:windows.BUILD", + path = "./third_party/vcpkg_installed/x64-windows", +) http_archive( name = "rules_pkg", diff --git a/openvino_bindings/third_party/.gitignore b/openvino_bindings/third_party/.gitignore new file mode 100644 index 0000000..8a1403e --- /dev/null +++ b/openvino_bindings/third_party/.gitignore @@ -0,0 +1 @@ +vcpkg_installed diff --git a/openvino_bindings/third_party/vcpkg.json b/openvino_bindings/third_party/vcpkg.json new file mode 100644 index 0000000..2497f47 --- /dev/null +++ b/openvino_bindings/third_party/vcpkg.json @@ -0,0 +1,10 @@ +{ + 
"name": "openvinotestdrivebindings", + "builtin-baseline": "c8582b4d83dbd36e1bebc08bf166b5eb807996b0", + "dependencies": [ + "ffmpeg" + ], + "overrides": [ + { "name": "ffmpeg", "version": "6.1.1" } + ] +} From bc223d4afb59a6e6b5fbd23241b2a2cfc997eb3c Mon Sep 17 00:00:00 2001 From: Ronald Hecker Date: Thu, 21 Nov 2024 15:00:27 +0100 Subject: [PATCH 13/17] fix windows build flutter error --- .github/workflows/windows-build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows-build.yaml b/.github/workflows/windows-build.yaml index 8b47c6d..70d7c3e 100644 --- a/.github/workflows/windows-build.yaml +++ b/.github/workflows/windows-build.yaml @@ -127,7 +127,7 @@ jobs: - uses: subosito/flutter-action@v2 with: channel: 'stable' - flutter-version: '3.24.0' + flutter-version: '3.24.5' - name: Install project dependencies run: flutter pub get - name: Generate intermediates From 3491c2a56dad2dba83ddd1985386dc786a2880b3 Mon Sep 17 00:00:00 2001 From: Ronald Hecker Date: Thu, 21 Nov 2024 15:17:05 +0100 Subject: [PATCH 14/17] fix ffmpeg vcpkg hopefully --- .github/workflows/windows-build.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/windows-build.yaml b/.github/workflows/windows-build.yaml index 70d7c3e..440b880 100644 --- a/.github/workflows/windows-build.yaml +++ b/.github/workflows/windows-build.yaml @@ -73,12 +73,12 @@ jobs: # Step 9: Install vcpkg and ffmpeg - name: Install vcpkg and ffmpeg + shell: powershell run: | - git clone https://github.com/microsoft/vcpkg.git C:\vcpkg + if (!(Test-Path "C:\vcpkg")) { git clone https://github.com/microsoft/vcpkg.git C:\vcpkg } C:\vcpkg\bootstrap-vcpkg.bat cd openvino_bindings/third_party C:\vcpkg\vcpkg install - shell: cmd # Step 10: Download and Install OpenVINO Runtime - name: Download and Install OpenVINO Runtime 24.5.0 From f58ea0cd71e66a93464493b7aff86fe9d0fa43ac Mon Sep 17 00:00:00 2001 From: Ronald Hecker Date: Fri, 22 Nov 2024 13:52:22 
+0100 Subject: [PATCH 15/17] Implement performance metrics page Added metrics for transcription page Fixes subtitles for light colorscheme --- .../transcription/performance_metrics.dart | 87 +++++++++++++++++++ lib/pages/transcription/playground.dart | 9 ++ .../providers/speech_inference_provider.dart | 16 +++- lib/pages/transcription/transcription.dart | 11 +-- lib/pages/transcription/utils/metrics.dart | 54 ++++++++++++ .../transcription/widgets/subtitles.dart | 3 +- lib/widgets/performance_tile.dart | 65 ++++++++++++++ 7 files changed, 238 insertions(+), 7 deletions(-) create mode 100644 lib/pages/transcription/performance_metrics.dart create mode 100644 lib/pages/transcription/utils/metrics.dart create mode 100644 lib/widgets/performance_tile.dart diff --git a/lib/pages/transcription/performance_metrics.dart b/lib/pages/transcription/performance_metrics.dart new file mode 100644 index 0000000..a12b554 --- /dev/null +++ b/lib/pages/transcription/performance_metrics.dart @@ -0,0 +1,87 @@ +import 'package:fluent_ui/fluent_ui.dart'; +import 'package:inference/pages/computer_vision/widgets/horizontal_rule.dart'; +import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; +import 'package:inference/widgets/performance_tile.dart'; +import 'package:intl/intl.dart'; +import 'package:provider/provider.dart'; + +class PerformanceMetrics extends StatelessWidget { + const PerformanceMetrics({super.key}); + + @override + Widget build(BuildContext context) { + return Consumer( + builder: (context, inference, child) { + final metrics = inference.metrics; + if (metrics == null) { + return Container(); + } + + Locale locale = Localizations.localeOf(context); + final nf = NumberFormat.decimalPatternDigits( + locale: locale.languageCode, decimalDigits: 0); + + return Padding( + padding: const EdgeInsets.symmetric(vertical: 80), + child: Center( + child: SizedBox( + width: 887, + child: Column( + children: [ + Row( + mainAxisAlignment: 
MainAxisAlignment.spaceEvenly, + children: [ + PerformanceTile( + title: "Time to first token (TTFT)", + value: nf.format(metrics.ttft), + unit: "ms", + tall: true, + ), + PerformanceTile( + title: "Time per output token (TPOT)", + value: nf.format(metrics.tpot), + unit: "ms", + tall: true, + ), + PerformanceTile( + title: "Generate total duration", + value: nf.format(metrics.generateTime), + unit: "ms", + tall: true, + ), + ], + ), + const Padding( + padding: EdgeInsets.symmetric(horizontal: 16.0, vertical: 16), + child: HorizontalRule(), + ), + Row( + mainAxisAlignment: MainAxisAlignment.spaceEvenly, + children: [ + PerformanceTile( + title: "Load time", + value: nf.format(metrics.loadTime), + unit: "ms", + ), + PerformanceTile( + title: "Detokenization duration", + value: nf.format(metrics.detokenizationTime), + unit: "ms", + ), + PerformanceTile( + title: "Throughput", + value: nf.format(metrics.throughput), + unit: "tokens/sec", + ), + ], + ), + ], + ), + ), + ), + ); + } + ); + } +} + diff --git a/lib/pages/transcription/playground.dart b/lib/pages/transcription/playground.dart index 344d00f..5f3e03b 100644 --- a/lib/pages/transcription/playground.dart +++ b/lib/pages/transcription/playground.dart @@ -65,6 +65,15 @@ class _PlaygroundState extends State with TickerProviderStateMixin{ initializeVideoAndListeners(file); } + @override + void initState() { + super.initState(); + final inference = Provider.of(context, listen: false); + if (inference.videoPath != null) { + initializeVideoAndListeners(inference.videoPath!); + } + } + @override void dispose() { player.dispose(); diff --git a/lib/pages/transcription/providers/speech_inference_provider.dart b/lib/pages/transcription/providers/speech_inference_provider.dart index 8e574f2..fedbf72 100644 --- a/lib/pages/transcription/providers/speech_inference_provider.dart +++ b/lib/pages/transcription/providers/speech_inference_provider.dart @@ -3,9 +3,11 @@ import 'dart:async'; import 'package:flutter/material.dart'; 
import 'package:inference/interop/openvino_bindings.dart'; import 'package:inference/interop/speech_to_text.dart'; +import 'package:inference/pages/transcription/utils/metrics.dart'; import 'package:inference/pages/transcription/utils/section.dart'; import 'package:inference/project.dart'; + const transcriptionPeriod = 10; class SpeechInferenceProvider extends ChangeNotifier { @@ -21,6 +23,7 @@ class SpeechInferenceProvider extends ChangeNotifier { bool get videoLoaded => _videoPath != null; DynamicRangeLoading>? transcription; + DMetrics? metrics; bool get transcriptionComplete { return transcription?.complete ?? false; @@ -62,6 +65,15 @@ class SpeechInferenceProvider extends ChangeNotifier { notifyListeners(); } + void addMetrics(TranscriptionModelResponse response) { + if (metrics == null) { + metrics = DMetrics.fromCMetrics(response.metrics); + } else { + metrics!.addCMetrics(response.metrics); + } + notifyListeners(); + } + Future startTranscribing() async { if (transcription == null) { throw Exception("Can't transcribe before loading video"); @@ -72,7 +84,9 @@ class SpeechInferenceProvider extends ChangeNotifier { return; } await transcription!.process((int i) { - return transcribe(i * transcriptionPeriod, transcriptionPeriod); + final request = transcribe(i * transcriptionPeriod, transcriptionPeriod); + request.then(addMetrics); + return request; }); if (hasListeners) { notifyListeners(); diff --git a/lib/pages/transcription/transcription.dart b/lib/pages/transcription/transcription.dart index 0ea6f14..1d9fd00 100644 --- a/lib/pages/transcription/transcription.dart +++ b/lib/pages/transcription/transcription.dart @@ -3,6 +3,7 @@ import 'package:go_router/go_router.dart'; import 'package:inference/project.dart'; import 'package:inference/providers/preference_provider.dart'; import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; +import 'package:inference/pages/transcription/performance_metrics.dart'; import 
'package:inference/pages/transcription/playground.dart'; import 'package:provider/provider.dart'; @@ -80,11 +81,11 @@ class _TranscriptionPageState extends State { title: const Text("Playground"), body: Playground(project: widget.project), ), - //PaneItem( - // icon: const Icon(FluentIcons.project_collection), - // title: const Text("Performance metrics"), - // body: Container(), - //), + PaneItem( + icon: const Icon(FluentIcons.line_chart), + title: const Text("Performance metrics"), + body: const PerformanceMetrics(), + ), ], ) ), diff --git a/lib/pages/transcription/utils/metrics.dart b/lib/pages/transcription/utils/metrics.dart new file mode 100644 index 0000000..481c9f3 --- /dev/null +++ b/lib/pages/transcription/utils/metrics.dart @@ -0,0 +1,54 @@ +import 'package:inference/interop/generated_bindings.dart'; + +class DMetrics { + double loadTime; + double generateTime; + double tokenizationTime; + double detokenizationTime; + double ttft; + double tpot; + double throughput; + int numberOfGeneratedTokens; + int numberOfInputTokens; + + int n = 1; // number of added metrics + + DMetrics({ + required this.loadTime, + required this.generateTime, + required this.tokenizationTime, + required this.detokenizationTime, + required this.ttft, + required this.tpot, + required this.throughput, + required this.numberOfGeneratedTokens, + required this.numberOfInputTokens, + }); + + void addCMetrics(Metrics metrics) { + //loadTime = metrics.load_time; + generateTime += metrics.generate_time; + tokenizationTime += metrics.tokenization_time; + detokenizationTime += metrics.detokenization_time; + ttft = (ttft * (n / (n + 1))) + metrics.ttft / n; + tpot = (tpot * (n / (n + 1))) + metrics.tpot / n; + throughput = (throughput * (n / (n + 1))) + metrics.throughput / n; + numberOfGeneratedTokens += metrics.number_of_generated_tokens; + numberOfInputTokens += metrics.number_of_input_tokens; + n += 1; + } + + factory DMetrics.fromCMetrics(Metrics metrics) { + return DMetrics( + 
loadTime: metrics.load_time, + generateTime: metrics.generate_time, + tokenizationTime: metrics.tokenization_time, + detokenizationTime: metrics.detokenization_time, + ttft: metrics.ttft, + tpot: metrics.tpot, + throughput: metrics.throughput, + numberOfGeneratedTokens: metrics.number_of_generated_tokens, + numberOfInputTokens: metrics.number_of_input_tokens, + ); + } +} diff --git a/lib/pages/transcription/widgets/subtitles.dart b/lib/pages/transcription/widgets/subtitles.dart index 9971c9a..da17b0c 100644 --- a/lib/pages/transcription/widgets/subtitles.dart +++ b/lib/pages/transcription/widgets/subtitles.dart @@ -44,7 +44,8 @@ class Subtitles extends StatelessWidget { Text(text, textAlign: TextAlign.center, style: const TextStyle( - fontSize: fontSize + fontSize: fontSize, + color: Colors.white, ) ) ], diff --git a/lib/widgets/performance_tile.dart b/lib/widgets/performance_tile.dart new file mode 100644 index 0000000..6ab5dd0 --- /dev/null +++ b/lib/widgets/performance_tile.dart @@ -0,0 +1,65 @@ +import 'package:fluent_ui/fluent_ui.dart'; + +class PerformanceTile extends StatelessWidget { + final String title; + final String value; + final String unit; + final bool tall; + + const PerformanceTile({ + super.key, + required this.title, + required this.value, + required this.unit, + this.tall = false, + }); + + @override + Widget build(BuildContext context) { + final theme = FluentTheme.of(context); + return Padding( + padding: const EdgeInsets.all(8.0), + child: Acrylic( + elevation: 5, + shadowColor: Colors.black, + shape: RoundedRectangleBorder ( + borderRadius: BorderRadius.circular(4), + ), + child: SizedBox( + width: 268, + height: tall ? 
200 : 124, + child: Center( + child: Column( + mainAxisAlignment: MainAxisAlignment.center, + crossAxisAlignment: CrossAxisAlignment.center, + children: [ + Text( + title, + style: const TextStyle( + fontSize: 14, + ), + ), + RichText( + text: TextSpan( + style: TextStyle( + color: theme.inactiveColor, + ), + children: [ + TextSpan(text: value, + style: const TextStyle( + fontSize: 30, + ) + ), + TextSpan(text: " $unit"), + ] + ) + ), + ], + ) + ) + ), + ), + ); + } + +} From 6bddb727891a8b9fa28f217bbefc2d2564020b6c Mon Sep 17 00:00:00 2001 From: Ronald Hecker Date: Fri, 22 Nov 2024 13:58:09 +0100 Subject: [PATCH 16/17] Add model properties to performance metrics --- .../transcription/performance_metrics.dart | 147 ++++++++++-------- lib/pages/transcription/transcription.dart | 2 +- 2 files changed, 79 insertions(+), 70 deletions(-) diff --git a/lib/pages/transcription/performance_metrics.dart b/lib/pages/transcription/performance_metrics.dart index a12b554..e1795a1 100644 --- a/lib/pages/transcription/performance_metrics.dart +++ b/lib/pages/transcription/performance_metrics.dart @@ -1,87 +1,96 @@ import 'package:fluent_ui/fluent_ui.dart'; import 'package:inference/pages/computer_vision/widgets/horizontal_rule.dart'; +import 'package:inference/pages/computer_vision/widgets/model_properties.dart'; import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; +import 'package:inference/project.dart'; import 'package:inference/widgets/performance_tile.dart'; import 'package:intl/intl.dart'; import 'package:provider/provider.dart'; class PerformanceMetrics extends StatelessWidget { - const PerformanceMetrics({super.key}); + final Project project; + const PerformanceMetrics({super.key, required this.project}); @override Widget build(BuildContext context) { - return Consumer( - builder: (context, inference, child) { - final metrics = inference.metrics; - if (metrics == null) { - return Container(); - } + return Row( + children: [ + Expanded( + 
child: Consumer( + builder: (context, inference, child) { + final metrics = inference.metrics; + if (metrics == null) { + return Container(); + } - Locale locale = Localizations.localeOf(context); - final nf = NumberFormat.decimalPatternDigits( - locale: locale.languageCode, decimalDigits: 0); + Locale locale = Localizations.localeOf(context); + final nf = NumberFormat.decimalPatternDigits( + locale: locale.languageCode, decimalDigits: 0); - return Padding( - padding: const EdgeInsets.symmetric(vertical: 80), - child: Center( - child: SizedBox( - width: 887, - child: Column( - children: [ - Row( - mainAxisAlignment: MainAxisAlignment.spaceEvenly, - children: [ - PerformanceTile( - title: "Time to first token (TTFT)", - value: nf.format(metrics.ttft), - unit: "ms", - tall: true, - ), - PerformanceTile( - title: "Time per output token (TPOT)", - value: nf.format(metrics.tpot), - unit: "ms", - tall: true, - ), - PerformanceTile( - title: "Generate total duration", - value: nf.format(metrics.generateTime), - unit: "ms", - tall: true, - ), - ], + return Padding( + padding: const EdgeInsets.symmetric(vertical: 80), + child: Center( + child: SizedBox( + width: 887, + child: Column( + children: [ + Row( + mainAxisAlignment: MainAxisAlignment.spaceEvenly, + children: [ + PerformanceTile( + title: "Time to first token (TTFT)", + value: nf.format(metrics.ttft), + unit: "ms", + tall: true, + ), + PerformanceTile( + title: "Time per output token (TPOT)", + value: nf.format(metrics.tpot), + unit: "ms", + tall: true, + ), + PerformanceTile( + title: "Generate total duration", + value: nf.format(metrics.generateTime), + unit: "ms", + tall: true, + ), + ], + ), + const Padding( + padding: EdgeInsets.symmetric(horizontal: 16.0, vertical: 16), + child: HorizontalRule(), + ), + Row( + mainAxisAlignment: MainAxisAlignment.spaceEvenly, + children: [ + PerformanceTile( + title: "Load time", + value: nf.format(metrics.loadTime), + unit: "ms", + ), + PerformanceTile( + title: 
"Detokenization duration", + value: nf.format(metrics.detokenizationTime), + unit: "ms", + ), + PerformanceTile( + title: "Throughput", + value: nf.format(metrics.throughput), + unit: "tokens/sec", + ), + ], + ), + ], + ), ), - const Padding( - padding: EdgeInsets.symmetric(horizontal: 16.0, vertical: 16), - child: HorizontalRule(), - ), - Row( - mainAxisAlignment: MainAxisAlignment.spaceEvenly, - children: [ - PerformanceTile( - title: "Load time", - value: nf.format(metrics.loadTime), - unit: "ms", - ), - PerformanceTile( - title: "Detokenization duration", - value: nf.format(metrics.detokenizationTime), - unit: "ms", - ), - PerformanceTile( - title: "Throughput", - value: nf.format(metrics.throughput), - unit: "tokens/sec", - ), - ], - ), - ], - ), - ), + ), + ); + } ), - ); - } + ), + ModelProperties(project: project), + ], ); } } - diff --git a/lib/pages/transcription/transcription.dart b/lib/pages/transcription/transcription.dart index 1d9fd00..46f54af 100644 --- a/lib/pages/transcription/transcription.dart +++ b/lib/pages/transcription/transcription.dart @@ -84,7 +84,7 @@ class _TranscriptionPageState extends State { PaneItem( icon: const Icon(FluentIcons.line_chart), title: const Text("Performance metrics"), - body: const PerformanceMetrics(), + body: PerformanceMetrics(project: widget.project), ), ], ) From c9788eaceac6ba4dcf1ba01649e1b5f4c3f894ca Mon Sep 17 00:00:00 2001 From: Ronald Hecker Date: Fri, 22 Nov 2024 17:09:54 +0100 Subject: [PATCH 17/17] Fix transcription from continuening after dispose --- .../transcription/performance_metrics.dart | 137 +++++++++--------- .../providers/speech_inference_provider.dart | 16 +- 2 files changed, 84 insertions(+), 69 deletions(-) diff --git a/lib/pages/transcription/performance_metrics.dart b/lib/pages/transcription/performance_metrics.dart index e1795a1..0abf3e1 100644 --- a/lib/pages/transcription/performance_metrics.dart +++ b/lib/pages/transcription/performance_metrics.dart @@ -1,6 +1,7 @@ import 
'package:fluent_ui/fluent_ui.dart'; import 'package:inference/pages/computer_vision/widgets/horizontal_rule.dart'; import 'package:inference/pages/computer_vision/widgets/model_properties.dart'; +import 'package:inference/pages/models/widgets/grid_container.dart'; import 'package:inference/pages/transcription/providers/speech_inference_provider.dart'; import 'package:inference/project.dart'; import 'package:inference/widgets/performance_tile.dart'; @@ -16,77 +17,79 @@ class PerformanceMetrics extends StatelessWidget { return Row( children: [ Expanded( - child: Consumer( - builder: (context, inference, child) { - final metrics = inference.metrics; - if (metrics == null) { - return Container(); - } + child: GridContainer( + child: Consumer( + builder: (context, inference, child) { + final metrics = inference.metrics; + if (metrics == null) { + return Container(); + } - Locale locale = Localizations.localeOf(context); - final nf = NumberFormat.decimalPatternDigits( - locale: locale.languageCode, decimalDigits: 0); + Locale locale = Localizations.localeOf(context); + final nf = NumberFormat.decimalPatternDigits( + locale: locale.languageCode, decimalDigits: 0); - return Padding( - padding: const EdgeInsets.symmetric(vertical: 80), - child: Center( - child: SizedBox( - width: 887, - child: Column( - children: [ - Row( - mainAxisAlignment: MainAxisAlignment.spaceEvenly, - children: [ - PerformanceTile( - title: "Time to first token (TTFT)", - value: nf.format(metrics.ttft), - unit: "ms", - tall: true, - ), - PerformanceTile( - title: "Time per output token (TPOT)", - value: nf.format(metrics.tpot), - unit: "ms", - tall: true, - ), - PerformanceTile( - title: "Generate total duration", - value: nf.format(metrics.generateTime), - unit: "ms", - tall: true, - ), - ], - ), - const Padding( - padding: EdgeInsets.symmetric(horizontal: 16.0, vertical: 16), - child: HorizontalRule(), - ), - Row( - mainAxisAlignment: MainAxisAlignment.spaceEvenly, - children: [ - PerformanceTile( 
- title: "Load time", - value: nf.format(metrics.loadTime), - unit: "ms", - ), - PerformanceTile( - title: "Detokenization duration", - value: nf.format(metrics.detokenizationTime), - unit: "ms", - ), - PerformanceTile( - title: "Throughput", - value: nf.format(metrics.throughput), - unit: "tokens/sec", - ), - ], - ), - ], + return Padding( + padding: const EdgeInsets.symmetric(vertical: 80), + child: Center( + child: SizedBox( + width: 887, + child: Column( + children: [ + Row( + mainAxisAlignment: MainAxisAlignment.spaceEvenly, + children: [ + PerformanceTile( + title: "Time to first token (TTFT)", + value: nf.format(metrics.ttft), + unit: "ms", + tall: true, + ), + PerformanceTile( + title: "Time per output token (TPOT)", + value: nf.format(metrics.tpot), + unit: "ms", + tall: true, + ), + PerformanceTile( + title: "Generate total duration", + value: nf.format(metrics.generateTime), + unit: "ms", + tall: true, + ), + ], + ), + const Padding( + padding: EdgeInsets.symmetric(horizontal: 16.0, vertical: 16), + child: HorizontalRule(), + ), + Row( + mainAxisAlignment: MainAxisAlignment.spaceEvenly, + children: [ + PerformanceTile( + title: "Load time", + value: nf.format(metrics.loadTime), + unit: "ms", + ), + PerformanceTile( + title: "Detokenization duration", + value: nf.format(metrics.detokenizationTime), + unit: "ms", + ), + PerformanceTile( + title: "Throughput", + value: nf.format(metrics.throughput), + unit: "tokens/sec", + ), + ], + ), + ], + ), ), ), - ), - ); - } + ); + } + ), ), ), ModelProperties(project: project), diff --git a/lib/pages/transcription/providers/speech_inference_provider.dart b/lib/pages/transcription/providers/speech_inference_provider.dart index fedbf72..606b9e7 100644 --- a/lib/pages/transcription/providers/speech_inference_provider.dart +++ b/lib/pages/transcription/providers/speech_inference_provider.dart @@ -20,6 +20,8 @@ class SpeechInferenceProvider extends ChangeNotifier { String? _videoPath; String? 
get videoPath => _videoPath;
 
+  bool forceStop = false;
+
   bool get videoLoaded => _videoPath != null;
 
   DynamicRangeLoading>? transcription;
@@ -79,13 +81,17 @@
       throw Exception("Can't transcribe before loading video");
     }
 
-    while (!transcription!.complete) {
+    forceStop = false;
+
+    while ((!transcription!.complete) && !forceStop) {
       if (transcription == null) {
         return;
       }
       await transcription!.process((int i) {
         final request = transcribe(i * transcriptionPeriod, transcriptionPeriod);
-        request.then(addMetrics);
+        if (!forceStop) {
+          request.then(addMetrics);
+        }
         return request;
       });
       if (hasListeners) {
@@ -103,4 +109,10 @@
     return _project == project && _device == device;
   }
 
+  @override
+  void dispose() {
+    forceStop = true;
+    super.dispose();
+  }
+
 }