Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Whisper pipeline #39

Open
wants to merge 19 commits into
base: fluent-ui-migration
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions .github/workflows/windows-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,12 @@ jobs:

# Step 9: Install vcpkg and ffmpeg
- name: Install vcpkg and ffmpeg
shell: powershell
run: |
git clone https://github.com/microsoft/vcpkg.git C:\vcpkg
if (!(Test-Path "C:\vcpkg")) { git clone https://github.com/microsoft/vcpkg.git C:\vcpkg }
C:\vcpkg\bootstrap-vcpkg.bat
C:\vcpkg\vcpkg install ffmpeg
shell: cmd
cd openvino_bindings/third_party
C:\vcpkg\vcpkg install

# Step 10: Download and Install OpenVINO Runtime
- name: Download and Install OpenVINO Runtime 24.5.0
Expand Down
131 changes: 131 additions & 0 deletions lib/interop/generated_bindings.dart
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,37 @@ class OpenVINO {
late final _freeStatusOrSpeechToText = _freeStatusOrSpeechToTextPtr
.asFunction<void Function(ffi.Pointer<StatusOrSpeechToText>)>();

/// Releases the native memory behind a `StatusOrModelResponse`.
///
/// Call exactly once per pointer handed out by the native library; the
/// pointer must not be dereferenced afterwards.
void freeStatusOrModelResponse(
ffi.Pointer<StatusOrModelResponse> status,
) {
return _freeStatusOrModelResponse(
status,
);
}

// Lazily-resolved native symbol `freeStatusOrModelResponse` (ffigen style).
late final _freeStatusOrModelResponsePtr = _lookup<
ffi.NativeFunction<
ffi.Void Function(ffi.Pointer<StatusOrModelResponse>)>>(
'freeStatusOrModelResponse');
late final _freeStatusOrModelResponse = _freeStatusOrModelResponsePtr
.asFunction<void Function(ffi.Pointer<StatusOrModelResponse>)>();

/// Releases the native memory behind a `StatusOrWhisperModelResponse`
/// (including its chunk array and strings, owned by the native side).
///
/// Call exactly once per pointer returned by `speechToTextTranscribe`;
/// the pointer must not be dereferenced afterwards.
void freeStatusOrWhisperModelResponse(
ffi.Pointer<StatusOrWhisperModelResponse> status,
) {
return _freeStatusOrWhisperModelResponse(
status,
);
}

// Lazily-resolved native symbol `freeStatusOrWhisperModelResponse`.
late final _freeStatusOrWhisperModelResponsePtr = _lookup<
ffi.NativeFunction<
ffi.Void Function(ffi.Pointer<StatusOrWhisperModelResponse>)>>(
'freeStatusOrWhisperModelResponse');
late final _freeStatusOrWhisperModelResponse =
_freeStatusOrWhisperModelResponsePtr.asFunction<
void Function(ffi.Pointer<StatusOrWhisperModelResponse>)>();

void freeStatusOrDevices(
ffi.Pointer<StatusOrDevices> status,
) {
Expand Down Expand Up @@ -644,6 +675,80 @@ class OpenVINO {
late final _graphRunnerStop = _graphRunnerStopPtr
.asFunction<ffi.Pointer<Status> Function(CGraphRunner)>();

/// Opens a native speech-to-text (Whisper) instance for the model at
/// [model_path] on the given [device] (native UTF-8 strings).
///
/// The returned status struct carries either the instance handle or an
/// error; ownership follows the native API's conventions.
ffi.Pointer<StatusOrSpeechToText> speechToTextOpen(
ffi.Pointer<pkg_ffi.Utf8> model_path,
ffi.Pointer<pkg_ffi.Utf8> device,
) {
return _speechToTextOpen(
model_path,
device,
);
}

// Lazily-resolved native symbol `speechToTextOpen`.
late final _speechToTextOpenPtr = _lookup<
ffi.NativeFunction<
ffi.Pointer<StatusOrSpeechToText> Function(ffi.Pointer<pkg_ffi.Utf8>,
ffi.Pointer<pkg_ffi.Utf8>)>>('speechToTextOpen');
late final _speechToTextOpen = _speechToTextOpenPtr.asFunction<
ffi.Pointer<StatusOrSpeechToText> Function(
ffi.Pointer<pkg_ffi.Utf8>, ffi.Pointer<pkg_ffi.Utf8>)>();

/// Loads the media file at [video_path] into the speech-to-text
/// [instance] so it can subsequently be transcribed.
///
/// Returns a status struct describing success or failure.
ffi.Pointer<Status> speechToTextLoadVideo(
CSpeechToText instance,
ffi.Pointer<pkg_ffi.Utf8> video_path,
) {
return _speechToTextLoadVideo(
instance,
video_path,
);
}

// Lazily-resolved native symbol `speechToTextLoadVideo`.
late final _speechToTextLoadVideoPtr = _lookup<
ffi.NativeFunction<
ffi.Pointer<Status> Function(CSpeechToText,
ffi.Pointer<pkg_ffi.Utf8>)>>('speechToTextLoadVideo');
late final _speechToTextLoadVideo = _speechToTextLoadVideoPtr.asFunction<
ffi.Pointer<Status> Function(CSpeechToText, ffi.Pointer<pkg_ffi.Utf8>)>();

/// Returns the duration of the video currently loaded into [instance].
///
/// The unit of the returned integer is defined by the native API
/// (presumably seconds — confirm against the C++ side).
ffi.Pointer<StatusOrInt> speechToTextVideoDuration(
CSpeechToText instance,
) {
return _speechToTextVideoDuration(
instance,
);
}

// Lazily-resolved native symbol `speechToTextVideoDuration`.
late final _speechToTextVideoDurationPtr = _lookup<
ffi.NativeFunction<ffi.Pointer<StatusOrInt> Function(CSpeechToText)>>(
'speechToTextVideoDuration');
late final _speechToTextVideoDuration = _speechToTextVideoDurationPtr
.asFunction<ffi.Pointer<StatusOrInt> Function(CSpeechToText)>();

/// Transcribes a window of the loaded media on [instance]: [duration]
/// worth of audio beginning at [start] (units defined by the native API —
/// presumably seconds; confirm), hinted with [language].
///
/// The returned struct must eventually be released with
/// [freeStatusOrWhisperModelResponse].
ffi.Pointer<StatusOrWhisperModelResponse> speechToTextTranscribe(
CSpeechToText instance,
int start,
int duration,
ffi.Pointer<pkg_ffi.Utf8> language,
) {
return _speechToTextTranscribe(
instance,
start,
duration,
language,
);
}

// Lazily-resolved native symbol `speechToTextTranscribe`.
late final _speechToTextTranscribePtr = _lookup<
ffi.NativeFunction<
ffi.Pointer<StatusOrWhisperModelResponse> Function(
CSpeechToText,
ffi.Int,
ffi.Int,
ffi.Pointer<pkg_ffi.Utf8>)>>('speechToTextTranscribe');
late final _speechToTextTranscribe = _speechToTextTranscribePtr.asFunction<
ffi.Pointer<StatusOrWhisperModelResponse> Function(
CSpeechToText, int, int, ffi.Pointer<pkg_ffi.Utf8>)>();

/// Queries the native side for the devices available for inference.
///
/// The returned status struct should be released with
/// [freeStatusOrDevices] once consumed (confirm ownership with the
/// native API).
ffi.Pointer<StatusOrDevices> getAvailableDevices() {
return _getAvailableDevices();
}
Expand Down Expand Up @@ -762,6 +867,16 @@ final class Device extends ffi.Struct {
external ffi.Pointer<pkg_ffi.Utf8> name;
}

/// Native (FFI) layout of one transcription segment returned by the
/// Whisper pipeline; mirrors the C struct of the same name.
final class TranscriptionChunk extends ffi.Struct {
// Segment start timestamp (float; presumably seconds — confirm native side).
@ffi.Float()
external double start_ts;

// Segment end timestamp, same unit as start_ts.
@ffi.Float()
external double end_ts;

// Native UTF-8 string holding the transcribed text for this segment.
external ffi.Pointer<pkg_ffi.Utf8> text;
}

final class Status extends ffi.Struct {
@ffi.Int()
external int status;
Expand Down Expand Up @@ -862,6 +977,22 @@ final class StatusOrModelResponse extends ffi.Struct {
external ffi.Pointer<pkg_ffi.Utf8> value;
}

/// Status-or-value result of a Whisper transcription call.
///
/// When `status` is OK, `value` points to an array of `size`
/// [TranscriptionChunk]s and `text` holds the full transcript; otherwise
/// `message` describes the error. Release with
/// `freeStatusOrWhisperModelResponse`.
final class StatusOrWhisperModelResponse extends ffi.Struct {
@ffi.Int()
external int status;

// Error description when status is not OK.
external ffi.Pointer<pkg_ffi.Utf8> message;

// Inline struct (not a pointer): its storage lives inside this allocation,
// so it becomes invalid once this response is freed.
external Metrics metrics;

// Pointer to the first of `size` transcription chunks.
external ffi.Pointer<TranscriptionChunk> value;

@ffi.Int()
external int size;

// Full transcript as a single native UTF-8 string.
external ffi.Pointer<pkg_ffi.Utf8> text;
}

final class StatusOrTTIModelResponse extends ffi.Struct {
@ffi.Int()
external int status;
Expand Down
14 changes: 14 additions & 0 deletions lib/interop/openvino_bindings.dart
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,20 @@ class SerializationOutput {

}

/// One transcribed segment: the interval [start]..[end] (timestamps as
/// reported by the native side, presumably seconds — confirm) and the
/// recognized [text] for that interval.
///
/// Value-like and immutable, so it implements value equality.
class Chunk {
  final double start;
  final double end;
  final String text;

  const Chunk(this.start, this.end, this.text);

  /// Value equality so chunks can be compared and de-duplicated directly.
  @override
  bool operator ==(Object other) =>
      identical(this, other) ||
      other is Chunk &&
          other.start == start &&
          other.end == end &&
          other.text == text;

  @override
  int get hashCode => Object.hash(start, end, text);

  @override
  String toString() => 'Chunk($start, $end, $text)';
}

/// Dart-side result of a transcription call: the native
/// `StatusOrWhisperModelResponse` copied into garbage-collected objects.
class TranscriptionModelResponse {
// Per-segment transcription results.
final List<Chunk> chunks;
// Performance metrics reported by the native pipeline.
final Metrics metrics;
// Full transcript as a single string.
final String text;
const TranscriptionModelResponse(this.chunks, this.metrics, this.text);
}

class ModelResponse {
final String content;
final Metrics metrics;
Expand Down
125 changes: 66 additions & 59 deletions lib/interop/speech_to_text.dart
Original file line number Diff line number Diff line change
Expand Up @@ -9,72 +9,79 @@ final ov = getBindings();
/// Dart wrapper around the native Whisper speech-to-text pipeline.
///
/// Every native call is dispatched through [Isolate.run] so FFI work does
/// not block the calling (UI) isolate; only the raw instance address is
/// captured by the closures, since FFI pointers cannot be sent between
/// isolates directly.
class SpeechToText {
  /// Status-wrapped handle to the native speech-to-text instance.
  final Pointer<StatusOrSpeechToText> instance;

  SpeechToText(this.instance);

  /// Opens the model at [modelPath] on [device] and returns a ready
  /// wrapper.
  ///
  /// Throws a [String] describing the native error when opening fails.
  static Future<SpeechToText> init(String modelPath, String device) async {
    final result = await Isolate.run(() {
      final modelPathPtr = modelPath.toNativeUtf8();
      final devicePtr = device.toNativeUtf8();
      final status = ov.speechToTextOpen(modelPathPtr, devicePtr);
      calloc.free(modelPathPtr);
      calloc.free(devicePtr);
      return status;
    });

    if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) {
      // NOTE(review): `result` is leaked on this error path; consider
      // freeing it via the matching native free function before throwing.
      throw "SpeechToText open error: ${result.ref.status} ${result.ref.message.toDartString()}";
    }

    return SpeechToText(result);
  }

  /// Loads the media file at [videoPath] into the native pipeline and
  /// returns its duration as reported by the native side (units defined by
  /// the native API — presumably seconds; confirm).
  ///
  /// Throws a [String] when either native call reports a non-OK status.
  Future<int> loadVideo(String videoPath) async {
    // Capture only the address so the worker-isolate closure need not
    // serialize an FFI pointer.
    final instanceAddress = instance.ref.value.address;

    final loadResult = await Isolate.run(() {
      final videoPathPtr = videoPath.toNativeUtf8();
      final status = ov.speechToTextLoadVideo(
          Pointer<Void>.fromAddress(instanceAddress), videoPathPtr);
      calloc.free(videoPathPtr);
      return status;
    });
    if (StatusEnum.fromValue(loadResult.ref.status) != StatusEnum.OkStatus) {
      throw "SpeechToText LoadVideo error: ${loadResult.ref.status} ${loadResult.ref.message.toDartString()}";
    }

    final durationResult = await Isolate.run(() {
      return ov.speechToTextVideoDuration(
          Pointer<Void>.fromAddress(instanceAddress));
    });
    if (StatusEnum.fromValue(durationResult.ref.status) !=
        StatusEnum.OkStatus) {
      throw "SpeechToText VideoDuration error: ${durationResult.ref.status} ${durationResult.ref.message.toDartString()}";
    }
    // NOTE(review): the two Status pointers above are never freed; confirm
    // whether the native API expects the caller to release them.
    return durationResult.ref.value;
  }

  /// Transcribes [duration] worth of audio starting at [start] (units per
  /// the native API — presumably seconds; confirm), hinted with
  /// [language].
  ///
  /// Throws a [String] when the native call reports a non-OK status.
  Future<TranscriptionModelResponse> transcribe(
      int start, int duration, String language) async {
    final instanceAddress = instance.ref.value.address;
    final result = await Isolate.run(() {
      final languagePtr = language.toNativeUtf8();
      final status = ov.speechToTextTranscribe(
          Pointer<Void>.fromAddress(instanceAddress),
          start,
          duration,
          languagePtr);
      calloc.free(languagePtr);
      return status;
    });

    if (StatusEnum.fromValue(result.ref.status) != StatusEnum.OkStatus) {
      // Fixed copy-paste: this error previously reported "LoadVideo".
      throw "SpeechToText Transcribe error: ${result.ref.status} ${result.ref.message.toDartString()}";
    }

    // Copy every native chunk into GC-managed Dart objects before freeing.
    final chunks = <Chunk>[
      for (var i = 0; i < result.ref.size; i++)
        Chunk(
          result.ref.value[i].start_ts,
          result.ref.value[i].end_ts,
          result.ref.value[i].text.toDartString(),
        ),
    ];
    final metrics = result.ref.metrics;
    final text = result.ref.text.toDartString();
    // NOTE(review): `metrics` is an inline FFI struct view backed by
    // `result`'s native memory; using it after the free below may be a
    // use-after-free. Confirm whether Metrics is copied by value here, or
    // copy its fields out before freeing.
    ov.freeStatusOrWhisperModelResponse(result);

    return TranscriptionModelResponse(chunks, metrics, text);
  }
}
2 changes: 2 additions & 0 deletions lib/main.dart
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import 'package:inference/theme_fluent.dart';
import 'package:inference/providers/preference_provider.dart';
import 'package:inference/providers/project_provider.dart';
import 'package:inference/public_models.dart';
import 'package:media_kit/media_kit.dart';
import 'package:provider/provider.dart';


Expand All @@ -25,6 +26,7 @@ void testConnection() async {
}

void main() {
// Initialize media_kit before any playback is attempted (presumably
// required before building video widgets — see media_kit docs); keep it
// first in main.
MediaKit.ensureInitialized();
// Fire-and-forget async connectivity check; intentionally not awaited.
testConnection();
runApp(const App());
}
Expand Down
2 changes: 1 addition & 1 deletion lib/pages/computer_vision/batch_inference.dart
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ class BatchInference extends StatelessWidget {
),
),
),
const ModelProperties(),
ModelProperties(project: batchInference.imageInference.project),
],
);
}
Expand Down
1 change: 0 additions & 1 deletion lib/pages/computer_vision/computer_vision.dart
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ import 'package:fluent_ui/fluent_ui.dart';
import 'package:go_router/go_router.dart';
import 'package:inference/pages/computer_vision/batch_inference.dart';
import 'package:inference/pages/computer_vision/live_inference.dart';
import 'package:inference/pages/models/widgets/grid_container.dart';
import 'package:inference/project.dart';
import 'package:inference/providers/image_inference_provider.dart';
import 'package:inference/providers/preference_provider.dart';
Expand Down
2 changes: 1 addition & 1 deletion lib/pages/computer_vision/live_inference.dart
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ class _LiveInferenceState extends State<LiveInference> {
],
),
),
const ModelProperties(),
ModelProperties(project: widget.project),
],
);
}
Expand Down
Loading
Loading