Skip to content

Commit

Permalink
feat(ios): support wav base64 for transcribe & add transcribeData for…
Browse files Browse the repository at this point in the history
… no header data (raw PCM)
  • Loading branch information
jhen0409 committed Nov 6, 2024
1 parent 6301734 commit 2d7b130
Show file tree
Hide file tree
Showing 8 changed files with 180 additions and 80 deletions.
6 changes: 3 additions & 3 deletions example/ios/Podfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1298,10 +1298,10 @@ EXTERNAL SOURCES:

SPEC CHECKSUMS:
boost: d3f49c53809116a5d38da093a8aa78bf551aed09
DoubleConversion: 5189b271737e1565bdce30deb4a08d647e3f5f54
DoubleConversion: fea03f2699887d960129cc54bba7e52542b6f953
FBLazyVector: fbc4957d9aa695250b55d879c1d86f79d7e69ab4
fmt: ff9d55029c625d3757ed641535fd4a75fedc7ce9
glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b
glog: c5d68082e772fa1c511173d6b30a9de2c05a69a2
hermes-engine: b361c9ef5ef3cda53f66e195599b47e1f84ffa35
libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913
RCT-Folly: 7169b2b1c44399c76a47b5deaaba715eeeb476c0
Expand Down Expand Up @@ -1355,7 +1355,7 @@ SPEC CHECKSUMS:
SocketRocket: f32cd54efbe0f095c4d7594881e52619cfe80b17
SSZipArchive: c69881e8ac5521f0e622291387add5f60f30f3c4
whisper-rn: d8fa919d9e9cffc1ac6e72d7da9a95b9d9344641
Yoga: e64aa65de36c0832d04e8c7bd614396c77a80047
Yoga: 13c8ef87792450193e117976337b8527b49e8c03

PODFILE CHECKSUM: 80498ebe77b4fb086f5bdde47d6aa337318b67f0

Expand Down
140 changes: 103 additions & 37 deletions ios/RNWhisper.mm
Original file line number Diff line number Diff line change
Expand Up @@ -105,42 +105,17 @@ - (NSArray *)supportedEvents {
];
}

RCT_REMAP_METHOD(transcribeFile,
withContextId:(int)contextId
withJobId:(int)jobId
withWaveFile:(NSString *)waveFilePath
withOptions:(NSDictionary *)options
withResolver:(RCTPromiseResolveBlock)resolve
withRejecter:(RCTPromiseRejectBlock)reject)
- (void)transcribeData:(RNWhisperContext *)context
withContextId:(int)contextId
withJobId:(int)jobId
withData:(float *)data
withDataCount:(int)count
withOptions:(NSDictionary *)options
withResolver:(RCTPromiseResolveBlock)resolve
withRejecter:(RCTPromiseRejectBlock)reject
{
RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];

if (context == nil) {
reject(@"whisper_error", @"Context not found", nil);
return;
}
if ([context isCapturing]) {
reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
return;
}
if ([context isTranscribing]) {
reject(@"whisper_error", @"Context is already transcribing", nil);
return;
}

NSString *path = waveFilePath;
if ([path hasPrefix:@"http://"] || [path hasPrefix:@"https://"]) {
path = [RNWhisperDownloader downloadFile:path toFile:nil];
}

int count = 0;
float *waveFile = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
if (waveFile == nil) {
reject(@"whisper_error", @"Invalid file", nil);
return;
}
[context transcribeFile:jobId
audioData:waveFile
[context transcribeData:jobId
audioData:data
audioDataCount:count
options:options
onProgress: ^(int progress) {
Expand Down Expand Up @@ -173,18 +148,109 @@ - (NSArray *)supportedEvents {
}
onEnd: ^(int code) {
if (code != 0 && code != 999) {
free(waveFile);
reject(@"whisper_cpp_error", [NSString stringWithFormat:@"Failed to transcribe the file. Code: %d", code], nil);
return;
}
free(waveFile);
NSMutableDictionary *result = [context getTextSegments];
result[@"isAborted"] = @([context isStoppedByAction]);
resolve(result);
}
];
}

// Transcribes a wav source passed from JS. Accepts three input forms:
//   1. http(s) URL            -> downloaded, then decoded as a wav file
//   2. `data:audio/wav;base64,` URI -> base64 wav payload (header stripped by decoder)
//   3. local file path        -> decoded as a wav file
RCT_REMAP_METHOD(transcribeFile,
                 withContextId:(int)contextId
                 withJobId:(int)jobId
                 withWaveFile:(NSString *)waveFilePathOrDataBase64
                 withOptions:(NSDictionary *)options
                 withResolver:(RCTPromiseResolveBlock)resolve
                 withRejecter:(RCTPromiseRejectBlock)reject)
{
    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];

    if (context == nil) {
        reject(@"whisper_error", @"Context not found", nil);
        return;
    }
    if ([context isCapturing]) {
        reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
        return;
    }
    if ([context isTranscribing]) {
        reject(@"whisper_error", @"Context is already transcribing", nil);
        return;
    }

    // Derive the prefix length from the constant instead of a magic `22`.
    static NSString * const kWavDataURIPrefix = @"data:audio/wav;base64,";

    float *data = nil;
    int count = 0;
    if ([waveFilePathOrDataBase64 hasPrefix:@"http://"] || [waveFilePathOrDataBase64 hasPrefix:@"https://"]) {
        NSString *path = [RNWhisperDownloader downloadFile:waveFilePathOrDataBase64 toFile:nil];
        // downloadFile: may fail; passing a nil path into the decoder would
        // throw (fileURLWithPath: rejects nil) instead of rejecting the promise.
        if (path == nil) {
            reject(@"whisper_error", @"Failed to download the file", nil);
            return;
        }
        data = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
    } else if ([waveFilePathOrDataBase64 hasPrefix:kWavDataURIPrefix]) {
        NSData *waveData = [[NSData alloc] initWithBase64EncodedString:[waveFilePathOrDataBase64 substringFromIndex:kWavDataURIPrefix.length] options:0];
        // initWithBase64EncodedString: returns nil for malformed base64;
        // reject explicitly rather than decoding a nil buffer.
        if (waveData == nil) {
            reject(@"whisper_error", @"Invalid file", nil);
            return;
        }
        data = [RNWhisperAudioUtils decodeWaveData:waveData count:&count cutHeader:YES];
    } else {
        data = [RNWhisperAudioUtils decodeWaveFile:waveFilePathOrDataBase64 count:&count];
    }
    if (data == nil) {
        reject(@"whisper_error", @"Invalid file", nil);
        return;
    }

    [self transcribeData:context
           withContextId:contextId
               withJobId:jobId
                withData:data
           withDataCount:count
             withOptions:options
            withResolver:resolve
            withRejecter:reject
    ];
}

// Transcribes raw PCM audio passed from JS as a base64 string.
// Unlike transcribeFile, the payload carries no wav header (cutHeader:NO).
RCT_REMAP_METHOD(transcribeData,
                 withContextId:(int)contextId
                 withJobId:(int)jobId
                 withData:(NSString *)dataBase64 // 16-bit PCM samples, base64 encoded, no header
                 withOptions:(NSDictionary *)options
                 withResolver:(RCTPromiseResolveBlock)resolve
                 withRejecter:(RCTPromiseRejectBlock)reject)
{
    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];

    if (context == nil) {
        reject(@"whisper_error", @"Context not found", nil);
        return;
    }
    if ([context isCapturing]) {
        reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
        return;
    }
    if ([context isTranscribing]) {
        reject(@"whisper_error", @"Context is already transcribing", nil);
        return;
    }

    NSData *pcmData = [[NSData alloc] initWithBase64EncodedString:dataBase64 options:0];
    // initWithBase64EncodedString: returns nil for malformed base64. Bail out
    // here — messaging nil downstream makes the decoder return a zero-sample
    // malloc(0) buffer that passes the nil check and silently transcribes nothing.
    if (pcmData == nil) {
        reject(@"whisper_error", @"Invalid data", nil);
        return;
    }

    int count = 0;
    float *data = [RNWhisperAudioUtils decodeWaveData:pcmData count:&count cutHeader:NO];

    if (data == nil) {
        reject(@"whisper_error", @"Invalid data", nil);
        return;
    }

    [self transcribeData:context
           withContextId:contextId
               withJobId:jobId
                withData:data
           withDataCount:count
             withOptions:options
            withResolver:resolve
            withRejecter:reject
    ];
}

RCT_REMAP_METHOD(startRealtimeTranscribe,
withContextId:(int)contextId
withJobId:(int)jobId
Expand Down
1 change: 1 addition & 0 deletions ios/RNWhisperAudioUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

// Helpers for decoding 16-bit PCM wav audio into float sample buffers.
@interface RNWhisperAudioUtils : NSObject

/// Converts 16-bit PCM bytes into a malloc'd float array normalized to [-1, 1].
/// @param data Raw audio bytes; when cutHeader is YES the first 44 bytes
///        (assumed to be a canonical wav header) are skipped before decoding.
/// @param count Out-parameter receiving the number of decoded samples.
/// @param cutHeader Whether to strip a 44-byte wav header before decoding.
/// @return Caller-owned buffer of float samples; caller must free() it.
+ (float *)decodeWaveData:(NSData*)data count:(int *)count cutHeader:(BOOL)cutHeader;
/// Reads a wav file from disk and decodes it into a caller-owned float buffer
/// (free() when done); returns nil if the file cannot be read.
+ (float *)decodeWaveFile:(NSString*)filePath count:(int *)count;

@end
33 changes: 20 additions & 13 deletions ios/RNWhisperAudioUtils.m
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,32 @@

@implementation RNWhisperAudioUtils

/// Converts 16-bit PCM bytes into a malloc'd float array normalized to [-1, 1].
/// Returns NULL on invalid input; on success the caller owns (and must free) the buffer.
+ (float *)decodeWaveData:(NSData*)data count:(int *)count cutHeader:(BOOL)cutHeader {
    if (data == nil) return NULL;
    NSData *waveData = data;
    if (cutHeader) {
        // Naive header strip: assumes a canonical 44-byte RIFF/WAVE header.
        // Guard the length first — [data length] is unsigned, so `length - 44`
        // underflows for short buffers and subdataWithRange: raises NSRangeException.
        if ([data length] <= 44) return NULL;
        waveData = [data subdataWithRange:NSMakeRange(44, [data length]-44)];
    }
    const short *shortArray = (const short *)[waveData bytes];
    int shortCount = (int) ([waveData length] / sizeof(short));
    float *floatArray = (float *) malloc(shortCount * sizeof(float));
    if (floatArray == NULL) return NULL;
    for (NSInteger i = 0; i < shortCount; i++) {
        // Normalize int16 to [-1, 1]; clamping handles -32768 / 32767 < -1.
        float floatValue = ((float)shortArray[i]) / 32767.0;
        floatValue = MAX(floatValue, -1.0);
        floatValue = MIN(floatValue, 1.0);
        floatArray[i] = floatValue;
    }
    *count = shortCount;
    return floatArray;
}

/// Reads a wav file from disk and decodes it into a caller-owned float buffer.
/// Returns nil if the path is nil or the file cannot be read.
/// NOTE: the diff rendering fused the old inline loop and the new delegating body
/// (an unreachable second `return` followed `return floatArray;`) — this is the
/// single, final implementation.
+ (float *)decodeWaveFile:(NSString*)filePath count:(int *)count {
    // fileURLWithPath: throws on nil, so guard before constructing the URL.
    if (filePath == nil) return NULL;
    NSURL *url = [NSURL fileURLWithPath:filePath];
    NSData *fileData = [NSData dataWithContentsOfURL:url];
    if (fileData == nil) {
        return nil;
    }
    // Delegate to the shared decoder; cutHeader:YES strips the 44-byte wav header.
    return [RNWhisperAudioUtils decodeWaveData:fileData count:count cutHeader:YES];
}

@end
2 changes: 1 addition & 1 deletion ios/RNWhisperContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ typedef struct {
- (OSStatus)transcribeRealtime:(int)jobId
options:(NSDictionary *)options
onTranscribe:(void (^)(int, NSString *, NSDictionary *))onTranscribe;
- (void)transcribeFile:(int)jobId
- (void)transcribeData:(int)jobId
audioData:(float *)audioData
audioDataCount:(int)audioDataCount
options:(NSDictionary *)options
Expand Down
2 changes: 1 addition & 1 deletion ios/RNWhisperContext.mm
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ - (OSStatus)transcribeRealtime:(int)jobId
bool tdrzEnable;
};

- (void)transcribeFile:(int)jobId
- (void)transcribeData:(int)jobId
audioData:(float *)audioData
audioDataCount:(int)audioDataCount
options:(NSDictionary *)options
Expand Down
8 changes: 7 additions & 1 deletion src/NativeRNWhisper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,13 @@ export interface Spec extends TurboModule {
transcribeFile(
contextId: number,
jobId: number,
path: string,
pathOrBase64: string,
options: {}, // TranscribeOptions & { onProgress?: boolean, onNewSegments?: boolean }
): Promise<TranscribeResult>;
transcribeData(
contextId: number,
jobId: number,
dataBase64: string,
options: {}, // TranscribeOptions & { onProgress?: boolean, onNewSegments?: boolean }
): Promise<TranscribeResult>;
startRealtimeTranscribe(
Expand Down
68 changes: 44 additions & 24 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -202,32 +202,10 @@ export class WhisperContext {
this.reasonNoGPU = reasonNoGPU
}

/** Transcribe audio file */
transcribe(
filePath: string | number,
options: TranscribeFileOptions = {},
): {
/** Stop the transcribe */
private transcribeWithNativeMethod(method: 'transcribeFile' | 'transcribeData', data: string, options: TranscribeFileOptions = {}): {
stop: () => Promise<void>
/** Transcribe result promise */
promise: Promise<TranscribeResult>
} {
let path = ''
if (typeof filePath === 'number') {
try {
const source = Image.resolveAssetSource(filePath)
if (source) path = source.uri
} catch (e) {
throw new Error(`Invalid asset: ${filePath}`)
}
} else {
if (filePath.startsWith('http'))
throw new Error(
'Transcribe remote file is not supported, please download it first',
)
path = filePath
}
if (path.startsWith('file://')) path = path.slice(7)
const jobId: number = Math.floor(Math.random() * 10000)

const { onProgress, onNewSegments, ...rest } = options
Expand Down Expand Up @@ -276,7 +254,7 @@ export class WhisperContext {
removeProgressListener()
removeNewSegmenetsListener()
},
promise: RNWhisper.transcribeFile(this.id, jobId, path, {
promise: RNWhisper[method](this.id, jobId, data, {
...rest,
onProgress: !!onProgress,
onNewSegments: !!onNewSegments,
Expand All @@ -298,6 +276,48 @@ export class WhisperContext {
}
}

/**
 * Transcribe an audio file.
 * Accepts a local path, a bundled asset reference (number), or a base64
 * encoded wav file prefixed with `data:audio/wav;base64,`.
 */
transcribe(
  filePathOrBase64: string | number,
  options: TranscribeFileOptions = {},
): {
  /** Stop the transcribe */
  stop: () => Promise<void>
  /** Transcribe result promise */
  promise: Promise<TranscribeResult>
} {
  let path = ''
  if (typeof filePathOrBase64 === 'number') {
    // A number is a bundled asset reference — resolve it to its URI.
    try {
      const source = Image.resolveAssetSource(filePathOrBase64)
      if (source) path = source.uri
    } catch (e) {
      throw new Error(`Invalid asset: ${filePathOrBase64}`)
    }
  } else if (filePathOrBase64.startsWith('http')) {
    throw new Error(
      'Transcribe remote file is not supported, please download it first',
    )
  } else {
    path = filePathOrBase64
  }
  // The native side expects a bare filesystem path, not a file:// URI.
  if (path.startsWith('file://')) path = path.slice(7)
  return this.transcribeWithNativeMethod('transcribeFile', path, options)
}

/**
 * Transcribe audio data (base64 encoded PCM samples, no wav header)
 * NOTE(review): documented here as "float32 PCM" in the original, but the iOS
 * decoder (`decodeWaveData` with `cutHeader:NO`) reads the buffer as 16-bit
 * integer samples and converts them to float — confirm the intended sample
 * format and align this doc with the native implementation.
 */
transcribeData(data: string, options: TranscribeFileOptions = {}): {
  stop: () => Promise<void>
  promise: Promise<TranscribeResult>
} {
  return this.transcribeWithNativeMethod('transcribeData', data, options)
}

/** Transcribe the microphone audio stream, the microphone user permission is required */
async transcribeRealtime(options: TranscribeRealtimeOptions = {}): Promise<{
/** Stop the realtime transcribe */
Expand Down

0 comments on commit 2d7b130

Please sign in to comment.