From 750ab66e3b8a32670d937d4b835d95d5f7d12dfe Mon Sep 17 00:00:00 2001 From: Jhen Date: Sat, 23 Sep 2023 12:53:27 +0800 Subject: [PATCH 1/4] feat(ios): add option to save recorded audio as wav on startRealtimeTranscribe --- ios/RNWhisper.xcodeproj/project.pbxproj | 24 ++++++++-- ios/RNWhisperAudioUtils.h | 8 ++++ ios/RNWhisperAudioUtils.m | 64 +++++++++++++++++++++++++ ios/RNWhisperContext.mm | 12 +++++ src/index.ts | 1 - 5 files changed, 105 insertions(+), 4 deletions(-) create mode 100644 ios/RNWhisperAudioUtils.h create mode 100644 ios/RNWhisperAudioUtils.m diff --git a/ios/RNWhisper.xcodeproj/project.pbxproj b/ios/RNWhisper.xcodeproj/project.pbxproj index d4b0ce4..b6610ad 100644 --- a/ios/RNWhisper.xcodeproj/project.pbxproj +++ b/ios/RNWhisper.xcodeproj/project.pbxproj @@ -8,6 +8,9 @@ /* Begin PBXBuildFile section */ 5E555C0D2413F4C50049A1A2 /* RNWhisper.mm in Sources */ = {isa = PBXBuildFile; fileRef = B3E7B5891CC2AC0600A0062D /* RNWhisper.mm */; }; + 7FE0BBA12ABE6C7B0049B4E4 /* RNWhisperDownloader.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE0BB9B2ABE6C7B0049B4E4 /* RNWhisperDownloader.m */; }; + 7FE0BBA22ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE0BB9C2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m */; }; + 7FE0BBA32ABE6C7B0049B4E4 /* RNWhisperContext.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE0BBA02ABE6C7B0049B4E4 /* RNWhisperContext.mm */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -24,6 +27,13 @@ /* Begin PBXFileReference section */ 134814201AA4EA6300B7C361 /* libRNWhisper.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libRNWhisper.a; sourceTree = BUILT_PRODUCTS_DIR; }; + 7FE0BB9A2ABE6C7B0049B4E4 /* RNWhisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisper.h; sourceTree = ""; }; + 7FE0BB9B2ABE6C7B0049B4E4 /* RNWhisperDownloader.m */ = {isa = PBXFileReference;
fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = RNWhisperDownloader.m; sourceTree = ""; }; + 7FE0BB9C2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = RNWhisperAudioUtils.m; sourceTree = ""; }; + 7FE0BB9D2ABE6C7B0049B4E4 /* RNWhisperContext.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisperContext.h; sourceTree = ""; }; + 7FE0BB9E2ABE6C7B0049B4E4 /* RNWhisperDownloader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisperDownloader.h; sourceTree = ""; }; + 7FE0BB9F2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisperAudioUtils.h; sourceTree = ""; }; + 7FE0BBA02ABE6C7B0049B4E4 /* RNWhisperContext.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = RNWhisperContext.mm; sourceTree = ""; }; B3E7B5891CC2AC0600A0062D /* RNWhisper.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = RNWhisper.mm; sourceTree = ""; }; /* End PBXFileReference section */ @@ -49,6 +59,13 @@ 58B511D21A9E6C8500147676 = { isa = PBXGroup; children = ( + 7FE0BB9F2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.h */, + 7FE0BB9C2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m */, + 7FE0BB9A2ABE6C7B0049B4E4 /* RNWhisper.h */, + 7FE0BB9D2ABE6C7B0049B4E4 /* RNWhisperContext.h */, + 7FE0BBA02ABE6C7B0049B4E4 /* RNWhisperContext.mm */, + 7FE0BB9E2ABE6C7B0049B4E4 /* RNWhisperDownloader.h */, + 7FE0BB9B2ABE6C7B0049B4E4 /* RNWhisperDownloader.m */, B3E7B5891CC2AC0600A0062D /* RNWhisper.mm */, 134814211AA4EA7D00B7C361 /* Products */, ); @@ -112,6 +129,9 @@ buildActionMask = 2147483647; files = ( 5E555C0D2413F4C50049A1A2 /* RNWhisper.mm in Sources */, + 7FE0BBA22ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m in Sources */, + 7FE0BBA32ABE6C7B0049B4E4 
/* RNWhisperContext.mm in Sources */, + 7FE0BBA12ABE6C7B0049B4E4 /* RNWhisperDownloader.m in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -223,9 +243,7 @@ "$(SRCROOT)/../../react-native/React/**", ); LIBRARY_SEARCH_PATHS = "$(inherited)"; - OTHER_LDFLAGS = ( - "-ObjC", - ); + OTHER_LDFLAGS = "-ObjC"; PRODUCT_NAME = RNWhisper; SKIP_INSTALL = YES; }; diff --git a/ios/RNWhisperAudioUtils.h b/ios/RNWhisperAudioUtils.h new file mode 100644 index 0000000..dffe2e5 --- /dev/null +++ b/ios/RNWhisperAudioUtils.h @@ -0,0 +1,8 @@ +#import + +@interface RNWhisperAudioUtils : NSObject + ++ (NSData *)concatShortBuffers:(NSMutableArray *)buffers sliceSize:(int)sliceSize lastSliceSize:(int)lastSliceSize; ++ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile; + +@end diff --git a/ios/RNWhisperAudioUtils.m b/ios/RNWhisperAudioUtils.m new file mode 100644 index 0000000..bd7db33 --- /dev/null +++ b/ios/RNWhisperAudioUtils.m @@ -0,0 +1,64 @@ +#import "RNWhisperAudioUtils.h" +#import "whisper.h" + +@implementation RNWhisperAudioUtils + ++ (NSData *)concatShortBuffers:(NSMutableArray *)buffers sliceSize:(int)sliceSize lastSliceSize:(int)lastSliceSize { + NSMutableData *outputData = [NSMutableData data]; + for (NSValue *buffer in buffers) { + int size = sliceSize; + if (buffer == buffers.lastObject) { + size = lastSliceSize; + } + short *bufferPtr = buffer.pointerValue; + [outputData appendBytes:bufferPtr length:size * sizeof(short)]; + } + return outputData; +} + ++ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile { + NSMutableData *outputData = [NSMutableData data]; + + // WAVE header + [outputData appendData:[@"RIFF" dataUsingEncoding:NSUTF8StringEncoding]]; // chunk id + int chunkSize = CFSwapInt32HostToLittle(36 + rawData.length); + [outputData appendBytes:&chunkSize length:sizeof(chunkSize)]; + [outputData appendData:[@"WAVE" dataUsingEncoding:NSUTF8StringEncoding]]; // format + [outputData appendData:[@"fmt 
" dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 1 id + + int subchunk1Size = CFSwapInt32HostToLittle(16); + [outputData appendBytes:&subchunk1Size length:sizeof(subchunk1Size)]; + + short audioFormat = CFSwapInt16HostToLittle(1); // PCM + [outputData appendBytes:&audioFormat length:sizeof(audioFormat)]; + + short numChannels = CFSwapInt16HostToLittle(1); // mono + [outputData appendBytes:&numChannels length:sizeof(numChannels)]; + + int sampleRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE); + [outputData appendBytes:&sampleRate length:sizeof(sampleRate)]; + + // (bitDepth * sampleRate * channels) >> 3 + int byteRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE * 1 * 16 / 8); + [outputData appendBytes:&byteRate length:sizeof(byteRate)]; + + // (bitDepth * channels) >> 3 + short blockAlign = CFSwapInt16HostToLittle(16 / 8); + [outputData appendBytes:&blockAlign length:sizeof(blockAlign)]; + + // bitDepth + short bitsPerSample = CFSwapInt16HostToLittle(16); + [outputData appendBytes:&bitsPerSample length:sizeof(bitsPerSample)]; + + [outputData appendData:[@"data" dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 2 id + int subchunk2Size = CFSwapInt32HostToLittle((int)rawData.length); + [outputData appendBytes:&subchunk2Size length:sizeof(subchunk2Size)]; + + // Audio data + [outputData appendData:rawData]; + + // Save to file + [outputData writeToFile:audioOutputFile atomically:YES]; +} + +@end diff --git a/ios/RNWhisperContext.mm b/ios/RNWhisperContext.mm index 57baa8f..a300064 100644 --- a/ios/RNWhisperContext.mm +++ b/ios/RNWhisperContext.mm @@ -1,4 +1,5 @@ #import "RNWhisperContext.h" +#import "RNWhisperAudioUtils.h" #define NUM_BYTES_PER_BUFFER 16 * 1024 @@ -212,6 +213,17 @@ - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state { NSLog(@"[RNWhisper] Transcribe end"); result[@"isStoppedByAction"] = @(state->isStoppedByAction); result[@"isCapturing"] = @(false); + + // Save wav if needed + if (state->options[@"audioOutputPath"] != nil) 
{ + [RNWhisperAudioUtils + saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices + sliceSize:state->audioSliceSec * WHISPER_SAMPLE_RATE + lastSliceSize:nSamplesOfIndex] + audioOutputFile:state->options[@"audioOutputPath"] + ]; + } + state->transcribeHandler(state->jobId, @"end", result); } else if (code == 0) { result[@"isCapturing"] = @(true); diff --git a/src/index.ts b/src/index.ts index bb126b4..f8eaa84 100644 --- a/src/index.ts +++ b/src/index.ts @@ -60,7 +60,6 @@ export type TranscribeRealtimeOptions = TranscribeOptions & { realtimeAudioSliceSec?: number /** * Output path for audio file. If not set, the audio file will not be saved - * TODO: Support iOS * (Default: Undefined) */ audioOutputPath?: string From 30251df15e0f8704844e92efd3e6636b06954743 Mon Sep 17 00:00:00 2001 From: Jhen Date: Sat, 23 Sep 2023 13:02:33 +0800 Subject: [PATCH 2/4] feat(example): add audioOutputPath example & play button --- example/ios/Podfile.lock | 21 +++++++++++++++------ example/package.json | 1 + example/src/App.tsx | 31 +++++++++++++++++++++++++++++++ example/yarn.lock | 5 +++++ 4 files changed, 52 insertions(+), 6 deletions(-) diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock index a94a1fd..15a13c6 100644 --- a/example/ios/Podfile.lock +++ b/example/ios/Podfile.lock @@ -750,16 +750,21 @@ PODS: - React-perflogger (= 0.71.11) - RNFS (2.20.0): - React-Core - - RNZipArchive (6.0.9): + - RNSound (0.11.2): - React-Core - - RNZipArchive/Core (= 6.0.9) + - RNSound/Core (= 0.11.2) + - RNSound/Core (0.11.2): + - React-Core + - RNZipArchive (6.1.0): + - React-Core + - RNZipArchive/Core (= 6.1.0) - SSZipArchive (~> 2.2) - - RNZipArchive/Core (6.0.9): + - RNZipArchive/Core (6.1.0): - React-Core - SSZipArchive (~> 2.2) - SocketRocket (0.6.0) - SSZipArchive (2.4.3) - - whisper-rn (0.3.5): + - whisper-rn (0.3.6): - RCT-Folly - RCTRequired - RCTTypeSafety @@ -835,6 +840,7 @@ DEPENDENCIES: - React-runtimeexecutor (from 
`../node_modules/react-native/ReactCommon/runtimeexecutor`) - ReactCommon/turbomodule/core (from `../node_modules/react-native/ReactCommon`) - RNFS (from `../node_modules/react-native-fs`) + - RNSound (from `../node_modules/react-native-sound`) - RNZipArchive (from `../node_modules/react-native-zip-archive`) - whisper-rn (from `../..`) - Yoga (from `../node_modules/react-native/ReactCommon/yoga`) @@ -935,6 +941,8 @@ EXTERNAL SOURCES: :path: "../node_modules/react-native/ReactCommon" RNFS: :path: "../node_modules/react-native-fs" + RNSound: + :path: "../node_modules/react-native-sound" RNZipArchive: :path: "../node_modules/react-native-zip-archive" whisper-rn: @@ -994,10 +1002,11 @@ SPEC CHECKSUMS: React-runtimeexecutor: 4817d63dbc9d658f8dc0ec56bd9b83ce531129f0 ReactCommon: 08723d2ed328c5cbcb0de168f231bc7bae7f8aa1 RNFS: 4ac0f0ea233904cb798630b3c077808c06931688 - RNZipArchive: 68a0c6db4b1c103f846f1559622050df254a3ade + RNSound: 6c156f925295bdc83e8e422e7d8b38d33bc71852 + RNZipArchive: ef9451b849c45a29509bf44e65b788829ab07801 SocketRocket: fccef3f9c5cedea1353a9ef6ada904fde10d6608 SSZipArchive: fe6a26b2a54d5a0890f2567b5cc6de5caa600aef - whisper-rn: 6f293154b175fee138a994fa00d0f414fb1f44e9 + whisper-rn: e80c0482f6a632faafd601f98f10da0255c1e1ec Yoga: f7decafdc5e8c125e6fa0da38a687e35238420fa YogaKit: f782866e155069a2cca2517aafea43200b01fd5a diff --git a/example/package.json b/example/package.json index b43ae6a..0778175 100644 --- a/example/package.json +++ b/example/package.json @@ -12,6 +12,7 @@ "react": "18.2.0", "react-native": "0.71.11", "react-native-fs": "^2.20.0", + "react-native-sound": "^0.11.2", "react-native-zip-archive": "^6.1.0" }, "devDependencies": { diff --git a/example/src/App.tsx b/example/src/App.tsx index d75a013..06833a8 100644 --- a/example/src/App.tsx +++ b/example/src/App.tsx @@ -11,6 +11,7 @@ import { } from 'react-native' import RNFS from 'react-native-fs' import { unzip } from 'react-native-zip-archive' +import Sound from 'react-native-sound' 
import { initWhisper, libVersion } from '../../src' // whisper.rn import type { WhisperContext } from '../../src' import contextOpts from './context-opts' @@ -78,6 +79,8 @@ const fileDir = `${RNFS.DocumentDirectoryPath}/whisper` console.log('[App] fileDir', fileDir) +const recordFile = `${fileDir}/realtime.wav` + const modelHost = 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main' const createDir = async (log: any) => { @@ -259,6 +262,7 @@ export default function App() { } log('Start realtime transcribing...') try { + await createDir(log) const { stop, subscribe } = await whisperContext.transcribeRealtime({ language: 'en', @@ -266,6 +270,8 @@ export default function App() { realtimeAudioSec: 60, // Slice audio into 25 (or < 30) sec chunks for better performance realtimeAudioSliceSec: 25, + // Save audio on stop + audioOutputPath: recordFile, }) setStopTranscribe({ stop }) subscribe((evt) => { @@ -343,6 +349,31 @@ export default function App() { > Clear Download files + { + if (!await RNFS.exists(recordFile)) { + log('Recorded file does not exist') + return + } + const player = new Sound(recordFile, '', (e) => { + if (e) { + log('error', e) + return + } + player.play((success) => { + if (success) { + log('successfully finished playing'); + } else { + log('playback failed due to audio decoding errors'); + } + player.release(); + }); + }) + }} + > + Play Recorded file + ) diff --git a/example/yarn.lock b/example/yarn.lock index a15ef03..1247401 100644 --- a/example/yarn.lock +++ b/example/yarn.lock @@ -4172,6 +4172,11 @@ react-native-gradle-plugin@^0.71.19: resolved "https://registry.yarnpkg.com/react-native-gradle-plugin/-/react-native-gradle-plugin-0.71.19.tgz#3379e28341fcd189bc1f4691cefc84c1a4d7d232" integrity sha512-1dVk9NwhoyKHCSxcrM6vY6cxmojeATsBobDicX0ZKr7DgUF2cBQRTKsimQFvzH8XhOVXyH8p4HyDSZNIFI8OlQ== +react-native-sound@^0.11.2: + version "0.11.2" + resolved 
"https://registry.yarnpkg.com/react-native-sound/-/react-native-sound-0.11.2.tgz#e542dc5b9e16ab4b3ac7e6eaddb1fc8d98da9038" + integrity sha512-LmGc8lgOK3qecYMVQpyHvww/C+wgT6sWeMpVbOe4NCRGC2yKd4fo4U0KBUo9PO7AqKESO3I/2GZg1/C0+bwiiA== + react-native-zip-archive@^6.1.0: version "6.1.0" resolved "https://registry.yarnpkg.com/react-native-zip-archive/-/react-native-zip-archive-6.1.0.tgz#beed62dea9c7ff1e4fd4b6ce0e496ede5ab2f96f" From 1967edea3569376e23b6fd83b1b2f8f02f5d0fad Mon Sep 17 00:00:00 2001 From: Jhen Date: Sat, 23 Sep 2023 14:53:14 +0800 Subject: [PATCH 3/4] feat(ios): use sliceNSamples to concatShortBuffers --- ios/RNWhisperAudioUtils.h | 2 +- ios/RNWhisperAudioUtils.m | 10 ++++------ ios/RNWhisperContext.mm | 3 +-- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/ios/RNWhisperAudioUtils.h b/ios/RNWhisperAudioUtils.h index dffe2e5..3749daf 100644 --- a/ios/RNWhisperAudioUtils.h +++ b/ios/RNWhisperAudioUtils.h @@ -2,7 +2,7 @@ @interface RNWhisperAudioUtils : NSObject -+ (NSData *)concatShortBuffers:(NSMutableArray *)buffers sliceSize:(int)sliceSize lastSliceSize:(int)lastSliceSize; ++ (NSData *)concatShortBuffers:(NSMutableArray *)buffers sliceNSamples:(NSMutableArray *)sliceNSamples; + (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile; @end diff --git a/ios/RNWhisperAudioUtils.m b/ios/RNWhisperAudioUtils.m index bd7db33..bd8d443 100644 --- a/ios/RNWhisperAudioUtils.m +++ b/ios/RNWhisperAudioUtils.m @@ -3,13 +3,11 @@ @implementation RNWhisperAudioUtils -+ (NSData *)concatShortBuffers:(NSMutableArray *)buffers sliceSize:(int)sliceSize lastSliceSize:(int)lastSliceSize { ++ (NSData *)concatShortBuffers:(NSMutableArray *)buffers sliceNSamples:(NSMutableArray *)sliceNSamples { NSMutableData *outputData = [NSMutableData data]; - for (NSValue *buffer in buffers) { - int size = sliceSize; - if (buffer == buffers.lastObject) { - size = lastSliceSize; - } + for (int i = 0; i < buffers.count; i++) { + int size = [sliceNSamples 
objectAtIndex:i].intValue; + NSValue *buffer = [buffers objectAtIndex:i]; short *bufferPtr = buffer.pointerValue; [outputData appendBytes:bufferPtr length:size * sizeof(short)]; } diff --git a/ios/RNWhisperContext.mm b/ios/RNWhisperContext.mm index f50226b..6acea4a 100644 --- a/ios/RNWhisperContext.mm +++ b/ios/RNWhisperContext.mm @@ -251,8 +251,7 @@ - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state { if (state->options[@"audioOutputPath"] != nil) { [RNWhisperAudioUtils saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices - sliceSize:state->audioSliceSec * WHISPER_SAMPLE_RATE - lastSliceSize:nSamplesOfIndex] + sliceNSamples:state->sliceNSamples] audioOutputFile:state->options[@"audioOutputPath"] ]; } From d95689448dd17e73f1fcae9f6c1dfee8a939fd68 Mon Sep 17 00:00:00 2001 From: Jhen Date: Sat, 23 Sep 2023 15:09:11 +0800 Subject: [PATCH 4/4] feat: add todo --- android/src/main/java/com/rnwhisper/WhisperContext.java | 1 + ios/RNWhisperContext.mm | 1 + 2 files changed, 2 insertions(+) diff --git a/android/src/main/java/com/rnwhisper/WhisperContext.java b/android/src/main/java/com/rnwhisper/WhisperContext.java index 3f8552e..cd8889b 100644 --- a/android/src/main/java/com/rnwhisper/WhisperContext.java +++ b/android/src/main/java/com/rnwhisper/WhisperContext.java @@ -293,6 +293,7 @@ public void run() { Log.e(NAME, "Error transcribing realtime: " + e.getMessage()); } } + // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage Log.d(NAME, "Begin saving wav file to " + audioOutputPath); saveWavFile(concatShortBuffers(shortBufferSlices), audioOutputPath); if (!isTranscribing) { diff --git a/ios/RNWhisperContext.mm b/ios/RNWhisperContext.mm index 6acea4a..6401fd6 100644 --- a/ios/RNWhisperContext.mm +++ b/ios/RNWhisperContext.mm @@ -249,6 +249,7 @@ - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state { // Save wav if needed if (state->options[@"audioOutputPath"] != nil) { + // TODO: 
Append in real time so we don't need to keep all slices & also reduce memory usage [RNWhisperAudioUtils saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices sliceNSamples:state->sliceNSamples]