feat(ios): add option to save recorded audio as wav on startRealtim…

…eTranscribe (#132) * feat(ios): add option to save recorded audio as wav on startRealtimeTranscribe * feat(example): add audioOutputPath example & play button * feat(ios): use sliceNSamples to concatShortBuffers * feat: add todo
mybigday · Sep 23, 2023 · fe8216c · fe8216c
1 parent 965409d
commit fe8216c
Show file tree

Hide file tree

Showing 10 changed files with 150 additions and 4 deletions.
diff --git a/android/src/main/java/com/rnwhisper/WhisperContext.java b/android/src/main/java/com/rnwhisper/WhisperContext.java
@@ -293,6 +293,7 @@ public void run() {
               Log.e(NAME, "Error transcribing realtime: " + e.getMessage());
             }
           }
+          // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
           Log.d(NAME, "Begin saving wav file to " + audioOutputPath);
           saveWavFile(concatShortBuffers(shortBufferSlices), audioOutputPath);
           if (!isTranscribing) {

diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock
@@ -750,6 +750,11 @@ PODS:
     - React-perflogger (= 0.71.11)
   - RNFS (2.20.0):
     - React-Core
+  - RNSound (0.11.2):
+    - React-Core
+    - RNSound/Core (= 0.11.2)
+  - RNSound/Core (0.11.2):
+    - React-Core
   - RNZipArchive (6.1.0):
     - React-Core
     - RNZipArchive/Core (= 6.1.0)
@@ -835,6 +840,7 @@ DEPENDENCIES:
   - React-runtimeexecutor (from `../node_modules/react-native/ReactCommon/runtimeexecutor`)
   - ReactCommon/turbomodule/core (from `../node_modules/react-native/ReactCommon`)
   - RNFS (from `../node_modules/react-native-fs`)
+  - RNSound (from `../node_modules/react-native-sound`)
   - RNZipArchive (from `../node_modules/react-native-zip-archive`)
   - whisper-rn (from `../..`)
   - Yoga (from `../node_modules/react-native/ReactCommon/yoga`)
@@ -935,6 +941,8 @@ EXTERNAL SOURCES:
     :path: "../node_modules/react-native/ReactCommon"
   RNFS:
     :path: "../node_modules/react-native-fs"
+  RNSound:
+    :path: "../node_modules/react-native-sound"
   RNZipArchive:
     :path: "../node_modules/react-native-zip-archive"
   whisper-rn:
@@ -994,6 +1002,7 @@ SPEC CHECKSUMS:
   React-runtimeexecutor: 4817d63dbc9d658f8dc0ec56bd9b83ce531129f0
   ReactCommon: 08723d2ed328c5cbcb0de168f231bc7bae7f8aa1
   RNFS: 4ac0f0ea233904cb798630b3c077808c06931688
+  RNSound: 6c156f925295bdc83e8e422e7d8b38d33bc71852
   RNZipArchive: ef9451b849c45a29509bf44e65b788829ab07801
   SocketRocket: fccef3f9c5cedea1353a9ef6ada904fde10d6608
   SSZipArchive: fe6a26b2a54d5a0890f2567b5cc6de5caa600aef

diff --git a/example/package.json b/example/package.json
@@ -12,6 +12,7 @@
     "react": "18.2.0",
     "react-native": "0.71.11",
     "react-native-fs": "^2.20.0",
+    "react-native-sound": "^0.11.2",
     "react-native-zip-archive": "^6.1.0"
   },
   "devDependencies": {

diff --git a/example/src/App.tsx b/example/src/App.tsx
@@ -11,6 +11,7 @@ import {
 } from 'react-native'
 import RNFS from 'react-native-fs'
 import { unzip } from 'react-native-zip-archive'
+import Sound from 'react-native-sound'
 import { initWhisper, libVersion } from '../../src' // whisper.rn
 import type { WhisperContext } from '../../src'
 import contextOpts from './context-opts'
@@ -78,6 +79,8 @@ const fileDir = `${RNFS.DocumentDirectoryPath}/whisper`
 
 console.log('[App] fileDir', fileDir)
 
+const recordFile = `${fileDir}/realtime.wav`
+
 const modelHost = 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main'
 
 const createDir = async (log: any) => {
@@ -259,13 +262,16 @@ export default function App() {
               }
               log('Start realtime transcribing...')
               try {
+                await createDir(log)
                 const { stop, subscribe } =
                   await whisperContext.transcribeRealtime({
                     language: 'en',
                     // Record duration in seconds
                     realtimeAudioSec: 60,
                     // Slice audio into 25 (or < 30) sec chunks for better performance
                     realtimeAudioSliceSec: 25,
+                    // Save audio on stop
+                    audioOutputPath: recordFile,
                     // Voice Activity Detection - Start transcribing when speech is detected
                     // useVad: true,
                   })
@@ -345,6 +351,31 @@ export default function App() {
         >
           <Text style={styles.buttonText}>Clear Download files</Text>
         </TouchableOpacity>
+        <TouchableOpacity
+          style={[styles.button, styles.buttonClear]}
+          onPress={async () => {
+            if (!await RNFS.exists(recordFile)) {
+              log('Recorded file does not exist')
+              return
+            }
+            const player = new Sound(recordFile, '', (e) => {
+              if (e) {
+                log('error', e)
+                return
+              }
+              player.play((success) => {
+                if (success) {
+                  log('successfully finished playing');
+                } else {
+                  log('playback failed due to audio decoding errors');
+                }
+                player.release();
+              });
+            })
+          }}
+        >
+          <Text style={styles.buttonText}>Play Recorded file</Text>
+        </TouchableOpacity>
       </SafeAreaView>
     </ScrollView>
   )

diff --git a/example/yarn.lock b/example/yarn.lock
@@ -4172,6 +4172,11 @@ react-native-gradle-plugin@^0.71.19:
   resolved "https://registry.yarnpkg.com/react-native-gradle-plugin/-/react-native-gradle-plugin-0.71.19.tgz#3379e28341fcd189bc1f4691cefc84c1a4d7d232"
   integrity sha512-1dVk9NwhoyKHCSxcrM6vY6cxmojeATsBobDicX0ZKr7DgUF2cBQRTKsimQFvzH8XhOVXyH8p4HyDSZNIFI8OlQ==
 
+react-native-sound@^0.11.2:
+  version "0.11.2"
+  resolved "https://registry.yarnpkg.com/react-native-sound/-/react-native-sound-0.11.2.tgz#e542dc5b9e16ab4b3ac7e6eaddb1fc8d98da9038"
+  integrity sha512-LmGc8lgOK3qecYMVQpyHvww/C+wgT6sWeMpVbOe4NCRGC2yKd4fo4U0KBUo9PO7AqKESO3I/2GZg1/C0+bwiiA==
+
 react-native-zip-archive@^6.1.0:
   version "6.1.0"
   resolved "https://registry.yarnpkg.com/react-native-zip-archive/-/react-native-zip-archive-6.1.0.tgz#beed62dea9c7ff1e4fd4b6ce0e496ede5ab2f96f"

diff --git a/ios/RNWhisper.xcodeproj/project.pbxproj b/ios/RNWhisper.xcodeproj/project.pbxproj
@@ -8,6 +8,9 @@
 
 /* Begin PBXBuildFile section */
 		5E555C0D2413F4C50049A1A2 /* RNWhisper.mm in Sources */ = {isa = PBXBuildFile; fileRef = B3E7B5891CC2AC0600A0062D /* RNWhisper.mm */; };
+		7FE0BBA12ABE6C7B0049B4E4 /* RNWhisperDownloader.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE0BB9B2ABE6C7B0049B4E4 /* RNWhisperDownloader.m */; };
+		7FE0BBA22ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE0BB9C2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m */; };
+		7FE0BBA32ABE6C7B0049B4E4 /* RNWhisperContext.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE0BBA02ABE6C7B0049B4E4 /* RNWhisperContext.mm */; };
 /* End PBXBuildFile section */
 
 /* Begin PBXCopyFilesBuildPhase section */
@@ -24,6 +27,13 @@
 
 /* Begin PBXFileReference section */
 		134814201AA4EA6300B7C361 /* libRNWhisper.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libRNWhisper.a; sourceTree = BUILT_PRODUCTS_DIR; };
+		7FE0BB9A2ABE6C7B0049B4E4 /* RNWhisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisper.h; sourceTree = "<group>"; };
+		7FE0BB9B2ABE6C7B0049B4E4 /* RNWhisperDownloader.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = RNWhisperDownloader.m; sourceTree = "<group>"; };
+		7FE0BB9C2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = RNWhisperAudioUtils.m; sourceTree = "<group>"; };
+		7FE0BB9D2ABE6C7B0049B4E4 /* RNWhisperContext.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisperContext.h; sourceTree = "<group>"; };
+		7FE0BB9E2ABE6C7B0049B4E4 /* RNWhisperDownloader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisperDownloader.h; sourceTree = "<group>"; };
+		7FE0BB9F2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisperAudioUtils.h; sourceTree = "<group>"; };
+		7FE0BBA02ABE6C7B0049B4E4 /* RNWhisperContext.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = RNWhisperContext.mm; sourceTree = "<group>"; };
 		B3E7B5891CC2AC0600A0062D /* RNWhisper.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = RNWhisper.mm; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
@@ -49,6 +59,13 @@
 		58B511D21A9E6C8500147676 = {
 			isa = PBXGroup;
 			children = (
+				7FE0BB9F2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.h */,
+				7FE0BB9C2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m */,
+				7FE0BB9A2ABE6C7B0049B4E4 /* RNWhisper.h */,
+				7FE0BB9D2ABE6C7B0049B4E4 /* RNWhisperContext.h */,
+				7FE0BBA02ABE6C7B0049B4E4 /* RNWhisperContext.mm */,
+				7FE0BB9E2ABE6C7B0049B4E4 /* RNWhisperDownloader.h */,
+				7FE0BB9B2ABE6C7B0049B4E4 /* RNWhisperDownloader.m */,
 				B3E7B5891CC2AC0600A0062D /* RNWhisper.mm */,
 				134814211AA4EA7D00B7C361 /* Products */,
 			);
@@ -112,6 +129,9 @@
 			buildActionMask = 2147483647;
 			files = (
 				5E555C0D2413F4C50049A1A2 /* RNWhisper.mm in Sources */,
+				7FE0BBA22ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m in Sources */,
+				7FE0BBA32ABE6C7B0049B4E4 /* RNWhisperContext.mm in Sources */,
+				7FE0BBA12ABE6C7B0049B4E4 /* RNWhisperDownloader.m in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -223,9 +243,7 @@
 					"$(SRCROOT)/../../react-native/React/**",
 				);
 				LIBRARY_SEARCH_PATHS = "$(inherited)";
-				OTHER_LDFLAGS = (
-					"-ObjC",
-				);
+				OTHER_LDFLAGS = "-ObjC";
 				PRODUCT_NAME = RNWhisper;
 				SKIP_INSTALL = YES;
 			};

diff --git a/ios/RNWhisperAudioUtils.h b/ios/RNWhisperAudioUtils.h
@@ -0,0 +1,8 @@
+#import <Foundation/Foundation.h>
+/**
+ * Class-method helpers for turning recorded 16-bit sample slices into a WAV
+ * file. The class declares no instance state; both methods are called on the
+ * class itself.
+ */
+@interface RNWhisperAudioUtils : NSObject
+/**
+ * concatShortBuffers:sliceNSamples:
+ *   Walks `buffers` in order, reads each NSValue's pointer as `short *`, and
+ *   appends `sliceNSamples[i]` samples from it. Returns the joined bytes.
+ *   `sliceNSamples` is indexed in parallel with `buffers`.
+ *
+ * saveWavFile:audioOutputFile:
+ *   Prepends a RIFF/WAVE header (PCM, mono, 16 bits per sample, sample rate
+ *   WHISPER_SAMPLE_RATE) to `rawData` and writes the result atomically to the
+ *   path `audioOutputFile`. Returns nothing, so the outcome of the write is
+ *   not reported to the caller.
+ */
++ (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples;
++ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile;
+
+@end
diff --git a/ios/RNWhisperAudioUtils.m b/ios/RNWhisperAudioUtils.m
@@ -0,0 +1,62 @@
+#import "RNWhisperAudioUtils.h"
+#import "whisper.h"
+
+// Helpers for assembling recorded 16-bit sample slices and saving them as a WAV file.
+@implementation RNWhisperAudioUtils
+
+// Joins the recorded slices into one contiguous buffer of 16-bit samples.
+//
+// buffers       - NSValue-wrapped pointers; each is read as `short *`.
+// sliceNSamples - number of samples to copy from the buffer at the same index.
+//
+// Returns the concatenated bytes (empty when there is nothing to copy).
+// An entry is skipped, rather than raising or reading through a bad length,
+// when it has no matching sample count, a NULL pointer, or a count <= 0.
++ (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples {
+    NSMutableData *outputData = [NSMutableData data];
+    // Only visit indexes present in both arrays: objectAtIndex: raises
+    // NSRangeException when asked for an index past the end.
+    NSUInteger sliceCount = MIN(buffers.count, sliceNSamples.count);
+    for (NSUInteger i = 0; i < sliceCount; i++) {
+        int nSamples = sliceNSamples[i].intValue;
+        const short *samples = buffers[i].pointerValue;
+        // A negative count would turn into a huge unsigned byte length below.
+        if (samples == NULL || nSamples <= 0) {
+            continue;
+        }
+        [outputData appendBytes:samples length:(NSUInteger)nSamples * sizeof(short)];
+    }
+    return outputData;
+}
+
+// Writes rawData to audioOutputFile as a WAV file: a 44-byte RIFF/WAVE header
+// followed by the sample bytes.
+//
+// rawData         - sample bytes; the header describes them as PCM, mono,
+//                   16 bits per sample, at WHISPER_SAMPLE_RATE.
+// audioOutputFile - destination path; the file is written atomically.
+//
+// Returns nothing. An empty path, data too large for the 32-bit RIFF size
+// fields, or a failed write is logged and the method returns.
++ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile {
+    // Bytes counted by the RIFF chunk size in addition to the sample data
+    // ("WAVE" + fmt subchunk + data subchunk header).
+    static const uint32_t kRiffOverhead = 36;
+
+    if (audioOutputFile.length == 0) {
+        NSLog(@"[RNWhisper] Cannot save wav file: output path is empty");
+        return;
+    }
+    // The RIFF size fields are unsigned 32-bit; refuse data that would wrap them.
+    if (rawData.length > UINT32_MAX - kRiffOverhead) {
+        NSLog(@"[RNWhisper] Cannot save wav file: %lu bytes of audio exceed the WAV size limit",
+              (unsigned long)rawData.length);
+        return;
+    }
+    uint32_t dataSize = (uint32_t)rawData.length;
+
+    NSMutableData *outputData = [NSMutableData data];
+
+    // WAVE header (all multi-byte fields are little-endian)
+    [outputData appendBytes:"RIFF" length:4]; // chunk id
+    uint32_t chunkSize = CFSwapInt32HostToLittle(kRiffOverhead + dataSize);
+    [outputData appendBytes:&chunkSize length:sizeof(chunkSize)];
+    [outputData appendBytes:"WAVE" length:4]; // format
+    [outputData appendBytes:"fmt " length:4]; // subchunk 1 id
+
+    uint32_t subchunk1Size = CFSwapInt32HostToLittle(16);
+    [outputData appendBytes:&subchunk1Size length:sizeof(subchunk1Size)];
+
+    uint16_t audioFormat = CFSwapInt16HostToLittle(1); // PCM
+    [outputData appendBytes:&audioFormat length:sizeof(audioFormat)];
+
+    uint16_t numChannels = CFSwapInt16HostToLittle(1); // mono
+    [outputData appendBytes:&numChannels length:sizeof(numChannels)];
+
+    uint32_t sampleRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE);
+    [outputData appendBytes:&sampleRate length:sizeof(sampleRate)];
+
+    // (bitDepth * sampleRate * channels) >> 3
+    uint32_t byteRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE * 1 * 16 / 8);
+    [outputData appendBytes:&byteRate length:sizeof(byteRate)];
+
+    // (bitDepth * channels) >> 3
+    uint16_t blockAlign = CFSwapInt16HostToLittle(16 / 8);
+    [outputData appendBytes:&blockAlign length:sizeof(blockAlign)];
+
+    // bitDepth
+    uint16_t bitsPerSample = CFSwapInt16HostToLittle(16);
+    [outputData appendBytes:&bitsPerSample length:sizeof(bitsPerSample)];
+
+    [outputData appendBytes:"data" length:4]; // subchunk 2 id
+    uint32_t subchunk2Size = CFSwapInt32HostToLittle(dataSize);
+    [outputData appendBytes:&subchunk2Size length:sizeof(subchunk2Size)];
+
+    // Audio data
+    if (dataSize > 0) {
+        [outputData appendData:rawData];
+    }
+
+    // Save to file; check the return value so a failed save is not silent.
+    NSError *writeError = nil;
+    if (![outputData writeToFile:audioOutputFile options:NSDataWritingAtomic error:&writeError]) {
+        NSLog(@"[RNWhisper] Failed to save wav file to %@: %@", audioOutputFile, writeError);
+    }
+}
+
+@end
diff --git a/ios/RNWhisperContext.mm b/ios/RNWhisperContext.mm
@@ -1,4 +1,5 @@
 #import "RNWhisperContext.h"
+#import "RNWhisperAudioUtils.h"
 #include <vector>
 
 #define NUM_BYTES_PER_BUFFER 16 * 1024
@@ -245,6 +246,17 @@ - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
         NSLog(@"[RNWhisper] Transcribe end");
         result[@"isStoppedByAction"] = @(state->isStoppedByAction);
         result[@"isCapturing"] = @(false);
+
+        // Save wav if needed
+        if (state->options[@"audioOutputPath"] != nil) {
+            // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
+            [RNWhisperAudioUtils
+                saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
+                                sliceNSamples:state->sliceNSamples]
+                audioOutputFile:state->options[@"audioOutputPath"]
+            ];
+        }
+
         state->transcribeHandler(state->jobId, @"end", result);
     } else if (code == 0) {
         result[@"isCapturing"] = @(true);

diff --git a/src/index.ts b/src/index.ts
@@ -60,7 +60,6 @@ export type TranscribeRealtimeOptions = TranscribeOptions & {
   realtimeAudioSliceSec?: number
   /**
    * Output path for audio file. If not set, the audio file will not be saved
-   * TODO: Support iOS
    * (Default: Undefined)
    */
   audioOutputPath?: string