Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ios): add option to save recorded audio as WAV on startRealtimeTranscribe #132

Merged
merged 5 commits into from
Sep 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions android/src/main/java/com/rnwhisper/WhisperContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ public void run() {
Log.e(NAME, "Error transcribing realtime: " + e.getMessage());
}
}
// TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
Log.d(NAME, "Begin saving wav file to " + audioOutputPath);
saveWavFile(concatShortBuffers(shortBufferSlices), audioOutputPath);
if (!isTranscribing) {
Expand Down
9 changes: 9 additions & 0 deletions example/ios/Podfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -750,6 +750,11 @@ PODS:
- React-perflogger (= 0.71.11)
- RNFS (2.20.0):
- React-Core
- RNSound (0.11.2):
- React-Core
- RNSound/Core (= 0.11.2)
- RNSound/Core (0.11.2):
- React-Core
- RNZipArchive (6.1.0):
- React-Core
- RNZipArchive/Core (= 6.1.0)
Expand Down Expand Up @@ -835,6 +840,7 @@ DEPENDENCIES:
- React-runtimeexecutor (from `../node_modules/react-native/ReactCommon/runtimeexecutor`)
- ReactCommon/turbomodule/core (from `../node_modules/react-native/ReactCommon`)
- RNFS (from `../node_modules/react-native-fs`)
- RNSound (from `../node_modules/react-native-sound`)
- RNZipArchive (from `../node_modules/react-native-zip-archive`)
- whisper-rn (from `../..`)
- Yoga (from `../node_modules/react-native/ReactCommon/yoga`)
Expand Down Expand Up @@ -935,6 +941,8 @@ EXTERNAL SOURCES:
:path: "../node_modules/react-native/ReactCommon"
RNFS:
:path: "../node_modules/react-native-fs"
RNSound:
:path: "../node_modules/react-native-sound"
RNZipArchive:
:path: "../node_modules/react-native-zip-archive"
whisper-rn:
Expand Down Expand Up @@ -994,6 +1002,7 @@ SPEC CHECKSUMS:
React-runtimeexecutor: 4817d63dbc9d658f8dc0ec56bd9b83ce531129f0
ReactCommon: 08723d2ed328c5cbcb0de168f231bc7bae7f8aa1
RNFS: 4ac0f0ea233904cb798630b3c077808c06931688
RNSound: 6c156f925295bdc83e8e422e7d8b38d33bc71852
RNZipArchive: ef9451b849c45a29509bf44e65b788829ab07801
SocketRocket: fccef3f9c5cedea1353a9ef6ada904fde10d6608
SSZipArchive: fe6a26b2a54d5a0890f2567b5cc6de5caa600aef
Expand Down
1 change: 1 addition & 0 deletions example/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"react": "18.2.0",
"react-native": "0.71.11",
"react-native-fs": "^2.20.0",
"react-native-sound": "^0.11.2",
"react-native-zip-archive": "^6.1.0"
},
"devDependencies": {
Expand Down
31 changes: 31 additions & 0 deletions example/src/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import {
} from 'react-native'
import RNFS from 'react-native-fs'
import { unzip } from 'react-native-zip-archive'
import Sound from 'react-native-sound'
import { initWhisper, libVersion } from '../../src' // whisper.rn
import type { WhisperContext } from '../../src'
import contextOpts from './context-opts'
Expand Down Expand Up @@ -78,6 +79,8 @@ const fileDir = `${RNFS.DocumentDirectoryPath}/whisper`

console.log('[App] fileDir', fileDir)

const recordFile = `${fileDir}/realtime.wav`

const modelHost = 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main'

const createDir = async (log: any) => {
Expand Down Expand Up @@ -259,13 +262,16 @@ export default function App() {
}
log('Start realtime transcribing...')
try {
await createDir(log)
const { stop, subscribe } =
await whisperContext.transcribeRealtime({
language: 'en',
// Record duration in seconds
realtimeAudioSec: 60,
// Slice audio into 25 (or < 30) sec chunks for better performance
realtimeAudioSliceSec: 25,
// Save audio on stop
audioOutputPath: recordFile,
// Voice Activity Detection - Start transcribing when speech is detected
// useVad: true,
})
Expand Down Expand Up @@ -345,6 +351,31 @@ export default function App() {
>
<Text style={styles.buttonText}>Clear Download files</Text>
</TouchableOpacity>
<TouchableOpacity
style={[styles.button, styles.buttonClear]}
onPress={async () => {
if (!await RNFS.exists(recordFile)) {
log('Recorded file does not exist')
return
}
const player = new Sound(recordFile, '', (e) => {
if (e) {
log('error', e)
return
}
player.play((success) => {
if (success) {
log('successfully finished playing');
} else {
log('playback failed due to audio decoding errors');
}
player.release();
});
})
}}
>
<Text style={styles.buttonText}>Play Recorded file</Text>
</TouchableOpacity>
</SafeAreaView>
</ScrollView>
)
Expand Down
5 changes: 5 additions & 0 deletions example/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4172,6 +4172,11 @@ react-native-gradle-plugin@^0.71.19:
resolved "https://registry.yarnpkg.com/react-native-gradle-plugin/-/react-native-gradle-plugin-0.71.19.tgz#3379e28341fcd189bc1f4691cefc84c1a4d7d232"
integrity sha512-1dVk9NwhoyKHCSxcrM6vY6cxmojeATsBobDicX0ZKr7DgUF2cBQRTKsimQFvzH8XhOVXyH8p4HyDSZNIFI8OlQ==

react-native-sound@^0.11.2:
version "0.11.2"
resolved "https://registry.yarnpkg.com/react-native-sound/-/react-native-sound-0.11.2.tgz#e542dc5b9e16ab4b3ac7e6eaddb1fc8d98da9038"
integrity sha512-LmGc8lgOK3qecYMVQpyHvww/C+wgT6sWeMpVbOe4NCRGC2yKd4fo4U0KBUo9PO7AqKESO3I/2GZg1/C0+bwiiA==

react-native-zip-archive@^6.1.0:
version "6.1.0"
resolved "https://registry.yarnpkg.com/react-native-zip-archive/-/react-native-zip-archive-6.1.0.tgz#beed62dea9c7ff1e4fd4b6ce0e496ede5ab2f96f"
Expand Down
24 changes: 21 additions & 3 deletions ios/RNWhisper.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@

/* Begin PBXBuildFile section */
5E555C0D2413F4C50049A1A2 /* RNWhisper.mm in Sources */ = {isa = PBXBuildFile; fileRef = B3E7B5891CC2AC0600A0062D /* RNWhisper.mm */; };
7FE0BBA12ABE6C7B0049B4E4 /* RNWhisperDownloader.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE0BB9B2ABE6C7B0049B4E4 /* RNWhisperDownloader.m */; };
7FE0BBA22ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE0BB9C2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m */; };
7FE0BBA32ABE6C7B0049B4E4 /* RNWhisperContext.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE0BBA02ABE6C7B0049B4E4 /* RNWhisperContext.mm */; };
/* End PBXBuildFile section */

/* Begin PBXCopyFilesBuildPhase section */
Expand All @@ -24,6 +27,13 @@

/* Begin PBXFileReference section */
134814201AA4EA6300B7C361 /* libRNWhisper.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libRNWhisper.a; sourceTree = BUILT_PRODUCTS_DIR; };
7FE0BB9A2ABE6C7B0049B4E4 /* RNWhisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisper.h; sourceTree = "<group>"; };
7FE0BB9B2ABE6C7B0049B4E4 /* RNWhisperDownloader.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = RNWhisperDownloader.m; sourceTree = "<group>"; };
7FE0BB9C2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = RNWhisperAudioUtils.m; sourceTree = "<group>"; };
7FE0BB9D2ABE6C7B0049B4E4 /* RNWhisperContext.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisperContext.h; sourceTree = "<group>"; };
7FE0BB9E2ABE6C7B0049B4E4 /* RNWhisperDownloader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisperDownloader.h; sourceTree = "<group>"; };
7FE0BB9F2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisperAudioUtils.h; sourceTree = "<group>"; };
7FE0BBA02ABE6C7B0049B4E4 /* RNWhisperContext.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = RNWhisperContext.mm; sourceTree = "<group>"; };
B3E7B5891CC2AC0600A0062D /* RNWhisper.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = RNWhisper.mm; sourceTree = "<group>"; };
/* End PBXFileReference section */

Expand All @@ -49,6 +59,13 @@
58B511D21A9E6C8500147676 = {
isa = PBXGroup;
children = (
7FE0BB9F2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.h */,
7FE0BB9C2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m */,
7FE0BB9A2ABE6C7B0049B4E4 /* RNWhisper.h */,
7FE0BB9D2ABE6C7B0049B4E4 /* RNWhisperContext.h */,
7FE0BBA02ABE6C7B0049B4E4 /* RNWhisperContext.mm */,
7FE0BB9E2ABE6C7B0049B4E4 /* RNWhisperDownloader.h */,
7FE0BB9B2ABE6C7B0049B4E4 /* RNWhisperDownloader.m */,
B3E7B5891CC2AC0600A0062D /* RNWhisper.mm */,
134814211AA4EA7D00B7C361 /* Products */,
);
Expand Down Expand Up @@ -112,6 +129,9 @@
buildActionMask = 2147483647;
files = (
5E555C0D2413F4C50049A1A2 /* RNWhisper.mm in Sources */,
7FE0BBA22ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m in Sources */,
7FE0BBA32ABE6C7B0049B4E4 /* RNWhisperContext.mm in Sources */,
7FE0BBA12ABE6C7B0049B4E4 /* RNWhisperDownloader.m in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
Expand Down Expand Up @@ -223,9 +243,7 @@
"$(SRCROOT)/../../react-native/React/**",
);
LIBRARY_SEARCH_PATHS = "$(inherited)";
OTHER_LDFLAGS = (
"-ObjC",
);
OTHER_LDFLAGS = "-ObjC";
PRODUCT_NAME = RNWhisper;
SKIP_INSTALL = YES;
};
Expand Down
8 changes: 8 additions & 0 deletions ios/RNWhisperAudioUtils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#import <Foundation/Foundation.h>

/// Class-method helpers for turning recorded 16-bit sample slices into a WAV file.
@interface RNWhisperAudioUtils : NSObject

/// Joins recorded sample slices into a single contiguous byte buffer.
/// @param buffers Slice pointers wrapped in NSValue; each pointer is read as `short` samples.
/// @param sliceNSamples Number of samples to copy from the slice at the same index in `buffers`.
/// @return The bytes of every slice, appended in array order.
+ (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples;
/// Writes `rawData` to `audioOutputFile` preceded by a 44-byte RIFF/WAVE header
/// (PCM, mono, 16 bits per sample, sample rate WHISPER_SAMPLE_RATE).
/// @param rawData Sample bytes placed verbatim after the header.
/// @param audioOutputFile Destination file path.
/// @note Returns nothing, so the caller gets no indication of whether the write succeeded.
+ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile;

@end
62 changes: 62 additions & 0 deletions ios/RNWhisperAudioUtils.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#import "RNWhisperAudioUtils.h"
#import "whisper.h"

@implementation RNWhisperAudioUtils

/// Appends a 16-bit value to `data` in little-endian byte order, as RIFF requires.
static void RNWhisperAppendUInt16LE(NSMutableData *data, uint16_t value) {
    uint16_t littleEndian = CFSwapInt16HostToLittle(value);
    [data appendBytes:&littleEndian length:sizeof(littleEndian)];
}

/// Appends a 32-bit value to `data` in little-endian byte order, as RIFF requires.
static void RNWhisperAppendUInt32LE(NSMutableData *data, uint32_t value) {
    uint32_t littleEndian = CFSwapInt32HostToLittle(value);
    [data appendBytes:&littleEndian length:sizeof(littleEndian)];
}

/// Joins recorded sample slices into a single contiguous byte buffer.
///
/// @param buffers Slice pointers wrapped in NSValue; each pointer is read as `short` samples.
/// @param sliceNSamples Number of samples to copy from the slice at the same index in `buffers`.
/// @return The bytes of every usable slice, appended in array order. Slices with a NULL
///         pointer or a non-positive sample count contribute nothing.
+ (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples {
    NSMutableData *outputData = [NSMutableData data];

    // Walk only the indices present in both arrays: indexing sliceNSamples past its
    // end would raise NSRangeException and take the whole recording down with it.
    NSUInteger sliceCount = MIN(buffers.count, sliceNSamples.count);
    for (NSUInteger i = 0; i < sliceCount; i++) {
        NSInteger sampleCount = sliceNSamples[i].integerValue;
        const short *samples = (const short *)buffers[i].pointerValue;

        // A negative count would turn into a huge unsigned byte length, and a NULL
        // pointer with a non-zero length would be dereferenced by appendBytes:.
        if (samples == NULL || sampleCount <= 0) {
            continue;
        }
        [outputData appendBytes:samples length:(NSUInteger)sampleCount * sizeof(short)];
    }
    return outputData;
}

/// Writes `rawData` to `audioOutputFile` as a WAV file: a 44-byte RIFF/WAVE header
/// (PCM, mono, 16 bits per sample, sample rate WHISPER_SAMPLE_RATE) followed by the samples.
///
/// @param rawData Sample bytes placed verbatim after the header. Nil or empty data
///                produces a header-only file.
/// @param audioOutputFile Destination file path. The write is atomic.
///
/// Failures are logged rather than raised, since the method has no return value.
+ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile {
    // Bytes counted by the RIFF chunk size besides the audio itself:
    // "WAVE" (4) + "fmt " subchunk (8 + 16) + "data" subchunk header (8).
    static const uint32_t kHeaderBytesAfterRiffSize = 36;
    static const uint16_t kChannelCount = 1;   // mono
    static const uint16_t kBitsPerSample = 16;

    if (audioOutputFile.length == 0) {
        NSLog(@"[RNWhisper] Skip saving wav file: output path is empty");
        return;
    }

    // RIFF size fields are 32-bit; refuse to emit a header whose sizes would wrap.
    NSUInteger dataLength = rawData.length;
    if (dataLength > UINT32_MAX - kHeaderBytesAfterRiffSize) {
        NSLog(@"[RNWhisper] Skip saving wav file: %lu bytes exceed the WAV size limit",
              (unsigned long)dataLength);
        return;
    }

    uint32_t sampleRate = (uint32_t)WHISPER_SAMPLE_RATE;
    NSMutableData *outputData = [NSMutableData dataWithCapacity:kHeaderBytesAfterRiffSize + 8 + dataLength];

    // RIFF chunk descriptor
    [outputData appendBytes:"RIFF" length:4];                                       // chunk id
    RNWhisperAppendUInt32LE(outputData, kHeaderBytesAfterRiffSize + (uint32_t)dataLength); // chunk size
    [outputData appendBytes:"WAVE" length:4];                                       // format

    // "fmt " subchunk
    [outputData appendBytes:"fmt " length:4];                                       // subchunk 1 id
    RNWhisperAppendUInt32LE(outputData, 16);                                        // subchunk 1 size
    RNWhisperAppendUInt16LE(outputData, 1);                                         // audio format: PCM
    RNWhisperAppendUInt16LE(outputData, kChannelCount);
    RNWhisperAppendUInt32LE(outputData, sampleRate);
    RNWhisperAppendUInt32LE(outputData, sampleRate * kChannelCount * kBitsPerSample / 8); // byte rate
    RNWhisperAppendUInt16LE(outputData, kChannelCount * kBitsPerSample / 8);        // block align
    RNWhisperAppendUInt16LE(outputData, kBitsPerSample);

    // "data" subchunk
    [outputData appendBytes:"data" length:4];                                       // subchunk 2 id
    RNWhisperAppendUInt32LE(outputData, (uint32_t)dataLength);                      // subchunk 2 size
    if (dataLength > 0) {
        [outputData appendData:rawData];
    }

    // Check the return value (not the error pointer) so a failed save is visible in logs.
    NSError *writeError = nil;
    if (![outputData writeToFile:audioOutputFile options:NSDataWritingAtomic error:&writeError]) {
        NSLog(@"[RNWhisper] Failed to save wav file to %@: %@", audioOutputFile, writeError);
    }
}

@end
12 changes: 12 additions & 0 deletions ios/RNWhisperContext.mm
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#import "RNWhisperContext.h"
#import "RNWhisperAudioUtils.h"
#include <vector>

#define NUM_BYTES_PER_BUFFER 16 * 1024
Expand Down Expand Up @@ -245,6 +246,17 @@ - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
NSLog(@"[RNWhisper] Transcribe end");
result[@"isStoppedByAction"] = @(state->isStoppedByAction);
result[@"isCapturing"] = @(false);

// Save wav if needed
if (state->options[@"audioOutputPath"] != nil) {
// TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
[RNWhisperAudioUtils
saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
sliceNSamples:state->sliceNSamples]
audioOutputFile:state->options[@"audioOutputPath"]
];
}

state->transcribeHandler(state->jobId, @"end", result);
} else if (code == 0) {
result[@"isCapturing"] = @(true);
Expand Down
1 change: 0 additions & 1 deletion src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ export type TranscribeRealtimeOptions = TranscribeOptions & {
realtimeAudioSliceSec?: number
/**
* Output path for audio file. If not set, the audio file will not be saved
* TODO: Support iOS
* (Default: Undefined)
*/
audioOutputPath?: string
Expand Down
Loading