Skip to content

Commit

Permalink
Add Swift example for generating subtitles (#318)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Sep 18, 2023
1 parent 2d51ca4 commit 692a47d
Show file tree
Hide file tree
Showing 9 changed files with 654 additions and 11 deletions.
131 changes: 129 additions & 2 deletions sherpa-onnx/c-api/c-api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
#include <utility>
#include <vector>

#include "sherpa-onnx/csrc/circular-buffer.h"
#include "sherpa-onnx/csrc/display.h"
#include "sherpa-onnx/csrc/offline-recognizer.h"
#include "sherpa-onnx/csrc/online-recognizer.h"
#include "sherpa-onnx/csrc/voice-activity-detector.h"

struct SherpaOnnxOnlineRecognizer {
std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl;
Expand Down Expand Up @@ -127,7 +129,7 @@ void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer,
recognizer->impl->DecodeStreams(ss.data(), n);
}

SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream) {
sherpa_onnx::OnlineRecognizerResult result =
recognizer->impl->GetResult(stream->impl.get());
Expand Down Expand Up @@ -340,7 +342,7 @@ void DecodeMultipleOfflineStreams(SherpaOnnxOfflineRecognizer *recognizer,
recognizer->impl->DecodeStreams(ss.data(), n);
}

SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
SherpaOnnxOfflineStream *stream) {
const sherpa_onnx::OfflineRecognitionResult &result =
stream->impl->GetResult();
Expand Down Expand Up @@ -372,3 +374,128 @@ void DestroyOfflineRecognizerResult(
delete[] r->timestamps;
delete r;
}

// ============================================================
// For VAD
// ============================================================
//
// Opaque handle exposed through the C API. It owns the underlying C++
// sherpa_onnx::CircularBuffer via `impl`, so deleting the handle also
// destroys the wrapped buffer.
struct SherpaOnnxCircularBuffer {
std::unique_ptr<sherpa_onnx::CircularBuffer> impl;
};

// Creates a circular-buffer handle for the C API.
//
// @param capacity Forwarded unchanged to the sherpa_onnx::CircularBuffer
//                 constructor.
// @return A heap-allocated handle. The caller releases it with
//         SherpaOnnxDestroyCircularBuffer().
SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(int32_t capacity) {
  // Construct the wrapped object first: if its constructor throws, no raw
  // handle has been allocated yet, so nothing leaks.
  auto impl = std::make_unique<sherpa_onnx::CircularBuffer>(capacity);

  // If this allocation throws, `impl` is released by its own destructor.
  auto *buffer = new SherpaOnnxCircularBuffer;
  buffer->impl = std::move(impl);
  return buffer;
}

// Releases a handle obtained from SherpaOnnxCreateCircularBuffer().
// Deleting the handle also destroys the wrapped buffer it owns.
void SherpaOnnxDestroyCircularBuffer(SherpaOnnxCircularBuffer *buffer) {
  SherpaOnnxCircularBuffer *const handle = buffer;
  delete handle;
}

// Forwards `n` samples starting at `p` to the wrapped buffer's Push().
void SherpaOnnxCircularBufferPush(SherpaOnnxCircularBuffer *buffer,
                                  const float *p, int32_t n) {
  auto &wrapped = *buffer->impl;
  wrapped.Push(p, n);
}

// Returns a freshly allocated copy of `n` samples starting at `start_index`.
//
// @param buffer      Handle created by SherpaOnnxCreateCircularBuffer().
// @param start_index Forwarded unchanged to CircularBuffer::Get().
// @param n           Number of samples requested.
// @return A heap array of exactly `n` floats that the caller frees with
//         SherpaOnnxCircularBufferFree(), or NULL if `n` is negative.
//         If the wrapped buffer returns fewer than `n` samples, the
//         remaining entries are zero.
const float *SherpaOnnxCircularBufferGet(SherpaOnnxCircularBuffer *buffer,
                                         int32_t start_index, int32_t n) {
  // A negative length would make `new float[n]` throw across the C boundary.
  if (n < 0) {
    return nullptr;
  }

  const std::vector<float> samples = buffer->impl->Get(start_index, n);

  // Value-initialise so the caller never reads indeterminate memory when
  // Get() yields fewer samples than requested.
  float *out = new float[n]();

  // Never copy more than the `n` slots that were allocated.
  const auto count = std::min(
      samples.size(), static_cast<std::vector<float>::size_type>(n));
  std::copy_n(samples.begin(), count, out);
  return out;
}

// Frees an array returned by SherpaOnnxCircularBufferGet().
// Passing a null pointer is a no-op, as with any delete[] expression.
void SherpaOnnxCircularBufferFree(const float *p) {
  const float *const samples = p;
  delete[] samples;
}

// Forwards to the wrapped buffer's Pop(n).
void SherpaOnnxCircularBufferPop(SherpaOnnxCircularBuffer *buffer, int32_t n) {
  auto &wrapped = *buffer->impl;
  wrapped.Pop(n);
}

// Returns whatever the wrapped buffer's Size() reports, converted to int32_t.
int32_t SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer) {
  const int32_t size = buffer->impl->Size();
  return size;
}

// Forwards to the wrapped buffer's Reset().
void SherpaOnnxCircularBufferReset(SherpaOnnxCircularBuffer *buffer) {
  auto &wrapped = *buffer->impl;
  wrapped.Reset();
}

// Opaque handle exposed through the C API. It owns the underlying C++
// sherpa_onnx::VoiceActivityDetector via `impl`, so deleting the handle also
// destroys the wrapped detector.
struct SherpaOnnxVoiceActivityDetector {
std::unique_ptr<sherpa_onnx::VoiceActivityDetector> impl;
};

// Creates a voice-activity-detector handle from a C configuration struct.
//
// Every field of `config` is passed through SHERPA_ONNX_OR together with a
// fallback value before being stored in a sherpa_onnx::VadModelConfig.
// NOTE(review): SHERPA_ONNX_OR is defined elsewhere in this file; presumably
// it selects the fallback when the field is zero/NULL -- confirm.
//
// @param config                 Dereferenced unconditionally; must not be
//                               NULL.
// @param buffer_size_in_seconds Forwarded unchanged to the
//                               VoiceActivityDetector constructor.
// @return A heap-allocated handle. The caller releases it with
//         SherpaOnnxDestroyVoiceActivityDetector().
SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
    const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds) {
  sherpa_onnx::VadModelConfig vad_config;

  vad_config.silero_vad.model = SHERPA_ONNX_OR(config->silero_vad.model, "");
  vad_config.silero_vad.threshold =
      SHERPA_ONNX_OR(config->silero_vad.threshold, 0.5);

  vad_config.silero_vad.min_silence_duration =
      SHERPA_ONNX_OR(config->silero_vad.min_silence_duration, 0.5);

  vad_config.silero_vad.min_speech_duration =
      SHERPA_ONNX_OR(config->silero_vad.min_speech_duration, 0.25);

  vad_config.silero_vad.window_size =
      SHERPA_ONNX_OR(config->silero_vad.window_size, 512);

  vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
  vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
  vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
  vad_config.debug = SHERPA_ONNX_OR(config->debug, false);

  // Dump the effective configuration when debugging is requested.
  if (vad_config.debug) {
    fprintf(stderr, "%s\n", vad_config.ToString().c_str());
  }

  // Construct the detector first: if its constructor throws, no raw handle
  // has been allocated yet, so nothing leaks.
  auto detector = std::make_unique<sherpa_onnx::VoiceActivityDetector>(
      vad_config, buffer_size_in_seconds);

  // If this allocation throws, `detector` is released by its own destructor.
  auto *p = new SherpaOnnxVoiceActivityDetector;
  p->impl = std::move(detector);

  return p;
}

// Releases a handle obtained from SherpaOnnxCreateVoiceActivityDetector().
// Deleting the handle also destroys the wrapped detector it owns.
void SherpaOnnxDestroyVoiceActivityDetector(
    SherpaOnnxVoiceActivityDetector *p) {
  SherpaOnnxVoiceActivityDetector *const handle = p;
  delete handle;
}

// Forwards `n` samples starting at `samples` to the wrapped detector's
// AcceptWaveform().
void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
    SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n) {
  auto &detector = *p->impl;
  detector.AcceptWaveform(samples, n);
}

// Returns the wrapped detector's Empty() result converted to int32_t.
int32_t SherpaOnnxVoiceActivityDetectorEmpty(
    SherpaOnnxVoiceActivityDetector *p) {
  const int32_t is_empty = p->impl->Empty();
  return is_empty;
}

// Forwards to the wrapped detector's Pop().
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
    SherpaOnnxVoiceActivityDetector *p) {
  auto &detector = *p->impl;
  detector.Pop();
}

// Copies the detector's front speech segment into a newly allocated
// SherpaOnnxSpeechSegment.
//
// Both the returned struct and its `samples` array are heap-allocated here;
// the caller frees them with SherpaOnnxDestroySpeechSegment().
SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p) {
  const sherpa_onnx::SpeechSegment &front = p->impl->Front();
  const auto &source = front.samples;

  auto *result = new SherpaOnnxSpeechSegment;
  result->start = front.start;

  // Deep-copy the samples so the result does not alias detector storage.
  result->samples = new float[source.size()];
  std::copy(source.begin(), source.end(), result->samples);
  result->n = source.size();

  return result;
}

// Frees a segment returned by SherpaOnnxVoiceActivityDetectorFront(),
// including its `samples` array.
//
// Passing NULL is a no-op, matching the other destroy/free functions in
// this file, which rely on `delete`/`delete[]` accepting a null pointer.
void SherpaOnnxDestroySpeechSegment(const SherpaOnnxSpeechSegment *p) {
  // Guard before touching p->samples; dereferencing NULL would crash.
  if (p == nullptr) {
    return;
  }

  delete[] p->samples;
  delete p;
}

// Forwards to the wrapped detector's Reset().
void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
  auto &detector = *p->impl;
  detector.Reset();
}
125 changes: 123 additions & 2 deletions sherpa-onnx/c-api/c-api.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ SHERPA_ONNX_API void DecodeMultipleOnlineStreams(
/// @return A pointer containing the result. The user has to invoke
/// DestroyOnlineRecognizerResult() to free the returned pointer to
/// avoid memory leak.
SHERPA_ONNX_API SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
SHERPA_ONNX_API const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream);

/// Destroy the pointer returned by GetOnlineStreamResult().
Expand Down Expand Up @@ -429,7 +429,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult {
/// @return Return a pointer to the result. The user has to invoke
/// DestroyOfflineRecognizerResult() to free the returned pointer to
/// avoid memory leak.
SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
SHERPA_ONNX_API const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
SherpaOnnxOfflineStream *stream);

/// Destroy the pointer returned by GetOfflineStreamResult().
Expand All @@ -438,6 +438,127 @@ SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
SHERPA_ONNX_API void DestroyOfflineRecognizerResult(
const SherpaOnnxOfflineRecognizerResult *r);

// ============================================================
// For VAD
// ============================================================

// Configuration of the silero VAD model, consumed by
// SherpaOnnxCreateVoiceActivityDetector().
//
// NOTE(review): the implementation passes each field through SHERPA_ONNX_OR
// with a fallback ("" / 0.5 / 0.5 / 0.25 / 512, in field order); presumably a
// zero or NULL field selects that fallback -- confirm against the macro.
SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
// Path to the silero VAD model
const char *model;

// threshold to classify a segment as speech
//
// If the predicted probability of a segment is larger than this
// value, then it is classified as speech.
float threshold;

// Minimum silence duration, in seconds
float min_silence_duration;

// Minimum speech duration, in seconds
float min_speech_duration;

// Window size, in samples
int window_size;
} SherpaOnnxSileroVadModelConfig;

// Top-level VAD configuration passed to
// SherpaOnnxCreateVoiceActivityDetector().
//
// NOTE(review): the implementation passes each scalar field through
// SHERPA_ONNX_OR with a fallback (16000 / 1 / "cpu" / false, in field
// order); presumably a zero or NULL field selects that fallback -- confirm.
SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
// Settings specific to the silero VAD model
SherpaOnnxSileroVadModelConfig silero_vad;

// Sample rate of the input audio
int32_t sample_rate;
// Number of threads -- presumably used for model inference; confirm
int32_t num_threads;
// Execution provider name, e.g. "cpu"
const char *provider;
// Non-zero makes the implementation print the effective config to stderr
int32_t debug;
} SherpaOnnxVadModelConfig;

SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer
SherpaOnnxCircularBuffer;

// Return an instance of circular buffer. The user has to use
// SherpaOnnxDestroyCircularBuffer() to free the returned pointer to avoid
// memory leak.
SHERPA_ONNX_API SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(
int32_t capacity);

// Free the pointer returned by SherpaOnnxCreateCircularBuffer()
SHERPA_ONNX_API void SherpaOnnxDestroyCircularBuffer(
SherpaOnnxCircularBuffer *buffer);

// Push n samples, read from the array pointed to by p, into the buffer.
SHERPA_ONNX_API void SherpaOnnxCircularBufferPush(
SherpaOnnxCircularBuffer *buffer, const float *p, int32_t n);

// Return n samples starting at the given index.
//
// Return a pointer to an array containing n samples starting at start_index.
// The user has to use SherpaOnnxCircularBufferFree() to free the returned
// pointer to avoid memory leak.
SHERPA_ONNX_API const float *SherpaOnnxCircularBufferGet(
SherpaOnnxCircularBuffer *buffer, int32_t start_index, int32_t n);

// Free the pointer returned by SherpaOnnxCircularBufferGet().
SHERPA_ONNX_API void SherpaOnnxCircularBufferFree(const float *p);

// Remove n elements from the buffer
SHERPA_ONNX_API void SherpaOnnxCircularBufferPop(
SherpaOnnxCircularBuffer *buffer, int32_t n);

// Return number of elements in the buffer.
SHERPA_ONNX_API int32_t
SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer);

// Clear all elements in the buffer
SHERPA_ONNX_API void SherpaOnnxCircularBufferReset(
SherpaOnnxCircularBuffer *buffer);

// A speech segment returned by SherpaOnnxVoiceActivityDetectorFront().
//
// The struct and its `samples` array are allocated by the library and must
// be released together with SherpaOnnxDestroySpeechSegment().
SHERPA_ONNX_API typedef struct SherpaOnnxSpeechSegment {
// The start index in samples of this segment
int32_t start;

// pointer to the array containing the samples (a copy owned by this struct)
float *samples;

// number of samples in this segment
int32_t n;
} SherpaOnnxSpeechSegment;

typedef struct SherpaOnnxVoiceActivityDetector SherpaOnnxVoiceActivityDetector;

// Return an instance of VoiceActivityDetector.
// The user has to use SherpaOnnxDestroyVoiceActivityDetector() to free
// the returned pointer to avoid memory leak.
SHERPA_ONNX_API SherpaOnnxVoiceActivityDetector *
SherpaOnnxCreateVoiceActivityDetector(const SherpaOnnxVadModelConfig *config,
float buffer_size_in_seconds);

// Free the pointer returned by SherpaOnnxCreateVoiceActivityDetector().
SHERPA_ONNX_API void SherpaOnnxDestroyVoiceActivityDetector(
SherpaOnnxVoiceActivityDetector *p);

// Feed n audio samples, read from the array pointed to by samples, to the
// detector.
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n);

// Return 1 if there are no speech segments available.
// Return 0 if there are speech segments.
SHERPA_ONNX_API int32_t
SherpaOnnxVoiceActivityDetectorEmpty(SherpaOnnxVoiceActivityDetector *p);

// Remove the first speech segment.
// It throws if SherpaOnnxVoiceActivityDetectorEmpty() returns 1.
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
SherpaOnnxVoiceActivityDetector *p);

// Return the first speech segment.
// The user has to use SherpaOnnxDestroySpeechSegment() to free the returned
// pointer to avoid memory leak.
SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p);

// Free the pointer returned by SherpaOnnxVoiceActivityDetectorFront().
SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(
const SherpaOnnxSpeechSegment *p);

// Re-initialize the voice activity detector.
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
SherpaOnnxVoiceActivityDetector *p);

#ifdef __cplusplus
} /* extern "C" */
#endif
Expand Down
4 changes: 2 additions & 2 deletions sherpa-onnx/csrc/hypothesis.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ void Hypotheses::Add(Hypothesis hyp) {
} else {
it->second.log_prob = LogAdd<double>()(it->second.log_prob, hyp.log_prob);

if (it->second.lm_log_prob != 0 && hyp.lm_log_prob != 0){
if (it->second.lm_log_prob != 0 && hyp.lm_log_prob != 0) {
it->second.lm_log_prob =
LogAdd<double>()(it->second.lm_log_prob, hyp.lm_log_prob);
LogAdd<double>()(it->second.lm_log_prob, hyp.lm_log_prob);
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions sherpa-onnx/csrc/silero-vad-model-config.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ struct SileroVadModelConfig {

// threshold to classify a segment as speech
//
// The predicted probability of a segment is larger than this
// If the predicted probability of a segment is larger than this
// value, then it is classified as speech.
float threshold = 0.5;

Expand All @@ -25,7 +25,7 @@ struct SileroVadModelConfig {

// 512, 1024, 1536 samples for 16000 Hz
// 256, 512, 768 samples for 8000 Hz
int window_size = 512; // in samples
int32_t window_size = 512; // in samples

SileroVadModelConfig() = default;

Expand Down
1 change: 1 addition & 0 deletions swift-api-examples/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
decode-file
decode-file-non-streaming
generate-subtitles
Loading

0 comments on commit 692a47d

Please sign in to comment.