Skip to content

Commit

Permalink
Add VAD + Non-streaming ASR model for Android
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj committed Sep 23, 2023
1 parent 5e1a924 commit c244a40
Show file tree
Hide file tree
Showing 15 changed files with 278 additions and 12 deletions.
1 change: 1 addition & 0 deletions android/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,4 @@ google-services.json

# Android Profiling
*.hprof
*.so
10 changes: 10 additions & 0 deletions android/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,13 @@
Please refer to
https://k2-fsa.github.io/sherpa/onnx/android/index.html
for usage.

- [SherpaOnnx](./SherpaOnnx) It uses a streaming ASR model.

- [SherpaOnnx2Pass](./SherpaOnnx2Pass) It uses a streaming ASR model
for the first pass and a non-streaming ASR model for the second pass.

- [SherpaOnnxVad](./SherpaOnnxVad) It demonstrates how to use a VAD.

- [SherpaOnnxVadAsr](./SherpaOnnxVadAsr) It uses a VAD with a non-streaming
ASR model.
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ data class OnlineRecognizerConfig(
var enableEndpoint: Boolean = true,
var decodingMethod: String = "greedy_search",
var maxActivePaths: Int = 4,
var hotwordsFile: String = "",
var hotwordsScore: Float = 1.5f,
)

data class OfflineTransducerModelConfig(
Expand Down Expand Up @@ -87,6 +89,8 @@ data class OfflineRecognizerConfig(
// var lmConfig: OfflineLMConfig(), // TODO(fangjun): enable it
var decodingMethod: String = "greedy_search",
var maxActivePaths: Int = 4,
var hotwordsFile: String = "",
var hotwordsScore: Float = 1.5f,
)

class SherpaOnnx(
Expand Down Expand Up @@ -370,6 +374,19 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
)
}

5 -> {
val modelDir = "sherpa-onnx-zipformer-multi-zh-hans-2023-9-2"
return OfflineModelConfig(
transducer = OfflineTransducerModelConfig(
encoder = "$modelDir/encoder-epoch-20-avg-1.int8.onnx",
decoder = "$modelDir/decoder-epoch-20-avg-1.onnx",
joiner = "$modelDir/joiner-epoch-20-avg-1.int8.onnx",
),
tokens = "$modelDir/tokens.txt",
modelType = "zipformer2",
)
}

}
return null
}
4 changes: 2 additions & 2 deletions android/SherpaOnnxVadAsr/app/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ plugins {

android {
namespace 'com.k2fsa.sherpa.onnx'
compileSdk 32
compileSdk 33

defaultConfig {
applicationId "com.k2fsa.sherpa.onnx"
minSdk 21
targetSdk 32
targetSdk 33
versionCode 1
versionName "1.0"

Expand Down
2 changes: 2 additions & 0 deletions android/SherpaOnnxVadAsr/app/src/main/AndroidManifest.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:tools="http://schemas.android.com/tools">

<uses-permission android:name="android.permission.RECORD_AUDIO" />

<application
android:allowBackup="true"
android:dataExtractionRules="@xml/data_extraction_rules"
Expand Down
1 change: 1 addition & 0 deletions android/SherpaOnnxVadAsr/app/src/main/assets/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.onnx
Original file line number Diff line number Diff line change
@@ -1,11 +1,213 @@
package com.k2fsa.sherpa.onnx

import androidx.appcompat.app.AppCompatActivity
import android.Manifest
import android.content.pm.PackageManager
import android.media.AudioFormat
import android.media.AudioRecord
import android.media.MediaRecorder
import android.os.Bundle
import android.text.method.ScrollingMovementMethod
import android.util.Log
import android.view.View
import android.widget.Button
import android.widget.TextView
import androidx.appcompat.app.AppCompatActivity
import androidx.core.app.ActivityCompat
import kotlin.concurrent.thread


private const val TAG = "sherpa-onnx"
private const val REQUEST_RECORD_AUDIO_PERMISSION = 200

/**
 * Demo activity that records 16 kHz mono audio from the microphone, feeds it
 * to a VAD and decodes every speech segment the VAD emits with a
 * non-streaming recognizer. The recognized segments are appended to a
 * scrollable text view.
 *
 * Threading: the button handler runs on the UI thread; audio reading, VAD and
 * decoding run on a single worker thread started per recording session. The
 * worker owns the [AudioRecord] it was started with and is the only place
 * where that recorder is released, so it can never be released while a
 * `read()` is still in flight.
 */
class MainActivity : AppCompatActivity() {

    private lateinit var recordButton: Button
    private lateinit var textView: TextView

    private lateinit var vad: Vad

    // Recorder of the current session; null while not recording.
    private var audioRecord: AudioRecord? = null
    private var recordingThread: Thread? = null
    private val audioSource = MediaRecorder.AudioSource.MIC
    private val sampleRateInHz = 16000
    private val channelConfig = AudioFormat.CHANNEL_IN_MONO

    // Note: We don't use AudioFormat.ENCODING_PCM_FLOAT
    // since the AudioRecord.read(float[]) needs API level >= 23
    // but we are targeting API level >= 21
    private val audioFormat = AudioFormat.ENCODING_PCM_16BIT

    private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)

    // Non-streaming ASR
    private lateinit var offlineRecognizer: SherpaOnnxOffline

    // Index of the next recognized segment and the accumulated transcript.
    // Reset on the UI thread only while no worker is alive; otherwise they
    // are touched by the worker thread only.
    private var idx: Int = 0
    private var lastText: String = ""

    @Volatile
    private var isRecording: Boolean = false

    /**
     * Closes the activity when the RECORD_AUDIO permission is denied.
     *
     * Results for other request codes are ignored. A cancelled request is
     * delivered with empty arrays and is treated as a denial.
     */
    override fun onRequestPermissionsResult(
        requestCode: Int, permissions: Array<String>, grantResults: IntArray
    ) {
        super.onRequestPermissionsResult(requestCode, permissions, grantResults)
        if (requestCode != REQUEST_RECORD_AUDIO_PERMISSION) {
            return
        }

        // firstOrNull(): grantResults is empty when the dialog was cancelled.
        val permissionToRecordAccepted =
            grantResults.firstOrNull() == PackageManager.PERMISSION_GRANTED

        if (!permissionToRecordAccepted) {
            Log.e(TAG, "Audio record is disallowed")
            finish()
            return
        }

        Log.i(TAG, "Audio record is permitted")
    }

    /** Requests the microphone permission, loads both models and wires up the views. */
    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        setContentView(R.layout.activity_main)

        ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)

        Log.i(TAG, "Start to initialize model")
        initVadModel()
        Log.i(TAG, "Finished initializing model")

        Log.i(TAG, "Start to initialize non-streaimng recognizer")
        initOfflineRecognizer()
        Log.i(TAG, "Finished initializing non-streaming recognizer")

        recordButton = findViewById(R.id.record_button)
        recordButton.setOnClickListener { onclick() }

        textView = findViewById(R.id.my_text)
        textView.movementMethod = ScrollingMovementMethod()
    }

    /** Toggles between recording and idle. */
    private fun onclick() {
        if (isRecording) {
            stopRecording()
        } else {
            startRecording()
        }
    }

    /** Opens the microphone, resets the transcript and the VAD, and starts the worker thread. */
    private fun startRecording() {
        // The worker of the previous session may still be decoding its last
        // segment; it shares vad/lastText/idx, so do not start a second one.
        if (recordingThread?.isAlive == true) {
            Log.w(TAG, "Previous recording is still being processed")
            return
        }

        if (!initMicrophone()) {
            Log.e(TAG, "Failed to initialize microphone")
            return
        }
        val recorder = audioRecord ?: return

        Log.i(TAG, "state: ${recorder.state}")
        recorder.startRecording()
        recordButton.setText(R.string.stop)
        isRecording = true

        textView.text = ""
        lastText = ""
        idx = 0

        vad.reset()
        recordingThread = thread(start = true) {
            try {
                processSamples(recorder)
            } finally {
                // Released here, after the last read() has returned.
                recorder.release()
            }
        }
        Log.i(TAG, "Started recording")
    }

    /**
     * Asks the worker to finish. The recorder is only stopped here, which
     * makes a blocked `read()` return; the worker releases it when it exits.
     */
    private fun stopRecording() {
        isRecording = false

        try {
            audioRecord?.stop()
        } catch (e: IllegalStateException) {
            // The worker already released the recorder (e.g. after a read error).
            Log.w(TAG, "AudioRecord was already released", e)
        }
        audioRecord = null

        recordButton.setText(R.string.start)
        Log.i(TAG, "Stopped recording")
    }

    /** Creates [vad] from the model configuration of type 0. */
    private fun initVadModel() {
        val type = 0
        Log.i(TAG, "Select VAD model type ${type}")
        val config = requireNotNull(getVadModelConfig(type)) {
            "No VAD model config for type $type"
        }

        vad = Vad(
            assetManager = application.assets,
            config = config,
        )
    }

    /**
     * Creates the [AudioRecord] for a new session and stores it in [audioRecord].
     *
     * @return true when a usable recorder was created; false when the
     * permission is missing (it is requested again), the audio parameters are
     * not supported, or the recorder failed to initialize.
     */
    private fun initMicrophone(): Boolean {
        if (ActivityCompat.checkSelfPermission(
                this, Manifest.permission.RECORD_AUDIO
            ) != PackageManager.PERMISSION_GRANTED
        ) {
            ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)
            return false
        }

        // getMinBufferSize() returns a size in bytes, or a negative error code.
        val numBytes = AudioRecord.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat)
        if (numBytes <= 0) {
            Log.e(TAG, "AudioRecord.getMinBufferSize failed with code $numBytes")
            return false
        }

        val bytesPerSample = 2 // 16-bit PCM, mono
        Log.i(
            TAG,
            "buffer size in milliseconds: ${numBytes * 1000.0f / (bytesPerSample * sampleRateInHz)}"
        )

        val recorder = AudioRecord(
            audioSource,
            sampleRateInHz,
            channelConfig,
            audioFormat,
            numBytes * 2 // twice the minimum, as headroom against overruns
        )
        if (recorder.state != AudioRecord.STATE_INITIALIZED) {
            Log.e(TAG, "AudioRecord failed to initialize, state: ${recorder.state}")
            recorder.release()
            return false
        }

        audioRecord = recorder
        return true
    }

    /**
     * Worker loop: reads PCM chunks from [recorder] until recording is
     * stopped or a read fails, pushes them through the VAD and decodes every
     * completed speech segment.
     *
     * Does not release [recorder]; the caller does that after this returns.
     */
    private fun processSamples(recorder: AudioRecord) {
        Log.i(TAG, "processing samples")

        val bufferSize = 512 // in samples
        val buffer = ShortArray(bufferSize)

        while (isRecording) {
            val numRead = recorder.read(buffer, 0, buffer.size)
            if (numRead < 0) {
                // Negative values are AudioRecord error codes. They are also
                // seen right after stop(); only report them while recording.
                if (isRecording) {
                    Log.e(TAG, "AudioRecord.read failed with code $numRead")
                    // Bring the UI back to idle, unless a newer session exists.
                    runOnUiThread {
                        if (audioRecord === recorder) {
                            stopRecording()
                        }
                    }
                }
                break
            }
            if (numRead == 0) {
                continue
            }

            // Convert 16-bit PCM to floats in [-1, 1).
            val samples = FloatArray(numRead) { buffer[it] / 32768.0f }
            vad.acceptWaveform(samples)

            var textChanged = false
            while (!vad.empty()) {
                val segment = vad.front()
                // Element 1 of the returned array holds the segment's samples.
                val speech = segment[1] as FloatArray
                val text = runSecondPass(speech)

                if (text.isNotBlank()) {
                    lastText = "${lastText}\n${idx}: ${text}"
                    idx += 1
                    textChanged = true
                }

                vad.pop()
            }

            if (textChanged) {
                // Snapshot on the worker so the UI thread never reads the
                // field while it is being modified.
                val display = lastText.lowercase()
                runOnUiThread {
                    textView.text = display
                }
            }
        }
    }

    /** Creates [offlineRecognizer] from the model configuration of type 0. */
    private fun initOfflineRecognizer() {
        // Please change getOfflineModelConfig() to add new models
        // See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
        // for a list of available models
        val secondType = 0
        Log.i(TAG, "Select model type ${secondType} for the second pass")

        val modelConfig = requireNotNull(getOfflineModelConfig(type = secondType)) {
            "No offline model config for type $secondType"
        }
        val config = OfflineRecognizerConfig(
            featConfig = getFeatureConfig(sampleRate = sampleRateInHz, featureDim = 80),
            modelConfig = modelConfig,
        )

        offlineRecognizer = SherpaOnnxOffline(
            assetManager = application.assets,
            config = config,
        )
    }

    /** Decodes one speech segment with the non-streaming recognizer. */
    private fun runSecondPass(samples: FloatArray): String {
        return offlineRecognizer.decode(samples, sampleRateInHz)
    }
}
Empty file.
Empty file.
Empty file.
Empty file.
37 changes: 29 additions & 8 deletions android/SherpaOnnxVadAsr/app/src/main/res/layout/activity_main.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,34 @@
android:layout_height="match_parent"
tools:context=".MainActivity">

<TextView
android:layout_width="wrap_content"
android:layout_height="wrap_content"
android:text="Hello World!"
app:layout_constraintBottom_toBottomOf="parent"
app:layout_constraintEnd_toEndOf="parent"
app:layout_constraintStart_toStartOf="parent"
app:layout_constraintTop_toTopOf="parent" />
<LinearLayout
android:layout_width="match_parent"
android:layout_height="match_parent"
android:gravity="center"
android:orientation="vertical">

<TextView
android:id="@+id/my_text"
android:layout_width="match_parent"
android:layout_height="match_parent"
android:layout_weight="2.5"
android:padding="24dp"
android:scrollbars="vertical"
android:singleLine="false"
android:text="@string/hint"
app:layout_constraintBottom_toBottomOf="parent"
app:layout_constraintEnd_toEndOf="parent"
app:layout_constraintStart_toStartOf="parent"
android:gravity="bottom"
app:layout_constraintTop_toTopOf="parent" />

<Button
android:id="@+id/record_button"
android:layout_width="wrap_content"
android:layout_height="wrap_content"
android:layout_weight="0.5"
android:text="@string/start" />
</LinearLayout>


</androidx.constraintlayout.widget.ConstraintLayout>
12 changes: 11 additions & 1 deletion android/SherpaOnnxVadAsr/app/src/main/res/values/strings.xml
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
<resources>
<string name="app_name">SherpaOnnxVadAsr</string>
<string name="app_name">ASR with Next-gen Kaldi</string>
<string name="hint">Click the Start button to play speech-to-text with Next-gen Kaldi.
\n
\n\n\n
The source code and pre-trained models are publicly available.
Please see https://github.com/k2-fsa/sherpa-onnx for details.
\n\n
Speech recognition with Next-gen Kaldi using VAD and non-streaming ASR models.
</string>
<string name="start">Start</string>
<string name="stop">Stop</string>
</resources>

0 comments on commit c244a40

Please sign in to comment.