-
Notifications
You must be signed in to change notification settings - Fork 470
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add VAD + Non-streaming ASR model for Android
- Loading branch information
1 parent
5e1a924
commit c244a40
Showing
15 changed files
with
278 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,3 +31,4 @@ google-services.json | |
|
||
# Android Profiling | ||
*.hprof | ||
*.so |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
*.onnx |
204 changes: 203 additions & 1 deletion
204
android/SherpaOnnxVadAsr/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,213 @@ | ||
package com.k2fsa.sherpa.onnx | ||
|
||
import androidx.appcompat.app.AppCompatActivity | ||
import android.Manifest | ||
import android.content.pm.PackageManager | ||
import android.media.AudioFormat | ||
import android.media.AudioRecord | ||
import android.media.MediaRecorder | ||
import android.os.Bundle | ||
import android.text.method.ScrollingMovementMethod | ||
import android.util.Log | ||
import android.view.View | ||
import android.widget.Button | ||
import android.widget.TextView | ||
import androidx.appcompat.app.AppCompatActivity | ||
import androidx.core.app.ActivityCompat | ||
import kotlin.concurrent.thread | ||
|
||
|
||
// Tag used for every logcat message emitted by this file.
private const val TAG = "sherpa-onnx"

// Request code passed to requestPermissions() and matched in onRequestPermissionsResult().
private const val REQUEST_RECORD_AUDIO_PERMISSION = 200

/**
 * Single-screen demo: records from the microphone, feeds the audio to a voice
 * activity detector ([Vad]) and decodes every detected speech segment with a
 * non-streaming recognizer ([SherpaOnnxOffline]). The numbered transcripts are
 * appended to a scrollable text view.
 *
 * Threading: audio capture and decoding run on [recordingThread]; the UI thread
 * only starts/stops recording and displays text handed over via [runOnUiThread].
 */
class MainActivity : AppCompatActivity() {

    private lateinit var recordButton: Button
    private lateinit var textView: TextView

    private lateinit var vad: Vad

    private var audioRecord: AudioRecord? = null
    private var recordingThread: Thread? = null
    private val audioSource = MediaRecorder.AudioSource.MIC
    private val sampleRateInHz = 16000
    private val channelConfig = AudioFormat.CHANNEL_IN_MONO

    // Note: We don't use AudioFormat.ENCODING_PCM_FLOAT
    // since the AudioRecord.read(float[]) needs API level >= 23
    // but we are targeting API level >= 21
    private val audioFormat = AudioFormat.ENCODING_PCM_16BIT

    private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)

    // Non-streaming ASR
    private lateinit var offlineRecognizer: SherpaOnnxOffline

    // Index of the next recognized segment and the transcript accumulated so far.
    // While recording, both are touched only by the recording thread.
    private var idx: Int = 0
    private var lastText: String = ""

    // Written by the UI thread, polled by the recording thread.
    @Volatile
    private var isRecording: Boolean = false

    /**
     * Closes the activity unless the RECORD_AUDIO request was granted.
     *
     * A cancelled permission dialog delivers an empty [grantResults]; that case
     * is treated as a denial instead of indexing into the empty array.
     */
    override fun onRequestPermissionsResult(
        requestCode: Int, permissions: Array<String>, grantResults: IntArray
    ) {
        super.onRequestPermissionsResult(requestCode, permissions, grantResults)
        val permissionToRecordAccepted = requestCode == REQUEST_RECORD_AUDIO_PERMISSION &&
            grantResults.firstOrNull() == PackageManager.PERMISSION_GRANTED

        if (!permissionToRecordAccepted) {
            Log.e(TAG, "Audio record is disallowed")
            finish()
            // Do not fall through to the "permitted" log line below.
            return
        }

        Log.i(TAG, "Audio record is permitted")
    }

    /** Requests the microphone permission, loads both models and wires up the views. */
    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        setContentView(R.layout.activity_main)

        ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)

        Log.i(TAG, "Start to initialize model")
        initVadModel()
        Log.i(TAG, "Finished initializing model")

        Log.i(TAG, "Start to initialize non-streaming recognizer")
        initOfflineRecognizer()
        Log.i(TAG, "Finished initializing non-streaming recognizer")

        recordButton = findViewById(R.id.record_button)
        recordButton.setOnClickListener { onclick() }

        textView = findViewById(R.id.my_text)
        textView.movementMethod = ScrollingMovementMethod()
    }

    /** Makes sure the microphone and the recording thread do not outlive the activity. */
    override fun onDestroy() {
        stopRecording()
        super.onDestroy()
    }

    /** Toggles recording: starts a fresh session or stops the running one. */
    private fun onclick() {
        if (isRecording) {
            stopRecording()
            recordButton.setText(R.string.start)
            Log.i(TAG, "Stopped recording")
            return
        }

        if (!initMicrophone()) {
            Log.e(TAG, "Failed to initialize microphone")
            return
        }
        val recorder = checkNotNull(audioRecord) {
            "initMicrophone() reported success without creating an AudioRecord"
        }
        Log.i(TAG, "state: ${recorder.state}")
        recorder.startRecording()
        recordButton.setText(R.string.stop)
        isRecording = true

        // Start every session with an empty transcript.
        textView.text = ""
        lastText = ""
        idx = 0

        vad.reset()
        recordingThread = thread(true) {
            processSamples()
        }
        Log.i(TAG, "Started recording")
    }

    /**
     * Stops the current recording session, if any, and frees the microphone.
     *
     * The recorder is released only after the recording thread has exited, so
     * the thread can never call read() on a released AudioRecord. Safe to call
     * when nothing is being recorded.
     */
    private fun stopRecording() {
        isRecording = false

        val recorder = audioRecord
        // Stop capturing before waiting so the worker is not left blocked in read().
        recorder?.stop()

        // The worker only posts to the UI thread (never waits on it), so joining
        // here cannot deadlock; it may briefly wait for an in-flight decode.
        recordingThread?.join()
        recordingThread = null

        recorder?.release()
        audioRecord = null
    }

    /** Creates [vad] from the model configuration selected by the hard-coded type. */
    private fun initVadModel() {
        val type = 0
        Log.i(TAG, "Select VAD model type ${type}")
        val config = requireNotNull(getVadModelConfig(type)) {
            "No VAD model config is defined for type $type"
        }

        vad = Vad(
            assetManager = application.assets,
            config = config,
        )
    }

    /**
     * Creates [audioRecord] for 16 kHz mono 16-bit PCM capture.
     *
     * @return `true` when a usable recorder was created; `false` when the
     * permission is missing (a new request is issued) or the recorder could
     * not be set up.
     */
    private fun initMicrophone(): Boolean {
        if (ActivityCompat.checkSelfPermission(
                this, Manifest.permission.RECORD_AUDIO
            ) != PackageManager.PERMISSION_GRANTED
        ) {
            ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)
            return false
        }

        val minBufferSizeInBytes =
            AudioRecord.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat)
        if (minBufferSizeInBytes <= 0) {
            // getMinBufferSize() reports failures as negative error codes.
            Log.e(TAG, "Failed to query the minimum buffer size: $minBufferSizeInBytes")
            return false
        }

        // getMinBufferSize() already returns bytes; allocate twice the minimum
        // to leave headroom while a speech segment is being decoded.
        val bufferSizeInBytes = minBufferSizeInBytes * 2
        val bytesPerSample = 2 // 16-bit PCM, mono
        Log.i(
            TAG,
            "buffer size in milliseconds: ${bufferSizeInBytes / bytesPerSample * 1000.0f / sampleRateInHz}"
        )

        val recorder = AudioRecord(
            audioSource,
            sampleRateInHz,
            channelConfig,
            audioFormat,
            bufferSizeInBytes,
        )
        if (recorder.state != AudioRecord.STATE_INITIALIZED) {
            // startRecording() would throw on an uninitialized recorder.
            Log.e(TAG, "AudioRecord failed to initialize, state: ${recorder.state}")
            recorder.release()
            return false
        }

        audioRecord = recorder
        return true
    }

    /**
     * Body of the recording thread: reads microphone audio, runs it through
     * [vad] and decodes every completed speech segment until [isRecording]
     * becomes false or the recorder reports an error.
     */
    private fun processSamples() {
        Log.i(TAG, "processing samples")

        val recorder = audioRecord ?: return
        val bufferSize = 512 // in samples
        val buffer = ShortArray(bufferSize)

        while (isRecording) {
            val numRead = recorder.read(buffer, 0, buffer.size)
            if (numRead < 0) {
                // Negative values are AudioRecord error codes; retrying would only spin.
                Log.e(TAG, "AudioRecord.read() failed with error code $numRead")
                break
            }
            if (numRead == 0) continue

            // Scale 16-bit PCM to floats in [-1, 1).
            val samples = FloatArray(numRead) { buffer[it] / 32768.0f }
            vad.acceptWaveform(samples)

            var textChanged = false
            while (!vad.empty()) {
                val segment = vad.front()
                // Element 1 of the segment holds its audio samples.
                val segmentSamples = segment[1] as FloatArray
                val text = runSecondPass(segmentSamples)

                if (text.isNotBlank()) {
                    lastText = "${lastText}\n${idx}: ${text}"
                    idx += 1
                    textChanged = true
                }

                vad.pop()
            }

            if (textChanged) {
                // Hand the UI thread an immutable snapshot rather than letting it
                // read lastText while this thread keeps appending to it.
                val display = lastText.lowercase()
                runOnUiThread {
                    textView.text = display
                }
            }
        }
    }

    /** Creates [offlineRecognizer] from the model configuration selected by the hard-coded type. */
    private fun initOfflineRecognizer() {
        // Please change getOfflineModelConfig() to add new models
        // See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
        // for a list of available models
        val secondType = 0
        Log.i(TAG, "Select model type ${secondType} for the second pass")

        val modelConfig = requireNotNull(getOfflineModelConfig(type = secondType)) {
            "No offline model config is defined for type $secondType"
        }
        val config = OfflineRecognizerConfig(
            featConfig = getFeatureConfig(sampleRate = sampleRateInHz, featureDim = 80),
            modelConfig = modelConfig,
        )

        offlineRecognizer = SherpaOnnxOffline(
            assetManager = application.assets,
            config = config,
        )
    }

    /** Decodes one speech segment with the non-streaming recognizer and returns its text. */
    private fun runSecondPass(samples: FloatArray): String =
        offlineRecognizer.decode(samples, sampleRateInHz)
}
1 change: 1 addition & 0 deletions
1
android/SherpaOnnxVadAsr/app/src/main/java/com/k2fsa/sherpa/onnx/SherpaOnnx.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../../../../../../../../SherpaOnnx2Pass/app/src/main/java/com/k2fsa/sherpa/onnx/SherpaOnnx.kt |
1 change: 1 addition & 0 deletions
1
android/SherpaOnnxVadAsr/app/src/main/java/com/k2fsa/sherpa/onnx/Vad.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../../../../../../../../SherpaOnnxVad/app/src/main/java/com/k2fsa/sherpa/onnx/Vad.kt |
Empty file.
Empty file.
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
12 changes: 11 additions & 1 deletion
12
android/SherpaOnnxVadAsr/app/src/main/res/values/strings.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,13 @@ | ||
<resources>
    <!-- Application name; declared once, since a resource name may not be defined twice. -->
    <string name="app_name">ASR with Next-gen Kaldi</string>
    <!-- Introductory hint text. -->
    <string name="hint">Click the Start button to play speech-to-text with Next-gen Kaldi.
        \n
        \n\n\n
        The source code and pre-trained models are publicly available.
        Please see https://github.com/k2-fsa/sherpa-onnx for details.
        \n\n
        Speech recognition with Next-gen Kaldi using VAD and non-streaming ASR models.
    </string>
    <!-- Labels of the record button in its two states. -->
    <string name="start">Start</string>
    <string name="stop">Stop</string>
</resources>