Skip to content

Commit

Permalink
Speed up the ASR, without transcode.
Browse files Browse the repository at this point in the history
  • Loading branch information
winlinvip committed Jan 27, 2024
1 parent 3a540cc commit 77a1a78
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 19 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -213,5 +213,10 @@ The changelog:
* Allow the user to retry on error. v1.0.34
* Refine the handling of bad cases in user input. v1.0.35
* Fix bug for setting window for robot. [v1.0.36](https://github.com/ossrs/ai-talk/releases/tag/v1.0.36)
* Support setting up an API proxy and key for ASR, Chat, and TTS. v1.0.37
* Support Tencent Speech to speed up. v1.0.37
* Support share logging text mode. v1.0.38
* Fix some bad cases when determining sentences. [v1.0.39](https://github.com/ossrs/ai-talk/releases/tag/v1.0.39)
* Speed up the ASR by avoiding audio transcoding. v1.0.40

Winlin, 2023.12
31 changes: 12 additions & 19 deletions backend/openai.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,29 +80,22 @@ func NewOpenAIASRService(opts ...func(service *openaiASRService)) ASRService {
}

func (v *openaiASRService) RequestASR(ctx context.Context, inputFile, language, prompt string, onBeforeRequest func()) (*ASRResult, error) {
outputFile := fmt.Sprintf("%v.m4a", inputFile)

// Transcode input audio in opus or aac, to aac in m4a format.
outputFile := fmt.Sprintf("%v.mp4", inputFile)
if os.Getenv("AIT_KEEP_FILES") != "true" {
defer os.Remove(outputFile)
}
if true {
err := exec.CommandContext(ctx, "ffmpeg",
"-i", inputFile,
"-vn", "-c:a", "aac", "-ac", "1", "-ar", "16000", "-ab", "50k",
outputFile,
).Run()

if err != nil {
return nil, errors.Errorf("Error converting the file")
}
logger.Tf(ctx, "Convert audio %v to %v ok", inputFile, outputFile)
}

duration, _, err := ffprobeAudio(ctx, outputFile)
if err != nil {
return nil, errors.Wrapf(err, "ffprobe")
// Transcode input audio in opus or aac, to aac in m4a format.
// If need to encode to aac, use:
// "-c:a", "aac", "-ac", "1", "-ar", "16000", "-ab", "30k",
if err := exec.CommandContext(ctx, "ffmpeg",
"-i", inputFile,
"-vn", "-c:a", "copy",
outputFile,
).Run(); err != nil {
return nil, errors.Errorf("Error converting the file")
}
logger.Tf(ctx, "Convert audio %v to %v ok", inputFile, outputFile)

if onBeforeRequest != nil {
onBeforeRequest()
Expand All @@ -124,7 +117,7 @@ func (v *openaiASRService) RequestASR(ctx context.Context, inputFile, language,
return nil, errors.Wrapf(err, "asr")
}

return &ASRResult{Text: resp.Text, Duration: time.Duration(duration * float64(time.Second))}, nil
return &ASRResult{Text: resp.Text, Duration: time.Duration(resp.Duration * float64(time.Second))}, nil
}

func ffprobeAudio(ctx context.Context, filename string) (duration float64, bitrate int, err error) {
Expand Down

0 comments on commit 77a1a78

Please sign in to comment.