From 77a1a786695fe47d30a498b9a160e3914bab2235 Mon Sep 17 00:00:00 2001 From: winlin Date: Sat, 27 Jan 2024 09:07:33 +0800 Subject: [PATCH] Speed up the ASR, without transcode. --- README.md | 5 +++++ backend/openai.go | 31 ++++++++++++------------------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 1e3293a..dabd045 100644 --- a/README.md +++ b/README.md @@ -213,5 +213,10 @@ The changelog: * Allow user retry when error. v1.0.34 * Refine badcase for user input. v1.0.35 * Fix bug for setting window for robot. [v1.0.36](https://github.com/ossrs/ai-talk/releases/tag/v1.0.36) +* Support setup API proxy and key for ASR,Chat,TTS. v1.0.37 +* Support Tencent Speech to speed up. v1.0.37 +* Support share logging text mode. v1.0.38 +* Fix some badcase for sentence determine. [v1.0.39](https://github.com/ossrs/ai-talk/releases/tag/v1.0.39) +* Speed up the ASR, without transcode. v1.0.40 Winlin, 2023.12 diff --git a/backend/openai.go b/backend/openai.go index 4b87547..33d2d60 100644 --- a/backend/openai.go +++ b/backend/openai.go @@ -80,29 +80,22 @@ func NewOpenAIASRService(opts ...func(service *openaiASRService)) ASRService { } func (v *openaiASRService) RequestASR(ctx context.Context, inputFile, language, prompt string, onBeforeRequest func()) (*ASRResult, error) { - outputFile := fmt.Sprintf("%v.m4a", inputFile) - - // Transcode input audio in opus or aac, to aac in m4a format. 
+ outputFile := fmt.Sprintf("%v.mp4", inputFile) if os.Getenv("AIT_KEEP_FILES") != "true" { defer os.Remove(outputFile) } - if true { - err := exec.CommandContext(ctx, "ffmpeg", - "-i", inputFile, - "-vn", "-c:a", "aac", "-ac", "1", "-ar", "16000", "-ab", "50k", - outputFile, - ).Run() - - if err != nil { - return nil, errors.Errorf("Error converting the file") - } - logger.Tf(ctx, "Convert audio %v to %v ok", inputFile, outputFile) - } - duration, _, err := ffprobeAudio(ctx, outputFile) - if err != nil { - return nil, errors.Wrapf(err, "ffprobe") + // Copy the input audio stream (opus or aac) into an mp4 container, without transcoding. + // To re-encode to aac instead of copying, use: + // "-c:a", "aac", "-ac", "1", "-ar", "16000", "-ab", "30k", + if err := exec.CommandContext(ctx, "ffmpeg", + "-i", inputFile, + "-vn", "-c:a", "copy", + outputFile, + ).Run(); err != nil { + return nil, errors.Errorf("Error converting the file") } + logger.Tf(ctx, "Convert audio %v to %v ok", inputFile, outputFile) if onBeforeRequest != nil { onBeforeRequest() @@ -124,7 +117,7 @@ func (v *openaiASRService) RequestASR(ctx context.Context, inputFile, language, return nil, errors.Wrapf(err, "asr") } - return &ASRResult{Text: resp.Text, Duration: time.Duration(duration * float64(time.Second))}, nil + return &ASRResult{Text: resp.Text, Duration: time.Duration(resp.Duration * float64(time.Second))}, nil } func ffprobeAudio(ctx context.Context, filename string) (duration float64, bitrate int, err error) {