diff --git a/backend/main.go b/backend/main.go index 49dcc69..947e7da 100644 --- a/backend/main.go +++ b/backend/main.go @@ -529,6 +529,27 @@ func handleStageStart(ctx context.Context, w http.ResponseWriter, r *http.Reques return nil } +func handleStartConversation(ctx context.Context, w http.ResponseWriter, r *http.Request) error { + // The stage uuid (sid), user must create the stage before starting a conversation. + q := r.URL.Query() + sid := q.Get("sid") + if sid == "" { + return errors.Errorf("empty sid") + } + + stage := talkServer.QueryStage(sid) + if stage == nil { + return errors.Errorf("invalid sid %v", sid) + } + + // Keep alive the stage, and record the current time as the stage's lastSentence. + stage.KeepAlive() + stage.lastSentence = time.Now() + + ohttp.WriteData(ctx, w, r, nil) + return nil +} + // When user ask a question, which is a request with audio, which is identified by rid (request id). func handleUploadQuestionAudio(ctx context.Context, w http.ResponseWriter, r *http.Request) error { // The stage uuid, user must create it before upload question audio. @@ -545,7 +566,6 @@ func handleUploadQuestionAudio(ctx context.Context, w http.ResponseWriter, r *ht // Keep alive the stage. stage.KeepAlive() - stage.lastSentence = time.Now() // Switch to the context of stage. ctx = stage.loggingCtx @@ -611,8 +631,8 @@ func handleUploadQuestionAudio(ctx context.Context, w http.ResponseWriter, r *ht stage.lastAsrDuration = resp.Duration stage.lastRequestAsrText = asrText } - logger.Tf(ctx, "ASR ok, robot=%v(%v), lang=%v, prompt=<%v>, resp is <%v>", - robot.uuid, robot.label, robot.asrLanguage, stage.previousAsrText, asrText) + logger.Tf(ctx, "ASR ok, robot=%v(%v), lang=%v, speech=%v, prompt=<%v>, resp is <%v>", + robot.uuid, robot.label, robot.asrLanguage, stage.lastAsrDuration, stage.previousAsrText, asrText) // Important trace log. 
logger.Tf(ctx, "You: %v", asrText) @@ -929,6 +949,13 @@ func doMain(ctx context.Context) error { } }) + handler.HandleFunc("/api/ai-talk/conversation/", func(w http.ResponseWriter, r *http.Request) { + if err := handleStartConversation(ctx, w, r); err != nil { + logger.Ef(ctx, "Handle start conversation failed, err %+v", err) + http.Error(w, err.Error(), http.StatusInternalServerError) + } + }) + handler.HandleFunc("/api/ai-talk/upload/", func(w http.ResponseWriter, r *http.Request) { if err := handleUploadQuestionAudio(ctx, w, r); err != nil { logger.Ef(ctx, "Handle audio failed, err %+v", err) diff --git a/backend/openai.go b/backend/openai.go index 33d2d60..f1425a1 100644 --- a/backend/openai.go +++ b/backend/openai.go @@ -108,7 +108,8 @@ func (v *openaiASRService) RequestASR(ctx context.Context, inputFile, language, openai.AudioRequest{ Model: os.Getenv("AIT_ASR_MODEL"), FilePath: outputFile, - Format: openai.AudioResponseFormatJSON, + // Note that we must use verbose JSON to get the duration of the file. + Format: openai.AudioResponseFormatVerboseJSON, Language: language, Prompt: prompt, }, diff --git a/src/App.js b/src/App.js index 2924235..d9d95d0 100644 --- a/src/App.js +++ b/src/App.js @@ -98,6 +98,18 @@ function AppImpl({info, verbose, robot, robotReady, stageUUID, playerRef}) { if (!robotReady) return; const processUserInput = async(userMayInput) => { + // Notify the server before uploading the audio, so it can measure the elapsed time accurately. + await new Promise((resolve, reject) => { + fetch(`/api/ai-talk/conversation/?sid=${stageUUID}&robot=${robot.uuid}&umi=${userMayInput}`, { + method: 'POST', + }).then(response => { + return response.json(); + }).then((data) => { + verbose(`TTS: Conversation started`); + resolve(); + }).catch(error => reject(error)); + }); + // Upload the user input audio to the server. const requestUUID = await new Promise((resolve, reject) => { verbose(`ASR: Uploading ${ref.current.audioChunks.length} chunks, robot=${robot.uuid}`);