From a7c00953a3b3152a47d746c9805bbbe64cf94c86 Mon Sep 17 00:00:00 2001
From: canisminor1990
Date: Wed, 15 Nov 2023 00:13:09 +0800
Subject: [PATCH] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20refactor:=20Refactor=20api?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                                      |  22 +--
 api/azure-speech.ts                            |  28 ----
 api/edge-speech.ts                             |  13 ++
 api/microsoft-speech.ts                        |  12 +-
 api/openai-stt.ts                              |  19 +++
 api/openai-tts.ts                              |  19 +++
 package.json                                   |   3 +-
 src/const/api.ts                               |  32 ++--
 src/data/{nameList.ts => voiceList.ts}         |   0
 src/index.ts                                   |  10 +-
 src/server.ts                                  |   7 +-
 src/server/cors.ts                             | 140 ------------------
 src/server/createEdgeSpeechCompletion.ts       | 103 +++++++++++++
 ...t.ts => createMicrosoftSpeechCompletion.ts} |  28 +++-
 .../createOpenaiAudioSpeechCompletion.ts       |  26 ++++
 ...eateOpenaiAudioTranscriptionsCompletion.ts  |  29 ++++
 src/server/getAllowOrigins.ts                  |  15 --
 src/server/handleAzureSpeechRequest.ts         |  52 -------
 src/server/types.ts                            |  58 ++++++++
 src/services/fetchAzureSpeech.ts               |  39 -----
 src/services/fetchEdgeSpeech.ts                | 109 ++------------
 src/services/fetchMicrosoftSpeech.ts           |  32 ++--
 src/services/fetchOpenaiSTT.ts                 |  45 +++---
 src/services/fetchOpenaiTTS.ts                 |  41 ++---
 src/useAzureSpeech/demos/index.tsx             |  84 -----------
 src/useAzureSpeech/index.md                    |  11 --
 src/useAzureSpeech/index.ts                    |  15 --
 src/useEdgeSpeech/demos/index.tsx              |  19 ++-
 src/useEdgeSpeech/index.ts                     |   2 +-
 src/useMicrosoftSpeech/demos/index.tsx         |  34 +++--
 src/useMicrosoftSpeech/index.md                |   2 +-
 src/useMicrosoftSpeech/index.ts                |   2 +-
 src/useOpenaiSTT/demos/index.tsx               |   3 +-
 src/useOpenaiTTS/demos/index.tsx               |   5 +-
 src/useOpenaiTTS/index.ts                      |   2 +-
 src/useSpeechSynthes/demos/index.tsx           |   8 +-
 src/useSpeechSynthes/index.ts                  |   6 +-
 src/utils/genSSML.ts                           |  34 +++--
 src/utils/getVoiceList.ts                      |   6 +-
 39 files changed, 477 insertions(+), 638 deletions(-)
 delete mode 100644 api/azure-speech.ts
 create mode 100644 api/edge-speech.ts
 create mode 100644 api/openai-stt.ts
 create mode 100644 api/openai-tts.ts
 rename src/data/{nameList.ts => voiceList.ts} (100%)
 delete mode 100644 src/server/cors.ts
 create mode 100644 src/server/createEdgeSpeechCompletion.ts
 rename src/server/{handleMicrosoftSpeechRequest.ts => createMicrosoftSpeechCompletion.ts} (55%)
 create mode 100644 src/server/createOpenaiAudioSpeechCompletion.ts
 create mode 100644 src/server/createOpenaiAudioTranscriptionsCompletion.ts
 delete mode 100644 src/server/getAllowOrigins.ts
 delete mode 100644 src/server/handleAzureSpeechRequest.ts
 create mode 100644 src/server/types.ts
 delete mode 100644 src/services/fetchAzureSpeech.ts
 delete mode 100644 src/useAzureSpeech/demos/index.tsx
 delete mode 100644 src/useAzureSpeech/index.md
 delete mode 100644 src/useAzureSpeech/index.ts

diff --git a/README.md b/README.md
index f1c9683..f01621d 100644
--- a/README.md
+++ b/README.md
@@ -86,17 +86,17 @@ Click button below to deploy your private plugins' gateway.
 This project provides some additional configuration items set with environment variables:
 
-| Environment Variable         | Description                                                                                                                                               | Default                     |
-| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------- |
-| `ALLOW_ORIGINS`              | Allow origins , string or string array                                                                                                                    |                             |
-| `OPENAI_API_KEY`             | This is the API key you apply on the OpenAI account page                                                                                                  | `sk-xxxxxx...xxxxxx`        |
-| `OPENAI_PROXY_URL`           | If you manually configure the OpenAI interface proxy, you can use this configuration item to override the default OpenAI API request base URL            | `https://api.openai.com/v1` |
-| `AZURE_SPEECH_KEY`           | This is the API key of Azure Speech Service                                                                                                               |                             |
-| `AZURE_SPEECH_REGION`        | This is the region of Azure Speech Service                                                                                                                |                             |
-| `AZURE_SPEECH_PROXY_URL`     | If you manually configure the AZURE Speech interface proxy, you can use this configuration item to override the default Speech API request base URL      | `/api/azure-speech`         |
-| `MICROSOFT_SPEECH_PROXY_URL` | If you manually configure the Microsoft Speech interface proxy, you can use this configuration item to override the default Speech API request base URL  | `/api/microsoft-speech`     |
-| `EDDGE_API_TOKEN`            | This is the API key of Edge Speech Service                                                                                                                |                             |
-| `EDDGE_PROXY_URL`            | If you manually configure the Edge interface proxy, you can use this configuration item to override the default Edge wss request base URL                |                             |
+| Environment Variable       | Description                                                                                                                                               | Default                     |
+| -------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------- |
+| `ALLOW_ORIGINS`            | Allowed origins, a string or an array of strings                                                                                                          |                             |
+| `OPENAI_API_KEY`           | This is the API key you apply for on the OpenAI account page                                                                                              | `sk-xxxxxx...xxxxxx`        |
+| `OPENAI_PROXY_URL`         | If you manually configure the OpenAI interface proxy, you can use this configuration item to override the default OpenAI API request base URL            | `https://api.openai.com/v1` |
+| `AZURE_SPEECH_KEY`         | This is the API key of Azure Speech Service                                                                                                               |                             |
+| `AZURE_SPEECH_REGION`      | This is the region of Azure Speech Service                                                                                                                |                             |
+| `AZURE_SPEECH_PROXY_URL`   | If you manually configure the Azure Speech interface proxy, you can use this configuration item to override the default Speech API request base URL      | `/api/azure-speech`         |
+| `MICROSOFT_SPEECH_API_URL` | If you manually configure the Microsoft Speech interface proxy, you can use this configuration item to override the default Speech API request base URL  | `/api/microsoft-speech`     |
+| `EDGE_API_TOKEN`           | This is the API key of Edge Speech Service                                                                                                                |                             |
+| `EDGE_SPEECH_API_URL`      | If you manually configure the Edge interface proxy, you can use this configuration item to override the default Edge wss request base URL                |                             |
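The refactored speech routes accept a plain JSON payload over POST and return the synthesized audio as the response body. As a reference, here is a minimal client-side sketch against the default `/api/edge-speech` route; the inline payload type mirrors `EdgeSpeechPayload` from `src/server/types.ts` in this patch, while the helper name and error handling are illustrative assumptions:

```ts
// Hypothetical caller for the refactored edge-speech route (not part of this patch).
// The payload shape mirrors EdgeSpeechPayload from src/server/types.ts.
interface EdgeSpeechPayload {
  input: string;
  options: { voice: string };
}

export const requestEdgeSpeech = async (input: string, voice: string): Promise<ArrayBuffer> => {
  const payload: EdgeSpeechPayload = { input, options: { voice } };
  // '/api/edge-speech' is the default EDGE_SPEECH_API_URL defined in src/const/api.ts.
  const res = await fetch('/api/edge-speech', {
    body: JSON.stringify(payload),
    method: 'POST',
  });
  if (!res.ok) throw new Error(`Edge speech request failed: ${res.status}`);
  return res.arrayBuffer();
};
```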
diff --git a/api/azure-speech.ts b/api/azure-speech.ts
deleted file mode 100644
index f6d90da..0000000
--- a/api/azure-speech.ts
+++ /dev/null
@@ -1,28 +0,0 @@
-// TODO: fix vercel error
-// Error: The Edge Function "api/azure-speech" is referencing unsupported modules:
-//   - https-proxy-agent: net, tls, url
-//   - microsoft-cognitiveservices-speech-sdk: vc-blob-asset:speech-processor.js, fs, net, tls
-
-/*
-import cors from '../src/server/cors';
-import { getAllowOrigins } from '../src/server/getAllowOrigins';
-import { handleAzureSpeechRequest } from '../src/server/handleAzureSpeechRequest';
-
-export const config = {
-  runtime: 'edge',
-};
-
-export default async (req: Request) => {
-  if (req.method !== 'POST') return new Response('Method Not Allowed', { status: 405 });
-  const origin = getAllowOrigins(req);
-  if (!origin) return new Response('Origin Not Allowed', { status: 403 });
-  const res = await handleAzureSpeechRequest(req);
-  return cors(req, res, { methods: ['POST'], origin });
-};
-*/
-
-export default async (req: Request) => {
-  if (req.method !== 'POST') return new Response('Method Not Allowed', { status: 405 });
-
-  return new Response('WIP');
-};
diff --git a/api/edge-speech.ts b/api/edge-speech.ts
new file mode 100644
index 0000000..c4f9639
--- /dev/null
+++ b/api/edge-speech.ts
@@ -0,0 +1,13 @@
+import { createEdgeSpeechCompletion } from '../src/server/createEdgeSpeechCompletion';
+import { EdgeSpeechPayload } from '../src/server/types';
+
+export const config = {
+  runtime: 'edge',
+};
+
+export default async (req: Request) => {
+  if (req.method !== 'POST') return new Response('Method Not Allowed', { status: 405 });
+  const payload = (await req.json()) as EdgeSpeechPayload;
+  const res = await createEdgeSpeechCompletion({ payload });
+  return res;
+};
diff --git a/api/microsoft-speech.ts b/api/microsoft-speech.ts
index 9b62c17..38e3f02 100644
--- a/api/microsoft-speech.ts
+++ b/api/microsoft-speech.ts
@@ -1,6 +1,5 @@
-import cors from '../src/server/cors';
-import { getAllowOrigins } from '../src/server/getAllowOrigins';
-import { handleMicrosoftSpeechRequest } from '../src/server/handleMicrosoftSpeechRequest';
+import { createMicrosoftSpeechCompletion } from '../src/server/createMicrosoftSpeechCompletion';
+import { MicrosoftSpeechPayload } from '../src/server/types';
 
 export const config = {
   runtime: 'edge',
@@ -8,8 +7,7 @@ export const config = {
 
 export default async (req: Request) => {
   if (req.method !== 'POST') return new Response('Method Not Allowed', { status: 405 });
-  const origin = getAllowOrigins(req);
-  if (!origin) return new Response('Origin Not Allowed', { status: 403 });
-  const res = await handleMicrosoftSpeechRequest(req);
-  return cors(req, new Response(res.body, res), { methods: ['POST'], origin });
+  const payload = (await req.json()) as MicrosoftSpeechPayload;
+  const res = await createMicrosoftSpeechCompletion({ payload });
+  return res;
 };
diff --git a/api/openai-stt.ts b/api/openai-stt.ts
new file mode 100644
index 0000000..32ae42b
--- /dev/null
+++ b/api/openai-stt.ts
@@ -0,0 +1,19 @@
+import OpenAI from 'openai';
+
+import { OPENAI_API_KEY, OPENAI_PROXY_URL } from '@/const/api';
+
+import { createOpenaiAudioTranscriptionsCompletion } from '../src/server/createOpenaiAudioTranscriptionsCompletion';
+import { OpenAISTTPayload } from '../src/server/types';
+
+export const config = {
+  runtime: 'edge',
+};
+
+export default async (req: Request) => {
+  if (req.method !== 'POST') return new Response('Method Not Allowed', { status: 405 });
+  const payload = (await
req.json()) as OpenAISTTPayload; + if (!OPENAI_API_KEY) return new Response('OPENAI_API_KEY is not set', { status: 500 }); + const openai = new OpenAI({ apiKey: OPENAI_API_KEY, baseURL: OPENAI_PROXY_URL }); + const res = await createOpenaiAudioTranscriptionsCompletion({ openai, payload }); + return res; +}; diff --git a/api/openai-tts.ts b/api/openai-tts.ts new file mode 100644 index 0000000..88ecbfe --- /dev/null +++ b/api/openai-tts.ts @@ -0,0 +1,19 @@ +import OpenAI from 'openai'; + +import { OPENAI_API_KEY, OPENAI_PROXY_URL } from '@/const/api'; + +import { createOpenaiAudioSpeechCompletion } from '../src/server/createOpenaiAudioSpeechCompletion'; +import { OpenAITTSPayload } from '../src/server/types'; + +export const config = { + runtime: 'edge', +}; + +export default async (req: Request) => { + if (req.method !== 'POST') return new Response('Method Not Allowed', { status: 405 }); + const payload = (await req.json()) as OpenAITTSPayload; + if (!OPENAI_API_KEY) return new Response('OPENAI_API_KEY is not set', { status: 500 }); + const openai = new OpenAI({ apiKey: OPENAI_API_KEY, baseURL: OPENAI_PROXY_URL }); + const res = await createOpenaiAudioSpeechCompletion({ openai, payload }); + return res; +}; diff --git a/package.json b/package.json index c8b5aa4..ca9845d 100644 --- a/package.json +++ b/package.json @@ -66,11 +66,10 @@ "antd-style": "^3", "lodash-es": "^4", "lucide-react": "latest", - "microsoft-cognitiveservices-speech-sdk": "^1", + "openai": "^4.17.3", "query-string": "^8", "react-error-boundary": "^4.0.11", "react-layout-kit": "^1", - "ssml-document": "^1", "swr": "^2", "url-join": "^5", "uuid": "^9" diff --git a/src/const/api.ts b/src/const/api.ts index 89afd3c..ca4fbad 100644 --- a/src/const/api.ts +++ b/src/const/api.ts @@ -1,15 +1,14 @@ -import urlJoin from 'url-join'; - -export const MICROSOFT_SPPECH_URL = +export const MICROSOFT_SPEECH_URL = 'https://southeastasia.api.speech.microsoft.com/accfreetrial/texttospeech/acc/v3.0-beta1/vcg/speak'; -export const MICROSOFT_SPEECH_PROXY_URL = - process.env.MICROSOFT_SPEECH_PROXY_URL || - process.env.NEXT_PUBLIC_MICROSOFT_SPEECH_PROXY_URL || - '/api/microsoft-speech'; -export const AZURE_SPEECH_PROXY_URL = - process.env.AZURE_SPEECH_PROXY_URL || - process.env.NEXT_PUBLIC_AZURE_SPEECH_PROXY_URL || - '/api/azure-speech'; +export const EDGE_SPEECH_URL = + 'wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1'; +export const EDGE_API_TOKEN = '6A5AA1D4EAFF4E9FB37E23D68491D6F4'; + +export const MICROSOFT_SPEECH_API_URL = '/api/microsoft-speech'; +export const EDGE_SPEECH_API_URL = '/api/edge-speech'; +export const OPENAI_TTS_API_URL = '/api/openai-tts'; +export const OPENAI_STT_API_URL = '/api/openai-stt'; + export const AZURE_SPEECH_KEY = process.env.AZURE_SPEECH_KEY || process.env.NEXT_PUBLIC_AZURE_SPEECH_KEY || ''; export const AZURE_SPEECH_REGION = @@ -20,14 +19,3 @@ export const OPENAI_PROXY_URL = process.env.OPENAI_PROXY_URL || process.env.NEXT_PUBLIC_OPENAI_PROXY_URL || 'https://api.openai.com/v1'; -export const OPENAI_TTS_URL = (api?: string) => urlJoin(api || OPENAI_PROXY_URL, 'audio/speech'); -export const OPENAI_STT_URL = (api?: string) => - urlJoin(api || OPENAI_PROXY_URL, 'audio/transcriptions'); -export const EDDGE_PROXY_URL = - process.env.EDDGE_PROXY_URL || - process.env.NEXT_PUBLIC_EDDGE_PROXY_UR || - 'wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1'; -export const EDDGE_API_TOKEN = - process.env.EDDGE_API_TOKEN || - process.env.NEXT_PUBLIC_EDDGE_API_TOKEN || - 
'6A5AA1D4EAFF4E9FB37E23D68491D6F4';
diff --git a/src/data/nameList.ts b/src/data/voiceList.ts
similarity index 100%
rename from src/data/nameList.ts
rename to src/data/voiceList.ts
diff --git a/src/index.ts b/src/index.ts
index 5f2b637..985450d 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -3,19 +3,17 @@ export { default as AudioVisualizer, type AudioVisualizerProps } from './AudioVi
 export { default as azureVoiceList } from './data/azureVoiceList';
 export { default as edgeVoiceList } from './data/edgeVoiceList';
 export { default as voiceLocale } from './data/locales';
-export { default as nameList } from './data/nameList';
 export { default as openaiVoiceList } from './data/openaiVoiceList';
+export { default as voiceList } from './data/voiceList';
 export { useAudioPlayer } from './hooks/useAudioPlayer';
 export { useAudioVisualizer } from './hooks/useAudioVisualizer';
 export { useBlobUrl } from './hooks/useBlobUrl';
 export { useStreamAudioPlayer } from './hooks/useStreamAudioPlayer';
-export { type AzureSpeechOptions, fetchAzureSpeech } from './services/fetchAzureSpeech';
 export { type EdgeSpeechOptions, fetchEdgeSpeech } from './services/fetchEdgeSpeech';
 export { fetchMicrosoftSpeech, type MicrosoftSpeechOptions } from './services/fetchMicrosoftSpeech';
 export { fetchOpenaiSTT, type OpenaiSttOptions } from './services/fetchOpenaiSTT';
 export { fetchOpenaiTTS, type OpenaiTtsOptions } from './services/fetchOpenaiTTS';
 export { useAudioRecorder } from './useAudioRecorder';
-export { useAzureSpeech } from './useAzureSpeech';
 export { useEdgeSpeech } from './useEdgeSpeech';
 export { useMicrosoftSpeech } from './useMicrosoftSpeech';
 export {
@@ -42,3 +40,9 @@ export {
   getSpeechSynthesVoiceOptions,
   getVoiceLocaleOptions,
 } from './utils/getVoiceList';
+export {
+  EDGE_SPEECH_API_URL,
+  MICROSOFT_SPEECH_API_URL,
+  OPENAI_STT_API_URL,
+  OPENAI_TTS_API_URL,
+} from '@/const/api';
diff --git a/src/server.ts b/src/server.ts
index 4c232a9..114b555 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -1,2 +1,5 @@
-export { handleAzureSpeechRequest } from './server/handleAzureSpeechRequest';
-export { handleMicrosoftSpeechRequest } from './server/handleMicrosoftSpeechRequest';
+export { createEdgeSpeechCompletion } from '@/server/createEdgeSpeechCompletion';
+export { createMicrosoftSpeechCompletion } from '@/server/createMicrosoftSpeechCompletion';
+export { createOpenaiAudioSpeechCompletion } from '@/server/createOpenaiAudioSpeechCompletion';
+export { createOpenaiAudioTranscriptionsCompletion } from '@/server/createOpenaiAudioTranscriptionsCompletion';
+export * from '@/server/types';
diff --git a/src/server/cors.ts b/src/server/cors.ts
deleted file mode 100644
index a8be6c5..0000000
--- a/src/server/cors.ts
+++ /dev/null
@@ -1,140 +0,0 @@
-/**
- * Multi purpose CORS lib.
- * Note: Based on the `cors` package in npm but using only
- * web APIs. Feel free to use it in your own projects.
- */
-
-type StaticOrigin = boolean | string | RegExp | (boolean | string | RegExp)[];
-
-type OriginFn = (origin: string | undefined, req: Request) => StaticOrigin | Promise<StaticOrigin>;
-
-interface CorsOptions {
-  allowedHeaders?: string | string[];
-  credentials?: boolean;
-  exposedHeaders?: string | string[];
-  maxAge?: number;
-  methods?: string | string[];
-  optionsSuccessStatus?: number;
-  origin?: StaticOrigin | OriginFn;
-  preflightContinue?: boolean;
-}
-
-const defaultOptions: CorsOptions = {
-  methods: 'GET,HEAD,PUT,PATCH,POST,DELETE',
-  optionsSuccessStatus: 204,
-  origin: '*',
-  preflightContinue: false,
-};
-
-function isOriginAllowed(origin: string, allowed: StaticOrigin): boolean {
-  return Array.isArray(allowed)
-    ? allowed.some((o) => isOriginAllowed(origin, o))
-    : typeof allowed === 'string'
-    ? origin === allowed
-    : allowed instanceof RegExp
-    ? allowed.test(origin)
-    : !!allowed;
-}
-
-function getOriginHeaders(reqOrigin: string | undefined, origin: StaticOrigin) {
-  const headers = new Headers();
-
-  if (origin === '*') {
-    // Allow any origin
-    headers.set('Access-Control-Allow-Origin', '*');
-  } else if (typeof origin === 'string') {
-    // Fixed origin
-    headers.set('Access-Control-Allow-Origin', origin);
-    headers.append('Vary', 'Origin');
-  } else {
-    const allowed = isOriginAllowed(reqOrigin ?? '', origin);
-
-    if (allowed && reqOrigin) {
-      headers.set('Access-Control-Allow-Origin', reqOrigin);
-    }
-    headers.append('Vary', 'Origin');
-  }
-
-  return headers;
-}
-
-// originHeadersFromReq
-
-async function originHeadersFromReq(req: Request, origin: StaticOrigin | OriginFn) {
-  const reqOrigin = req.headers.get('Origin') || undefined;
-  const value = typeof origin === 'function' ? await origin(reqOrigin, req) : origin;
-
-  if (!value) return;
-  return getOriginHeaders(reqOrigin, value);
-}
-
-function getAllowedHeaders(req: Request, allowed?: string | string[]) {
-  const headers = new Headers();
-
-  if (!allowed) {
-    allowed = req.headers.get('Access-Control-Request-Headers')!;
-    headers.append('Vary', 'Access-Control-Request-Headers');
-  } else if (Array.isArray(allowed)) {
-    // If the allowed headers is an array, turn it into a string
-    allowed = allowed.join(',');
-  }
-  if (allowed) {
-    headers.set('Access-Control-Allow-Headers', allowed);
-  }
-
-  return headers;
-}
-
-export default async function cors(req: Request, res: Response, options?: CorsOptions) {
-  const opts = { ...defaultOptions, ...options };
-  const { headers } = res;
-  const originHeaders = await originHeadersFromReq(req, opts.origin ?? false);
-  const mergeHeaders = (v: string, k: string) => {
-    if (k === 'Vary') headers.append(k, v);
-    else headers.set(k, v);
-  };
-
-  // If there's no origin we won't touch the response
-  if (!originHeaders) return res;
-
-  originHeaders.forEach(mergeHeaders);
-
-  if (opts.credentials) {
-    headers.set('Access-Control-Allow-Credentials', 'true');
-  }
-
-  const exposed = Array.isArray(opts.exposedHeaders)
-    ? opts.exposedHeaders.join(',')
-    : opts.exposedHeaders;
-
-  if (exposed) {
-    headers.set('Access-Control-Expose-Headers', exposed);
-  }
-
-  // Handle the preflight request
-  if (req.method === 'OPTIONS') {
-    if (opts.methods) {
-      const methods = Array.isArray(opts.methods) ? opts.methods.join(',') : opts.methods;
-
-      headers.set('Access-Control-Allow-Methods', methods);
-    }
-
-    getAllowedHeaders(req, opts.allowedHeaders).forEach(mergeHeaders);
-
-    if (typeof opts.maxAge === 'number') {
-      headers.set('Access-Control-Max-Age', String(opts.maxAge));
-    }
-
-    if (opts.preflightContinue) return res;
-
-    headers.set('Content-Length', '0');
-    return new Response(null, { headers, status: opts.optionsSuccessStatus });
-  }
-
-  // If we got here, it's a normal request
-  return res;
-}
-
-export function initCors(options?: CorsOptions) {
-  return (req: Request, res: Response) => cors(req, res, options);
-}
diff --git a/src/server/createEdgeSpeechCompletion.ts b/src/server/createEdgeSpeechCompletion.ts
new file mode 100644
index 0000000..c0a478c
--- /dev/null
+++ b/src/server/createEdgeSpeechCompletion.ts
@@ -0,0 +1,103 @@
+import qs from 'query-string';
+import { v4 as uuidv4 } from 'uuid';
+
+import { EDGE_API_TOKEN, EDGE_SPEECH_URL } from '../const/api';
+import { EdgeSpeechPayload } from '../server/types';
+import { genSSML } from '../utils/genSSML';
+import { genSendContent } from '../utils/genSendContent';
+import { getHeadersAndData } from '../utils/getHeadersAndData';
+
+const configContent = JSON.stringify({
+  context: {
+    synthesis: {
+      audio: {
+        metadataoptions: { sentenceBoundaryEnabled: false, wordBoundaryEnabled: true },
+        outputFormat: 'audio-24khz-48kbitrate-mono-mp3',
+      },
+    },
+  },
+});
+
+const genHeader = (connectId: string) => {
+  const date = new Date().toString();
+  const configHeader = {
+    'Content-Type': 'application/json; charset=utf-8',
+    'Path': 'speech.config',
+    'X-Timestamp': date,
+  };
+  const contentHeader = {
+    'Content-Type': 'application/ssml+xml',
+    'Path': 'ssml',
+    'X-RequestId': connectId,
+    'X-Timestamp': date,
+  };
+  return {
+    configHeader,
+    contentHeader,
+  };
+};
+
+interface CreateEdgeSpeechCompletionOptions {
+  payload: EdgeSpeechPayload;
+}
+
+export const createEdgeSpeechCompletion = async ({
+  payload,
+}: CreateEdgeSpeechCompletionOptions): Promise<Response> => {
+  const { input, options } = payload;
+
+  const connectId = uuidv4().replaceAll('-', '');
+  const url = qs.stringifyUrl({
+    query: {
+      ConnectionId: connectId,
+      TrustedClientToken: EDGE_API_TOKEN,
+    },
+    url: EDGE_SPEECH_URL,
+  });
+
+  const { configHeader, contentHeader } = genHeader(connectId);
+  const config = genSendContent(configHeader, configContent);
+  const content = genSendContent(contentHeader, genSSML(input, options));
+
+  return new Promise((resolve, reject) => {
+    const ws = new WebSocket(url);
+    ws.binaryType = 'arraybuffer';
+    const onOpen = () => {
+      ws.send(config);
+      ws.send(content);
+    };
+    let audioData = new ArrayBuffer(0);
+    const onMessage = async (event: MessageEvent) => {
+      if (typeof event.data === 'string') {
+        const { headers } = getHeadersAndData(event.data);
+        switch (headers['Path']) {
+          case 'turn.end': {
+            ws.close();
+            if (!audioData.byteLength) return;
+            const res = new Response(audioData);
+            resolve(res);
+            break;
+          }
+        }
+      } else if (event.data instanceof ArrayBuffer) {
+        const dataview = new DataView(event.data);
+        const headerLength = dataview.getInt16(0);
+        if (event.data.byteLength > headerLength + 2) {
+          const newBody = event.data.slice(2 + headerLength);
+          const newAudioData = new ArrayBuffer(audioData.byteLength + newBody.byteLength);
+          const mergedUint8Array = new Uint8Array(newAudioData);
+          mergedUint8Array.set(new Uint8Array(audioData), 0);
+          mergedUint8Array.set(new Uint8Array(newBody), audioData.byteLength);
+          audioData = newAudioData;
+        }
+      }
+    };
+    const onError = () => {
+      reject(new Error('WebSocket error occurred.'));
+      ws.close();
+    };
+    ws.addEventListener('open', onOpen);
+    ws.addEventListener('message', onMessage);
+    ws.addEventListener('error', onError);
+  });
+};
diff --git a/src/server/handleMicrosoftSpeechRequest.ts b/src/server/createMicrosoftSpeechCompletion.ts
similarity index 55%
rename from src/server/handleMicrosoftSpeechRequest.ts
rename to src/server/createMicrosoftSpeechCompletion.ts
index 1032970..9b2f854 100644
--- a/src/server/handleMicrosoftSpeechRequest.ts
+++ b/src/server/createMicrosoftSpeechCompletion.ts
@@ -1,8 +1,18 @@
 import { v4 as uuidv4 } from 'uuid';
 
-import { MICROSOFT_SPPECH_URL } from '../const/api';
+import { MICROSOFT_SPEECH_URL } from '../const/api';
+import { MicrosoftSpeechPayload } from '../server/types';
+import { genSSML } from '../utils/genSSML';
+
+interface CreateMicrosoftSpeechCompletionOptions {
+  payload: MicrosoftSpeechPayload;
+}
+
+export const createMicrosoftSpeechCompletion = async ({
+  payload,
+}: CreateMicrosoftSpeechCompletionOptions) => {
+  const { input, options } = payload;
 
-export const handleMicrosoftSpeechRequest = async (req: Request, options?: any) => {
   const DEFAULT_HEADERS = new Headers({
     'accept': '*/*',
     'accept-language': 'zh-CN,zh;q=0.9',
@@ -20,13 +30,21 @@ export const handleMicrosoftSpeechRequest = async (req: Request, options?: any)
       'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
   });
 
-  const res = await fetch(MICROSOFT_SPPECH_URL, {
-    body: req.body,
+  const body = JSON.stringify({
+    offsetInPlainText: 0,
+    properties: {
+      SpeakTriggerSource: 'AccTuningPagePlayButton',
+    },
+    ssml: genSSML(input, options),
+    ttsAudioFormat: 'audio-24khz-160kbitrate-mono-mp3',
+  });
+
+  const res = await fetch(MICROSOFT_SPEECH_URL, {
+    body,
     headers: DEFAULT_HEADERS,
     method: 'POST',
     // @ts-ignore
     responseType: 'arraybuffer',
-    ...options,
   });
 
   return res;
diff --git a/src/server/createOpenaiAudioSpeechCompletion.ts b/src/server/createOpenaiAudioSpeechCompletion.ts
new file mode 100644
index 0000000..8b0a1fe
--- /dev/null
+++ b/src/server/createOpenaiAudioSpeechCompletion.ts
@@ -0,0 +1,26 @@
+import OpenAI from 'openai';
+
+import { OpenAITTSPayload } from './types';
+
+interface CreateOpenaiAudioSpeechCompletionOptions {
+  openai: OpenAI;
+  payload: OpenAITTSPayload;
+}
+
+export const createOpenaiAudioSpeechCompletion = async ({
+  payload,
+  openai,
+}: CreateOpenaiAudioSpeechCompletionOptions) => {
+  const { options, input } = payload;
+
+  const response = await openai.audio.speech.create(
+    {
+      input,
+      model: options.model,
+      voice: options.voice,
+    },
+    { headers: { Accept: '*/*' } },
+  );
+
+  return response;
+};
diff --git a/src/server/createOpenaiAudioTranscriptionsCompletion.ts b/src/server/createOpenaiAudioTranscriptionsCompletion.ts
new file mode 100644
index 0000000..7c3ec30
--- /dev/null
+++ b/src/server/createOpenaiAudioTranscriptionsCompletion.ts
@@ -0,0 +1,29 @@
+import OpenAI from 'openai';
+
+import { OpenAISTTPayload } from './types';
+
+interface CreateOpenaiAudioTranscriptionsOptions {
+  openai: OpenAI;
+  payload: OpenAISTTPayload;
+}
+
+export const createOpenaiAudioTranscriptionsCompletion = async ({
+  payload,
+  openai,
+}: CreateOpenaiAudioTranscriptionsOptions) => {
+  const { blob, options } = payload;
+
+  const file = new File([blob], `${Date.now()}.${options.mineType.extension}`, {
+    type: options.mineType.mineType,
+  });
+
+  const response = await openai.audio.transcriptions.create(
+    {
+      file,
+      model: options.model,
+    },
+    { headers: { Accept: '*/*' } },
+  );
+
+  return response.text;
+};
diff --git a/src/server/getAllowOrigins.ts b/src/server/getAllowOrigins.ts
deleted file mode 100644
index 3fccfdf..0000000
--- a/src/server/getAllowOrigins.ts
+++ /dev/null
@@ -1,15 +0,0 @@
-const ALLOW_ORIGINS = process.env?.ALLOW_ORIGINS?.split(',') || undefined;
-
-export const getAllowOrigins = (req: Request) => {
-  let origin = '*';
-
-  if (ALLOW_ORIGINS) {
-    const reqOrigin = req.headers.get('origin');
-    if (reqOrigin && ALLOW_ORIGINS.includes(reqOrigin)) {
-      origin = reqOrigin;
-    } else {
-      return;
-    }
-  }
-  return origin;
-};
diff --git a/src/server/handleAzureSpeechRequest.ts b/src/server/handleAzureSpeechRequest.ts
deleted file mode 100644
index 9c9033f..0000000
--- a/src/server/handleAzureSpeechRequest.ts
+++ /dev/null
@@ -1,52 +0,0 @@
-import {
-  AudioConfig,
-  PropertyId,
-  ResultReason,
-  SpeechConfig,
-  SpeechSynthesisOutputFormat,
-  SpeechSynthesisResult,
-  SpeechSynthesizer,
-} from 'microsoft-cognitiveservices-speech-sdk';
-
-import { AZURE_SPEECH_KEY, AZURE_SPEECH_REGION } from '../const/api';
-
-const fetchAzureSpeech = async (ssml: string, { api }: any): Promise<ArrayBuffer> => {
-  const key = api.key || AZURE_SPEECH_KEY;
-  const region = api.key || AZURE_SPEECH_REGION;
-  const speechConfig = SpeechConfig.fromSubscription(key, region);
-  speechConfig.setProperty(PropertyId.SpeechServiceResponse_RequestSentenceBoundary, 'true');
-  speechConfig.speechSynthesisOutputFormat = SpeechSynthesisOutputFormat.Webm24Khz16BitMonoOpus;
-
-  const audioConfig = AudioConfig.fromDefaultSpeakerOutput();
-  const synthesizer: SpeechSynthesizer | null = new SpeechSynthesizer(speechConfig, audioConfig);
-
-  const completeCb = async (
-    result: SpeechSynthesisResult,
-    resolve: (value: ArrayBuffer) => void,
-  ) => {
-    if (result.reason === ResultReason.SynthesizingAudioCompleted) {
-      const audioData = result.audioData;
-      resolve(audioData);
-    }
-    synthesizer.close();
-  };
-
-  const errCb = (err: string, reject: (err?: any) => void) => {
-    reject(err);
-    synthesizer.close();
-  };
-
-  return new Promise((resolve, reject) => {
-    synthesizer.speakSsmlAsync(
-      ssml,
-      (result) => completeCb(result, resolve),
-      (err) => errCb(err, reject),
-    );
-  });
-};
-
-export const handleAzureSpeechRequest = async (req: Request) => {
-  const { ssml, ...options } = req.body as any;
-  const data = await fetchAzureSpeech(ssml, options);
-  return new Response(data);
-};
diff --git a/src/server/types.ts b/src/server/types.ts
new file mode 100644
index 0000000..4ddfbdf
--- /dev/null
+++ b/src/server/types.ts
@@ -0,0 +1,58 @@
+import { SsmlOptions } from '@/utils/genSSML';
+import { RecordMineType } from '@/utils/getRecordMineType';
+
+export interface MicrosoftSpeechPayload {
+  /**
+   * @title Text to synthesize
+   */
+  input: string;
+  /**
+   * @title SSML options for speech synthesis
+   */
+  options: SsmlOptions;
+}
+
+export interface EdgeSpeechPayload {
+  /**
+   * @title Text to synthesize
+   */
+  input: string;
+  /**
+   * @title SSML options for speech synthesis
+   */
+  options: Pick<SsmlOptions, 'voice'>;
+}
+
+export interface OpenAITTSPayload {
+  /**
+   * @title Text to synthesize
+   */
+  input: string;
+  options: {
+    /**
+     * @title Model used for speech synthesis
+     */
+    model: string;
+    /**
+     * @title Voice used for speech synthesis
+     */
+    voice: 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
+  };
+}
+
+export interface OpenAISTTPayload {
+  /**
+   * @title Audio file for speech recognition
+   */
+  blob: Blob;
+  options: {
+    /**
+     * @title Audio file format
+     */
+    mineType: RecordMineType;
+    /**
+     * @title Model used for speech recognition
+     */
+    model: string;
+  };
+}
diff --git a/src/services/fetchAzureSpeech.ts b/src/services/fetchAzureSpeech.ts
deleted file mode 100644
index ea495a3..0000000
--- a/src/services/fetchAzureSpeech.ts
+++ /dev/null
@@ -1,39 +0,0 @@
-import { AZURE_SPEECH_KEY, AZURE_SPEECH_PROXY_URL, AZURE_SPEECH_REGION } from '@/const/api';
-import { arrayBufferConvert } from '@/utils/arrayBufferConvert';
-import { type SsmlOptions, genSSML } from '@/utils/genSSML';
-
-export interface AzureSpeechOptions extends SsmlOptions {
-  api?: {
-    key?: string;
-    proxy?: string;
-    region?: string;
-  };
-}
-
-export const fetchAzureSpeech = async (
-  text: string,
-  { api = {}, ...options }: AzureSpeechOptions,
-): Promise<AudioBuffer> => {
-  const data = JSON.stringify({
-    api: {
-      key: api?.key || AZURE_SPEECH_KEY,
-      region: api?.region || AZURE_SPEECH_REGION,
-    },
-    ssml: genSSML(text, options),
-  });
-  const url = api?.proxy || AZURE_SPEECH_PROXY_URL;
-
-  const response: Response = await fetch(url, {
-    body: data,
-    method: 'POST',
-    // @ts-ignore
-    responseType: 'arraybuffer',
-  });
-
-  if (!response.ok) {
-    throw new Error('Network response was not ok');
-  }
-
-  const arrayBuffer = await response.arrayBuffer();
-  return await arrayBufferConvert(arrayBuffer);
-};
diff --git a/src/services/fetchEdgeSpeech.ts b/src/services/fetchEdgeSpeech.ts
index 7ab2b50..9c18349 100644
--- a/src/services/fetchEdgeSpeech.ts
+++ b/src/services/fetchEdgeSpeech.ts
@@ -1,104 +1,25 @@
-import qs from 'query-string';
-import { v4 as uuidv4 } from 'uuid';
-
-import { EDDGE_API_TOKEN, EDDGE_PROXY_URL } from '@/const/api';
+import { createEdgeSpeechCompletion } from '@/server/createEdgeSpeechCompletion';
+import { EdgeSpeechPayload } from '@/server/types';
 import { arrayBufferConvert } from '@/utils/arrayBufferConvert';
-import { type SsmlOptions, genSSML } from '@/utils/genSSML';
-import { genSendContent } from '@/utils/genSendContent';
-import { getHeadersAndData } from '@/utils/getHeadersAndData';
-
-const configConent = JSON.stringify({
-  context: {
-    synthesis: {
-      audio: {
-        metadataoptions: { sentenceBoundaryEnabled: false, wordBoundaryEnabled: true },
-        outputFormat: 'audio-24khz-48kbitrate-mono-mp3',
-      },
-    },
-  },
-});
-
-const genHeader = (connectId: string) => {
-  const date = new Date().toString();
-  const configHeader = {
-    'Content-Type': 'application/json; charset=utf-8',
-    'Path': 'speech.config',
-    'X-Timestamp': date,
-  };
-  const contentHeader = {
-    'Content-Type': 'application/ssml+xml',
-    'Path': 'ssml',
-    'X-RequestId': connectId,
-    'X-Timestamp': date,
-  };
-  return {
-    configHeader,
-    contentHeader,
-  };
-};
+import { type SsmlOptions } from '@/utils/genSSML';
 
-export interface EdgeSpeechOptions extends Pick<SsmlOptions, 'name'> {
+export interface EdgeSpeechOptions extends Pick<SsmlOptions, 'voice'> {
   api?: {
-    key?: string;
-    proxy?: string;
+    url?: string;
   };
 }
+
 export const fetchEdgeSpeech = async (
-  text: string,
-  { api = {}, ...options }: EdgeSpeechOptions,
+  input: string,
+  { api, ...options }: EdgeSpeechOptions,
 ): Promise<AudioBuffer> => {
-  const connectId = uuidv4().replaceAll('-', '');
-  const url = qs.stringifyUrl({
-    query: {
-      ConnectionId: connectId,
-      TrustedClientToken: api?.key || EDDGE_API_TOKEN,
-    },
-    url: api?.proxy || EDDGE_PROXY_URL,
-  });
+  const payload: EdgeSpeechPayload = { input, options };
 
-  const { configHeader, contentHeader } = genHeader(connectId);
-  const config = genSendContent(configHeader, configConent);
-  const content = genSendContent(contentHeader, genSSML(text, options));
+  const response = await (api?.url
+    ? fetch(api.url, { body: JSON.stringify(payload), method: 'POST' })
+    : await createEdgeSpeechCompletion({ payload }));
 
-  return new Promise((resolve, reject) => {
-    const ws = new WebSocket(url);
-    ws.binaryType = 'arraybuffer';
-    const onOpen = () => {
-      ws.send(config);
-      ws.send(content);
-    };
-    let audioData = new ArrayBuffer(0);
-    const onMessage = async (event: MessageEvent) => {
-      if (typeof event.data === 'string') {
-        const { headers } = getHeadersAndData(event.data);
-        switch (headers['Path']) {
-          case 'turn.end': {
-            ws.close();
-            if (!audioData.byteLength) return;
-            const audioBuffer = await arrayBufferConvert(audioData);
-            resolve(audioBuffer);
-            break;
-          }
-        }
-      } else if (event.data instanceof ArrayBuffer) {
-        const dataview = new DataView(event.data);
-        const headerLength = dataview.getInt16(0);
-        if (event.data.byteLength > headerLength + 2) {
-          const newBody = event.data.slice(2 + headerLength);
-          const newAudioData = new ArrayBuffer(audioData.byteLength + newBody.byteLength);
-          const mergedUint8Array = new Uint8Array(newAudioData);
-          mergedUint8Array.set(new Uint8Array(audioData), 0);
-          mergedUint8Array.set(new Uint8Array(newBody), audioData.byteLength);
-          audioData = newAudioData;
-        }
-      }
-    };
-    const onError = () => {
-      reject(new Error('WebSocket error occurred.'));
-      ws.close();
-    };
-    ws.addEventListener('open', onOpen);
-    ws.addEventListener('message', onMessage);
-    ws.addEventListener('error', onError);
-  });
+  const arrayBuffer = await response.arrayBuffer();
+  const audioBuffer = await arrayBufferConvert(arrayBuffer);
+  return audioBuffer;
 };
diff --git a/src/services/fetchMicrosoftSpeech.ts b/src/services/fetchMicrosoftSpeech.ts
index 323c80f..4a0594a 100644
--- a/src/services/fetchMicrosoftSpeech.ts
+++ b/src/services/fetchMicrosoftSpeech.ts
@@ -1,39 +1,29 @@
-import { MICROSOFT_SPEECH_PROXY_URL } from '@/const/api';
+import { createMicrosoftSpeechCompletion } from '@/server/createMicrosoftSpeechCompletion';
+import { MicrosoftSpeechPayload } from '@/server/types';
 import { arrayBufferConvert } from '@/utils/arrayBufferConvert';
 import { type SsmlOptions } from '@/utils/genSSML';
-import { genSSML } from '@/utils/genSSML';
 
 export interface MicrosoftSpeechOptions extends SsmlOptions {
   api?: {
-    proxy?: string;
+    url?: string;
   };
 }
 
 export const fetchMicrosoftSpeech = async (
-  text: string,
-  { api = {}, ...options }: MicrosoftSpeechOptions,
+  input: string,
+  { api, ...options }: MicrosoftSpeechOptions,
 ): Promise<AudioBuffer> => {
-  const data = JSON.stringify({
-    offsetInPlainText: 0,
-    properties: {
-      SpeakTriggerSource: 'AccTuningPagePlayButton',
-    },
-    ssml: genSSML(text, options),
-    ttsAudioFormat: 'audio-24khz-160kbitrate-mono-mp3',
-  });
-  const url = api?.proxy || MICROSOFT_SPEECH_PROXY_URL;
+  const payload: MicrosoftSpeechPayload = { input, options };
 
-  const response: Response = await fetch(url, {
-    body: data,
-    method: 'POST',
-    // @ts-ignore
-    responseType: 'arraybuffer',
-  });
+  const response = await (api?.url
+    ? fetch(api.url, { body: JSON.stringify(payload), method: 'POST' })
+    : createMicrosoftSpeechCompletion({ payload }));
 
   if (!response.ok) {
     throw new Error('Network response was not ok');
   }
 
   const arrayBuffer = await response.arrayBuffer();
-  return await arrayBufferConvert(arrayBuffer);
+  const audioBuffer = await arrayBufferConvert(arrayBuffer);
+  return audioBuffer;
 };
diff --git a/src/services/fetchOpenaiSTT.ts b/src/services/fetchOpenaiSTT.ts
index d29e643..4f7a6e4 100644
--- a/src/services/fetchOpenaiSTT.ts
+++ b/src/services/fetchOpenaiSTT.ts
@@ -1,12 +1,15 @@
-import { v4 as uuidv4 } from 'uuid';
+import OpenAI from 'openai';
 
-import { OPENAI_API_KEY, OPENAI_STT_URL } from '@/const/api';
+import { OPENAI_API_KEY, OPENAI_PROXY_URL } from '@/const/api';
+import { createOpenaiAudioTranscriptionsCompletion } from '@/server/createOpenaiAudioTranscriptionsCompletion';
+import { OpenAISTTPayload } from '@/server/types';
 import { RecordMineType, getRecordMineType } from '@/utils/getRecordMineType';
 
 export interface OpenaiSttOptions {
   api?: {
     key?: string;
     proxy?: string;
+    url?: string;
   };
   mineType?: RecordMineType;
   model?: 'whisper-1';
@@ -18,28 +21,22 @@ export const fetchOpenaiSTT = async (
   { api = {}, model = 'whisper-1', mineType }: OpenaiSttOptions,
 ): Promise<string> => {
   const key = api?.key || OPENAI_API_KEY;
-  const url = OPENAI_STT_URL(api?.proxy);
-
-  const headers = new Headers({
-    Authorization: `Bearer ${key}`,
-  });
-
-  const filename = `${uuidv4()}.${mineType?.extension || getRecordMineType().extension}`;
-  const file = new File([speech], filename, {
-    type: mineType?.mineType || getRecordMineType().mineType,
-  });
-
-  const body = new FormData();
-  body.append('file', file);
-  body.append('model', model);
-
-  const response: Response = await fetch(url, { body, headers, method: 'POST' });
-
-  if (!response.ok) {
-    throw new Error('Network response was not ok');
-  }
+  const url = api?.proxy || OPENAI_PROXY_URL;
+
+  const payload: OpenAISTTPayload = {
+    blob: speech,
+    options: {
+      mineType: mineType || getRecordMineType(),
+      model,
+    },
+  };
 
-  const json = await response.json();
+  const response = (await (api?.url
+    ? fetch(api.url, { body: JSON.stringify(payload), method: 'POST' })
+    : createOpenaiAudioTranscriptionsCompletion({
+        openai: new OpenAI({ apiKey: key, baseURL: url }),
+        payload,
+      }))) as string;
 
-  return json?.text;
+  return response;
 };
diff --git a/src/services/fetchOpenaiTTS.ts b/src/services/fetchOpenaiTTS.ts
index 75d6218..9007563 100644
--- a/src/services/fetchOpenaiTTS.ts
+++ b/src/services/fetchOpenaiTTS.ts
@@ -1,36 +1,43 @@
-import { OPENAI_API_KEY, OPENAI_TTS_URL } from '@/const/api';
+import OpenAI from 'openai';
+
+import { OPENAI_API_KEY, OPENAI_PROXY_URL } from '@/const/api';
+import { createOpenaiAudioSpeechCompletion } from '@/server/createOpenaiAudioSpeechCompletion';
+import { OpenAITTSPayload } from '@/server/types';
 import { arrayBufferConvert } from '@/utils/arrayBufferConvert';
 import { type SsmlOptions } from '@/utils/genSSML';
 
 export type OpenaiVoice = 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
 
-export interface OpenaiTtsOptions extends Pick<SsmlOptions, 'name'> {
+export interface OpenaiTtsOptions extends Pick<SsmlOptions, 'voice'>, OpenAITTSPayload {
   api: {
     key?: string;
     proxy?: string;
+    url?: string;
   };
   model?: 'tts-1' | 'tts-1-hd';
-  name: OpenaiVoice | string;
+  voice: OpenaiVoice;
 }
 
 export const fetchOpenaiTTS = async (
-  text: string,
-  { api = {}, model = 'tts-1', ...options }: OpenaiTtsOptions,
+  input: string,
+  { api = {}, model = 'tts-1', voice }: OpenaiTtsOptions,
 ): Promise<AudioBuffer> => {
   const key = api?.key || OPENAI_API_KEY;
-  const url = OPENAI_TTS_URL(api?.proxy);
-
-  const headers = new Headers({
-    'Authorization': `Bearer ${key}`,
-    'Content-Type': 'application/json',
-  });
+  const url = api?.proxy || OPENAI_PROXY_URL;
 
-  const body = JSON.stringify({
-    input: text,
-    model,
-    voice: options.name,
-  });
+  const payload: OpenAITTSPayload = {
+    input,
+    options: {
+      model,
+      voice,
+    },
+  };
 
-  const response: Response = await fetch(url, { body, headers, method: 'POST' });
+  const response = await (api?.url
+    ? fetch(api.url, { body: JSON.stringify(payload), method: 'POST' })
+    : await createOpenaiAudioSpeechCompletion({
+        openai: new OpenAI({ apiKey: key, baseURL: url }),
+        payload,
+      }));
 
   if (!response.ok) {
     throw new Error('Network response was not ok');
diff --git a/src/useAzureSpeech/demos/index.tsx b/src/useAzureSpeech/demos/index.tsx
deleted file mode 100644
index 8d5e7b6..0000000
--- a/src/useAzureSpeech/demos/index.tsx
+++ /dev/null
@@ -1,84 +0,0 @@
-import { AudioPlayer, genLevaOptions, getAzureVoiceOptions, useAzureSpeech } from '@lobehub/tts';
-import { Icon, StoryBook, useControls, useCreateStore } from '@lobehub/ui';
-import { Button, Input } from 'antd';
-import { Volume2 } from 'lucide-react';
-import { Flexbox } from 'react-layout-kit';
-
-const defaultText = '这是一段使用 Azure Speech 的语音演示';
-
-export default () => {
-  const store = useCreateStore();
-
-  const api: any = useControls(
-    {
-      key: {
-        label: 'AZURE_SPEECH_KEY',
-        value: '',
-      },
-      region: {
-        label: 'AZURE_SPEECH_REGION',
-        value: '',
-      },
-    },
-    { store },
-  );
-
-  const options: any = useControls(
-    {
-      name: {
-        options: genLevaOptions(getAzureVoiceOptions()),
-        value: 'zh-CN-YunxiaNeural',
-      },
-      pitch: {
-        max: 1,
-        min: -1,
-        step: 0.1,
-        value: 0,
-      },
-      rate: {
-        max: 1,
-        min: -1,
-        step: 0.1,
-        value: 0,
-      },
-      style: {
-        options: [
-          'affectionate',
-          'angry',
-          'calm',
-          'cheerful',
-          'disgruntled',
-          'embarrassed',
-          'fearful',
-          'general',
-          'gentle',
-          'sad',
-          'serious',
-        ],
-        value: 'general',
-      },
-    },
-    { store },
-  );
-  const { setText, isGlobalLoading, audio, start, stop } = useAzureSpeech(defaultText, {
-    api,
-    ...options,
-  });
-  return (
-    <StoryBook levaStore={store}>
-      <Flexbox gap={8}>
-        {isGlobalLoading ? (
-          <Button block loading size={'large'} type={'primary'} />
-        ) : (
-          <Button block icon={<Icon icon={Volume2} />} onClick={start} size={'large'} type={'primary'} />
-        )}
-        <Input.TextArea defaultValue={defaultText} onChange={(e) => setText(e.target.value)} />
-        <AudioPlayer audio={audio} onInitPlay={start} onStop={stop} />
-      </Flexbox>
-    </StoryBook>
-  );
-};
diff --git a/src/useAzureSpeech/index.md b/src/useAzureSpeech/index.md
deleted file mode 100644
index 1bbc032..0000000
--- a/src/useAzureSpeech/index.md
+++ /dev/null
@@ -1,11 +0,0 @@
----
-nav: Components
-group: TTS
-title: useAzureSpeech
----
-
-## hooks
-
-- ENV: `AZURE_SPEECH_KEY` `AZURE_SPEECH_REGION`
-
-<code src="./demos/index.tsx" nopadding></code>
diff --git a/src/useAzureSpeech/index.ts b/src/useAzureSpeech/index.ts
deleted file mode 100644
index 6e29ae5..0000000
--- a/src/useAzureSpeech/index.ts
+++ /dev/null
@@ -1,15 +0,0 @@
-import { useState } from 'react';
-
-import { AzureSpeechOptions, fetchAzureSpeech } from '@/services/fetchAzureSpeech';
-import { useTTS } from '@/useTTS';
-
-export const useAzureSpeech = (defaultText: string, options: AzureSpeechOptions) => {
-  const [text, setText] = useState(defaultText);
-  const rest = useTTS(options.name, text, (segmentText: string) =>
-    fetchAzureSpeech(segmentText, options),
-  );
-  return {
-    setText,
-    ...rest,
-  };
-};
diff --git a/src/useEdgeSpeech/demos/index.tsx b/src/useEdgeSpeech/demos/index.tsx
index dae1ef8..f44e544 100644
--- a/src/useEdgeSpeech/demos/index.tsx
+++ b/src/useEdgeSpeech/demos/index.tsx
@@ -1,4 +1,10 @@
-import { AudioPlayer, genLevaOptions, getEdgeVoiceOptions, useEdgeSpeech } from '@lobehub/tts';
+import {
+  AudioPlayer,
+  EDGE_SPEECH_API_URL,
+  genLevaOptions,
+  getEdgeVoiceOptions,
+  useEdgeSpeech,
+} from '@lobehub/tts';
 import { Icon, StoryBook, useControls, useCreateStore } from '@lobehub/ui';
 import { Button, Input } from 'antd';
 import { Volume2 } from 'lucide-react';
@@ -11,21 +17,14 @@ export default () => {
 
   const api: any = useControls(
     {
-      key: {
-        label: 'EDDGE_API_TOKEN',
-        value: '',
-      },
-      proxy: {
-        label: 'EDDGE_PROXY_URL',
-        value: '',
-      },
+      url: EDGE_SPEECH_API_URL,
     },
     { store },
   );
 
   const options: any = useControls(
     {
-      name: {
+      voice: {
         options: genLevaOptions(getEdgeVoiceOptions()),
         value: 'zh-CN-YunxiaNeural',
       },
diff --git a/src/useEdgeSpeech/index.ts b/src/useEdgeSpeech/index.ts
index ff2f7fc..6909b06 100644
--- a/src/useEdgeSpeech/index.ts
+++ b/src/useEdgeSpeech/index.ts
@@ -5,7 +5,7 @@ import { useTTS } from '@/useTTS';
 
 export const useEdgeSpeech = (defaultText: string, options: EdgeSpeechOptions) => {
   const [text, setText] = useState(defaultText);
-  const rest = useTTS(options.name, text, (segmentText: string) =>
+  const rest = useTTS(options.voice, text, (segmentText: string) =>
     fetchEdgeSpeech(segmentText, options),
   );
   return {
diff --git a/src/useMicrosoftSpeech/demos/index.tsx b/src/useMicrosoftSpeech/demos/index.tsx
index ac31eba..24cce4f 100644
--- a/src/useMicrosoftSpeech/demos/index.tsx
+++ b/src/useMicrosoftSpeech/demos/index.tsx
@@ -1,4 +1,10 @@
-import { AudioPlayer, genLevaOptions, getEdgeVoiceOptions, useMicrosoftSpeech } from '@lobehub/tts';
+import {
+  AudioPlayer,
+  MICROSOFT_SPEECH_API_URL,
+  genLevaOptions,
+  getEdgeVoiceOptions,
+  useMicrosoftSpeech,
+} from '@lobehub/tts';
 import { Icon, StoryBook, useControls, useCreateStore } from '@lobehub/ui';
 import { Button, Input } from 'antd';
 import { Volume2 } from 'lucide-react';
@@ -8,16 +14,17 @@ const defaultText = '这是一段使用 Microsoft Speech 的语音演示';
 
 export default () => {
   const store = useCreateStore();
-  const options: any = useControls(
+  const api: any = useControls(
     {
-      api: {
-        label: 'MICROSOFT_SPEECH_PROXY_URL',
-        value: '',
-      },
-      name: {
-        options: genLevaOptions(getEdgeVoiceOptions()),
-        value: 'zh-CN-YunxiaNeural',
+      url: {
+        label: 'MICROSOFT_SPEECH_API_URL',
+        value: MICROSOFT_SPEECH_API_URL,
       },
+    },
+    { store },
+  );
+  const options: any = useControls(
+    {
       pitch: {
         max: 1,
         min: -1,
@@ -46,10 +53,17 @@ export default () => {
       ],
       value: 'general',
     },
+    voice: {
+      options: genLevaOptions(getEdgeVoiceOptions()),
+      value: 'zh-CN-YunxiaNeural',
+    },
   },
   { store },
 );
-  const { setText, isGlobalLoading, audio, start, stop } = useMicrosoftSpeech(defaultText, options);
+  const { setText, isGlobalLoading, audio, start, stop } = useMicrosoftSpeech(defaultText, {
+    api,
+    ...options,
+  });
   return (
diff --git a/src/useMicrosoftSpeech/index.md b/src/useMicrosoftSpeech/index.md
index 8ab707e..b431580 100644
--- a/src/useMicrosoftSpeech/index.md
+++ b/src/useMicrosoftSpeech/index.md
@@ -6,6 +6,6 @@ title: useMicrosoftSpeech
 
 ## hooks
 
-- ENV: `MICROSOFT_SPEECH_PROXY_URL`
+- ENV: `MICROSOFT_SPEECH_API_URL`
 
 <code src="./demos/index.tsx" nopadding></code>
diff --git a/src/useMicrosoftSpeech/index.ts b/src/useMicrosoftSpeech/index.ts
index 637d2c6..3e2e150 100644
--- a/src/useMicrosoftSpeech/index.ts
+++ b/src/useMicrosoftSpeech/index.ts
@@ -5,7 +5,7 @@ import { useTTS } from '@/useTTS';
 
 export const useMicrosoftSpeech = (defaultText: string, options: MicrosoftSpeechOptions) => {
   const [text, setText] = useState(defaultText);
-  const rest = useTTS(options.name, text, (segmentText: string) =>
+  const rest = useTTS(options.voice, text, (segmentText: string) =>
     fetchMicrosoftSpeech(segmentText, options),
   );
   return {
diff --git a/src/useOpenaiSTT/demos/index.tsx b/src/useOpenaiSTT/demos/index.tsx
index e638a02..3b05f8e 100644
--- a/src/useOpenaiSTT/demos/index.tsx
+++ b/src/useOpenaiSTT/demos/index.tsx
@@ -1,4 +1,4 @@
-import { useOpenaiSTTWithRecord } from '@lobehub/tts';
+import { OPENAI_STT_API_URL, useOpenaiSTTWithRecord } from '@lobehub/tts';
 import { Icon, StoryBook, useControls, useCreateStore } from '@lobehub/ui';
 import { Button, Input } from 'antd';
 import { Mic,
StopCircle } from 'lucide-react'; @@ -16,6 +16,7 @@ export default () => { label: 'OPENAI_PROXY_URL', value: '', }, + url: OPENAI_STT_API_URL, }, { store }, ); diff --git a/src/useOpenaiTTS/demos/index.tsx b/src/useOpenaiTTS/demos/index.tsx index b07e534..cfbc24b 100644 --- a/src/useOpenaiTTS/demos/index.tsx +++ b/src/useOpenaiTTS/demos/index.tsx @@ -1,4 +1,4 @@ -import { AudioPlayer, openaiVoiceList, useOpenaiTTS } from '@lobehub/tts'; +import { AudioPlayer, OPENAI_TTS_API_URL, openaiVoiceList, useOpenaiTTS } from '@lobehub/tts'; import { Icon, StoryBook, useControls, useCreateStore } from '@lobehub/ui'; import { Button, Input } from 'antd'; import { Volume2 } from 'lucide-react'; @@ -19,13 +19,14 @@ export default () => { label: 'OPENAI_PROXY_URL', value: '', }, + url: OPENAI_TTS_API_URL, }, { store }, ); const options: any = useControls( { - name: { + voice: { options: openaiVoiceList, value: 'alloy', }, diff --git a/src/useOpenaiTTS/index.ts b/src/useOpenaiTTS/index.ts index d04cc68..c88a78b 100644 --- a/src/useOpenaiTTS/index.ts +++ b/src/useOpenaiTTS/index.ts @@ -5,7 +5,7 @@ import { useTTS } from '@/useTTS'; export const useOpenaiTTS = (defaultText: string, options: OpenaiTtsOptions) => { const [text, setText] = useState(defaultText); - const rest = useTTS(options.name, text, (segmentText: string) => + const rest = useTTS(options.voice, text, (segmentText: string) => fetchOpenaiTTS(segmentText, options), ); return { diff --git a/src/useSpeechSynthes/demos/index.tsx b/src/useSpeechSynthes/demos/index.tsx index 162c9d6..f1406d5 100644 --- a/src/useSpeechSynthes/demos/index.tsx +++ b/src/useSpeechSynthes/demos/index.tsx @@ -10,10 +10,6 @@ export default () => { const store = useCreateStore(); const options: any = useControls( { - name: { - options: genLevaOptions(getSpeechSynthesVoiceOptions()), - value: '婷婷', - }, pitch: { max: 1, min: -1, @@ -26,6 +22,10 @@ export default () => { step: 0.1, value: 0, }, + voice: { + options: genLevaOptions(getSpeechSynthesVoiceOptions()), + value: '婷婷', + }, }, { store }, ); diff --git a/src/useSpeechSynthes/index.ts b/src/useSpeechSynthes/index.ts index e34cea2..af9b347 100644 --- a/src/useSpeechSynthes/index.ts +++ b/src/useSpeechSynthes/index.ts @@ -2,18 +2,18 @@ import { useCallback, useMemo, useState } from 'react'; import { SsmlOptions } from '@/utils/genSSML'; -export const useSpeechSynthes = (defaultText: string, { name, rate, pitch }: SsmlOptions) => { +export const useSpeechSynthes = (defaultText: string, { voice, rate, pitch }: SsmlOptions) => { const [voiceList, setVoiceList] = useState(speechSynthesis.getVoices()); const [text, setText] = useState(defaultText); const [isLoading, setIsLoading] = useState(false); const speechSynthesisUtterance = useMemo(() => { const utterance = new SpeechSynthesisUtterance(text); - utterance.voice = voiceList.find((item) => item.name === name) as any; + utterance.voice = voiceList.find((item) => item.name === voice) as any; if (pitch) utterance.pitch = pitch * 10; if (rate) utterance.rate = rate * 10; return utterance; - }, [text, voiceList, rate, pitch, name]); + }, [text, voiceList, rate, pitch, voice]); speechSynthesis.onvoiceschanged = () => { setVoiceList(speechSynthesis.getVoices()); diff --git a/src/utils/genSSML.ts b/src/utils/genSSML.ts index 1e0bc1e..57dd52a 100644 --- a/src/utils/genSSML.ts +++ b/src/utils/genSSML.ts @@ -1,5 +1,3 @@ -import { Document, ServiceProvider } from 'ssml-document'; - export type StyleName = | 'affectionate' | 'angry' @@ -14,16 +12,34 @@ export type StyleName = | 
'serious';
 
 export interface SsmlOptions {
-  name: string;
   pitch?: number;
   rate?: number;
   style?: StyleName;
+  voice: string;
 }
 
-export const genSSML = (text: string, options: SsmlOptions) => {
-  let ssml = new Document().voice(options.name);
-  if (options.style) ssml.expressAs({ style: options.style });
-  if (options.pitch || options.rate) ssml.prosody({ pitch: options.pitch, rate: options.rate });
-  const result = ssml.say(text).render({ provider: ServiceProvider.Microsoft });
-  return `${result}`;
+const voiceTemplate = (input: string, { voice }: Pick<SsmlOptions, 'voice'>) =>
+  `<voice name="${voice}">${input}</voice>`;
+
+const styleTemplate = (input: string, { style }: Pick<SsmlOptions, 'style'>) => {
+  if (!style) return input;
+  return `<mstts:express-as style="${style}">${input}</mstts:express-as>`;
+};
+
+const prosodyTemplate = (input: string, { pitch, rate }: Pick<SsmlOptions, 'pitch' | 'rate'>) => {
+  if (!pitch && !rate) return input;
+  return `<prosody pitch="${pitch || 0}" rate="${rate || 0}">${input}</prosody>`;
+};
+const speakTemplate = (input: string) =>
+  `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">${input}</speak>`;
+
+export const genSSML = (input: string, options: SsmlOptions) => {
+  let ssml = prosodyTemplate(input, options);
+  ssml = styleTemplate(ssml, options);
+  ssml = voiceTemplate(ssml, options);
+  ssml = speakTemplate(ssml);
+
+  return ssml;
 };
diff --git a/src/utils/getVoiceList.ts b/src/utils/getVoiceList.ts
index 841b040..9cd789b 100644
--- a/src/utils/getVoiceList.ts
+++ b/src/utils/getVoiceList.ts
@@ -4,9 +4,9 @@ import { flatten } from 'lodash-es';
 import azureVoiceList from '@/data/azureVoiceList';
 import edgeVoiceList from '@/data/edgeVoiceList';
 import voiceLocale from '@/data/locales';
-import nameList from '@/data/nameList';
 import openaiVoiceList from '@/data/openaiVoiceList';
 import speechSynthesVoiceList from '@/data/speechSynthesVoiceList';
+import voiceList from '@/data/voiceList';
 
 export const genSpeechSynthesVoiceList = () => {
   const data = speechSynthesis.getVoices();
@@ -38,7 +38,7 @@ export const getAzureVoiceOptions = (locale?: string): SelectProps['options'] =>
       ? (azureVoiceList as any)?.[locale] || []
       : flatten(Object.values(azureVoiceList));
 
-  return data.map((voice: any) => ({ label: (nameList as any)?.[voice] || voice, value: voice }));
+  return data.map((voice: any) => ({ label: (voiceList as any)?.[voice] || voice, value: voice }));
 };
 
 export const getEdgeVoiceOptions = (locale?: string): SelectProps['options'] => {
   const data =
     locale && (edgeVoiceList as any)[locale]
       ? (edgeVoiceList as any)[locale] || []
       : flatten(Object.values(edgeVoiceList));
-  return data.map((voice: any) => ({ label: (nameList as any)?.[voice] || voice, value: voice }));
+  return data.map((voice: any) => ({ label: (voiceList as any)?.[voice] || voice, value: voice }));
 };
 
 export const getOpenaiVoiceOptions = (): SelectProps['options'] => {
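For clarity, a short usage sketch of the template-based `genSSML` above. The tag nesting follows the template order in this patch (prosody innermost, then express-as, then voice, then the speak root); the attribute formatting comes from the reconstruction above, so treat the exact output as approximate:

```ts
import { genSSML } from '@/utils/genSSML';

// prosody is applied innermost, then mstts:express-as, then voice, then the speak root.
const ssml = genSSML('你好', { pitch: 0.1, style: 'cheerful', voice: 'zh-CN-YunxiaNeural' });
// => <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
//      xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
//      <voice name="zh-CN-YunxiaNeural"><mstts:express-as style="cheerful">
//        <prosody pitch="0.1" rate="0">你好</prosody>
//      </mstts:express-as></voice></speak>
```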