diff --git a/.gitignore b/.gitignore
index 5fc9aabb..93e8d09f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -88,7 +88,7 @@ ipython_config.py
 # pyenv
 # For a library or package, you might want to ignore these files since the code is
 # intended to run in multiple environments; otherwise, check them in:
-# .python-version
+.python-version
 
 # pipenv
 # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
diff --git a/README.rst b/README.rst
index 337f6873..69beebcf 100644
--- a/README.rst
+++ b/README.rst
@@ -97,7 +97,7 @@ To use all of the functionality of the library, you should have:
 * **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X)
 * **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``)
 * **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``)
-* **openai** (required only if you need to use OpenAI Whisper API speech recognition ``recognizer_instance.recognize_whisper_api``)
+* **openai** (required only if you need to use OpenAI Whisper API speech recognition ``recognizer_instance.recognize_openai``)
 * **groq** (required only if you need to use Groq Whisper API speech recognition ``recognizer_instance.recognize_groq``)
 
 The following requirements are optional, but can improve or extend functionality in some situations:
@@ -176,9 +176,9 @@
 OpenAI Whisper API (for OpenAI Whisper API users)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The library `openai <https://pypi.org/project/openai/>`__ is **required if and only if you want to use OpenAI Whisper API** (``recognizer_instance.recognize_whisper_api``).
+The library `openai <https://pypi.org/project/openai/>`__ is **required if and only if you want to use OpenAI Whisper API** (``recognizer_instance.recognize_openai``).
 
-If not installed, everything in the library will still work, except calling ``recognizer_instance.recognize_whisper_api`` will raise an ``RequestError``.
+If not installed, everything in the library will still work, except calling ``recognizer_instance.recognize_openai`` will raise an ``RequestError``.
 
 You can install it with ``python3 -m pip install SpeechRecognition[whisper-api]``.
 
diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py
index a5d8a688..38ef95bd 100644
--- a/examples/microphone_recognition.py
+++ b/examples/microphone_recognition.py
@@ -96,6 +96,6 @@
 # recognize speech using Whisper API
 OPENAI_API_KEY = "INSERT OPENAI API KEY HERE"
 try:
-    print(f"Whisper API thinks you said {r.recognize_whisper_api(audio, api_key=OPENAI_API_KEY)}")
+    print(f"Whisper API thinks you said {r.recognize_openai(audio, api_key=OPENAI_API_KEY)}")
 except sr.RequestError as e:
     print(f"Could not request results from Whisper API; {e}")
diff --git a/reference/library-reference.rst b/reference/library-reference.rst
index e8b6c7e0..e245e819 100644
--- a/reference/library-reference.rst
+++ b/reference/library-reference.rst
@@ -314,8 +314,8 @@
 You can translate the result to english with Whisper by passing translate=True
 
 Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options
 
-``recognizer_instance.recognize_whisper_api(audio_data: AudioData, model: str = "whisper-1", api_key: str | None = None)``
---------------------------------------------------------------------------------------------------------------------------
+``recognizer_instance.recognize_openai(audio_data: AudioData, model: str = "whisper-1", api_key: str | None = None)``
+---------------------------------------------------------------------------------------------------------------------
 
 Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API.
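For orientation, the documentation changes above amount to a pure rename on the caller's side. A minimal end-to-end sketch of the renamed call, adapted from the updated ``examples/microphone_recognition.py``; the WAV path is a placeholder, and the key may instead come from the ``OPENAI_API_KEY`` environment variable:

```python
import speech_recognition as sr

r = sr.Recognizer()
# Placeholder input; any file format supported by sr.AudioFile works here.
with sr.AudioFile("audio.wav") as source:
    audio = r.record(source)

OPENAI_API_KEY = "INSERT OPENAI API KEY HERE"
try:
    # Renamed from r.recognize_whisper_api; the arguments are unchanged.
    print(f"Whisper API thinks you said {r.recognize_openai(audio, api_key=OPENAI_API_KEY)}")
except sr.RequestError as e:
    print(f"Could not request results from Whisper API; {e}")
```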
diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
index 238d5e50..94345ccb 100644
--- a/speech_recognition/__init__.py
+++ b/speech_recognition/__init__.py
@@ -1506,12 +1506,13 @@ def flush(self, *args, **kwargs):
 
 # At this time, the dependencies are not yet installed, resulting in a ModuleNotFoundError.
 # This is a workaround to resolve this issue
 try:
-    from .recognizers import google, groq, whisper
+    from .recognizers import google, openai, groq
 except (ModuleNotFoundError, ImportError):
     pass
 else:
     Recognizer.recognize_google = google.recognize_legacy
-    Recognizer.recognize_whisper_api = whisper.recognize_whisper_api
+    Recognizer.recognize_openai = openai.recognize
+    Recognizer.recognize_whisper_api = openai.recognize  # Deprecated
     Recognizer.recognize_groq = groq.recognize_groq
 
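After this hunk, the old and new method names are bound to the same function object, so the deprecated spelling keeps working during a transition period. A minimal sketch of what the aliasing implies, assuming the package from this branch is installed:

```python
import speech_recognition as sr

r = sr.Recognizer()
# Both attributes resolve to speech_recognition.recognizers.openai.recognize,
# so the deprecated name is a pure alias, not a separate code path.
assert r.recognize_openai.__func__ is r.recognize_whisper_api.__func__
```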
""" - if not isinstance(audio_data, AudioData): - raise ValueError("``audio_data`` must be an ``AudioData`` instance") if api_key is None and os.environ.get("OPENAI_API_KEY") is None: raise SetupError("Set environment variable ``OPENAI_API_KEY``") @@ -35,9 +56,5 @@ def recognize_whisper_api( "missing openai module: ensure that openai is set up correctly." ) - wav_data = BytesIO(audio_data.get_wav_data()) - wav_data.name = "SpeechRecognition_audio.wav" - - client = openai.OpenAI(api_key=api_key) - transcript = client.audio.transcriptions.create(file=wav_data, model=model) - return transcript.text + recognizer = OpenAICompatibleRecognizer(openai.OpenAI(api_key=api_key)) + return recognizer.recognize(audio_data, model, **kwargs) diff --git a/tests/recognizers/test_openai.py b/tests/recognizers/test_openai.py new file mode 100644 index 00000000..21c2b04e --- /dev/null +++ b/tests/recognizers/test_openai.py @@ -0,0 +1,31 @@ +from unittest.mock import MagicMock + +import httpx +import respx + +from speech_recognition import AudioData, Recognizer +from speech_recognition.recognizers import openai + + +@respx.mock(assert_all_called=True, assert_all_mocked=True) +def test_transcribe_with_openai_whisper(respx_mock, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "sk_openai_api_key") + + respx_mock.post( + "https://api.openai.com/v1/audio/transcriptions", + headers__contains={"Authorization": "Bearer sk_openai_api_key"}, + data__contains={"model": "whisper-1"}, + ).mock( + return_value=httpx.Response( + 200, + json={"text": "Transcription by OpenAI Whisper"}, + ) + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"audio_data" + + actual = openai.recognize(MagicMock(spec=Recognizer), audio_data) + + assert actual == "Transcription by OpenAI Whisper" + audio_data.get_wav_data.assert_called_once() diff --git a/tests/recognizers/test_whisper.py b/tests/recognizers/test_whisper.py deleted file mode 100644 index e84d0503..00000000 --- a/tests/recognizers/test_whisper.py +++ /dev/null @@ -1,42 +0,0 @@ -from unittest import TestCase -from unittest.mock import MagicMock, patch - -from speech_recognition import AudioData, Recognizer -from speech_recognition.recognizers import whisper - - -@patch("speech_recognition.recognizers.whisper.os.environ") -@patch("speech_recognition.recognizers.whisper.BytesIO") -@patch("openai.OpenAI") -class RecognizeWhisperApiTestCase(TestCase): - def test_recognize_default_arguments(self, OpenAI, BytesIO, environ): - client = OpenAI.return_value - transcript = client.audio.transcriptions.create.return_value - - recognizer = MagicMock(spec=Recognizer) - audio_data = MagicMock(spec=AudioData) - - actual = whisper.recognize_whisper_api(recognizer, audio_data) - - self.assertEqual(actual, transcript.text) - audio_data.get_wav_data.assert_called_once_with() - BytesIO.assert_called_once_with(audio_data.get_wav_data.return_value) - OpenAI.assert_called_once_with(api_key=None) - client.audio.transcriptions.create.assert_called_once_with( - file=BytesIO.return_value, model="whisper-1" - ) - - def test_recognize_pass_arguments(self, OpenAI, BytesIO, environ): - client = OpenAI.return_value - - recognizer = MagicMock(spec=Recognizer) - audio_data = MagicMock(spec=AudioData) - - _ = whisper.recognize_whisper_api( - recognizer, audio_data, model="x-whisper", api_key="OPENAI_API_KEY" - ) - - OpenAI.assert_called_once_with(api_key="OPENAI_API_KEY") - client.audio.transcriptions.create.assert_called_once_with( - file=BytesIO.return_value, 
model="x-whisper" - )