clean up docstrings: WhisperTranscribers (#8235)
* clarify docstrings

* Apply suggestions from code review

Co-authored-by: Agnieszka Marzec <[email protected]>

---------

Co-authored-by: Agnieszka Marzec <[email protected]>
dfokina and agnieszka-m authored Aug 16, 2024
1 parent bbe18cf commit 35b1215
Showing 2 changed files with 40 additions and 33 deletions.
27 changes: 15 additions & 12 deletions haystack/components/audio/whisper_local.py
@@ -35,13 +35,14 @@
@component
class LocalWhisperTranscriber:
"""
Transcribes audio files using OpenAI's Whisper model in your local machine.
Transcribes audio files using OpenAI's Whisper model on your local machine.
For the supported audio formats, languages, and other parameters, see the
[Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
[github repository](https://github.com/openai/whisper).
[GitHub repository](https://github.com/openai/whisper).
### Usage example
Usage example:
```python
from haystack.components.audio import LocalWhisperTranscriber
@@ -61,11 +62,12 @@ def __init__(
Creates an instance of the LocalWhisperTranscriber component.
:param model:
Name of the model to use. Set it to one of the following values:
:type model:
Literal["tiny", "small", "medium", "large", "large-v2"]
The name of the model to use. Set to one of the following models:
"tiny", "base", "small", "medium", "large" (default).
For details on the models and their modifications, see the
[Whisper documentation](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages).
:param device:
The device on which the model is loaded. If `None`, the default device is automatically selected.
The device for loading the model. If `None`, automatically selects the default device.
"""
whisper_import.check()
if model not in get_args(WhisperLocalModel):
@@ -111,19 +113,20 @@ def from_dict(cls, data: Dict[str, Any]) -> "LocalWhisperTranscriber":
@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]], whisper_params: Optional[Dict[str, Any]] = None):
"""
Transcribes the audio files into a list of Documents, one for each input file.
Transcribes a list of audio files into a list of documents.
For the supported audio formats, languages, and other parameters, see the
[Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
[github repo](https://github.com/openai/whisper).
[GitHub repo](https://github.com/openai/whisper).
:param sources:
A list of paths or binary streams to transcribe.
:returns: A dictionary with the following keys:
- `documents`: A list of Documents, one for each file. The content of the document is the transcription
text, while the document's metadata contains the values returned by the Whisper model, such as the
alignment data and the path to the audio file used for the transcription.
- `documents`: A list of documents where each document is a transcribed audio file. The content of
the document is the transcription text, and the document's metadata contains the values returned by
the Whisper model, such as the alignment data and the path to the audio file used
for the transcription.
"""
if self._model is None:
raise RuntimeError(
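For reference, a minimal usage sketch that matches the updated docstring (the embedded example in the diff is truncated): it assumes the component follows Haystack's usual `warm_up()`/`run()` pattern, and the audio path is a placeholder.

```python
from haystack.components.audio import LocalWhisperTranscriber

# One of the documented model sizes: "tiny", "base", "small", "medium", "large" (default).
transcriber = LocalWhisperTranscriber(model="small")
transcriber.warm_up()  # loads the Whisper model onto the selected device

# `sources` accepts file paths or ByteStreams; extra Whisper options can go in `whisper_params`.
result = transcriber.run(sources=["path/to/audio_file.mp3"])
for doc in result["documents"]:
    print(doc.content)  # the transcription text
    print(doc.meta)     # Whisper metadata, e.g. alignment data and the audio file path
```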
46 changes: 25 additions & 21 deletions haystack/components/audio/whisper_remote.py
@@ -18,14 +18,15 @@
@component
class RemoteWhisperTranscriber:
"""
Transcribes audio files using the Whisper API from OpenAI.
Transcribes audio files using OpenAI's Whisper API.
The component requires an API key, see the relative
The component requires an OpenAI API key, see the
[OpenAI documentation](https://platform.openai.com/docs/api-reference/authentication) for more details.
For the supported audio formats, languages, and other parameters, see the
[Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text)
[Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text).
### Usage example
Usage example:
```python
from haystack.components.audio import RemoteWhisperTranscriber
@@ -47,31 +48,33 @@ def __init__(
:param api_key:
OpenAI API key.
You can set it with an environment variable `OPENAI_API_KEY`, or pass with this parameter
during initialization.
:param model:
Name of the model to use. It now accepts only `whisper-1`.
Name of the model to use. Currently accepts only `whisper-1`.
:param organization:
The Organization ID. See
[production best practices](https://platform.openai.com/docs/guides/production-best-practices/setting-up-your-organization).
Your OpenAI organization ID. See OpenAI's documentation on
[Setting Up Your Organization](https://platform.openai.com/docs/guides/production-best-practices/setting-up-your-organization).
:param api_base:
An optional URL to use as the API base. See OpenAI [docs](https://platform.openai.com/docs/api-reference/audio).
An optional URL to use as the API base. For details, see the
OpenAI [documentation](https://platform.openai.com/docs/api-reference/audio).
:param kwargs:
Other parameters to use for the model. These parameters are all sent directly to the OpenAI
Other optional parameters for the model. These are sent directly to the OpenAI
endpoint. See OpenAI [documentation](https://platform.openai.com/docs/api-reference/audio) for more details.
Some of the supported parameters:
Some of the supported parameters are:
- `language`: The language of the input audio.
Supplying the input language in ISO-639-1 format
will improve accuracy and latency.
Provide the input language in ISO-639-1 format
to improve transcription accuracy and latency.
- `prompt`: An optional text to guide the model's
style or continue a previous audio segment.
The prompt should match the audio language.
- `response_format`: The format of the transcript
output, in one of these options: json, text, srt,
verbose_json, or vtt. Defaults to "json". Currently only "json" is supported.
output. This component only supports `json`.
- `temperature`: The sampling temperature, between 0
and 1. Higher values like 0.8 will make the output more
random, while lower values like 0.2 will make it more
focused and deterministic. If set to 0, the model will
use log probability to automatically increase the
and 1. Higher values like 0.8 make the output more
random, while lower values like 0.2 make it more
focused and deterministic. If set to 0, the model
uses log probability to automatically increase the
temperature until certain thresholds are hit.
"""

@@ -123,13 +126,14 @@ def from_dict(cls, data: Dict[str, Any]) -> "RemoteWhisperTranscriber":
@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]]):
"""
Transcribes the audio files into a list of Documents, one for each input file.
Transcribes the list of audio files into a list of documents.
:param sources:
A list of file paths or ByteStreams containing the audio files to transcribe.
A list of file paths or `ByteStream` objects containing the audio files to transcribe.
:returns: A dictionary with the following keys:
- `documents`: A list of Documents, one for each file. The content of the document is the transcribed text.
- `documents`: A list of documents, one document for each file.
The content of each document is the transcribed text.
"""
documents = []

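Likewise, a minimal sketch for `RemoteWhisperTranscriber` based on the docstring shown above; it assumes the `OPENAI_API_KEY` environment variable is set and that extra keyword arguments such as `language` are forwarded to the OpenAI endpoint as described. The file path is a placeholder.

```python
from haystack.components.audio import RemoteWhisperTranscriber

# Reads the API key from the OPENAI_API_KEY environment variable by default.
# Extra keyword arguments (e.g. `language` in ISO-639-1 format) are sent to the OpenAI endpoint.
transcriber = RemoteWhisperTranscriber(model="whisper-1", language="en")

result = transcriber.run(sources=["path/to/audio_file.mp3"])
print(result["documents"][0].content)  # the transcribed text
```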
