From f7fc7f28a9f0a2da605ad6a395312359c35fb36e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl?= Date: Tue, 3 Dec 2024 21:18:31 +0000 Subject: [PATCH 1/6] feat(audio): integrate audio transfromers --- outlines/generate/api.py | 91 +++++++++++ outlines/generate/cfg.py | 11 +- outlines/generate/fsm.py | 12 +- outlines/generate/regex.py | 15 +- outlines/generate/text.py | 8 +- outlines/models/__init__.py | 1 + outlines/models/transformers_audio.py | 136 ++++++++++++++++ pyproject.toml | 4 +- tests/generate/test_api.py | 30 +++- tests/generate/test_generate.py | 37 ++++- .../test_integration_transformers_audio.py | 152 ++++++++++++++++++ 11 files changed, 490 insertions(+), 7 deletions(-) create mode 100644 outlines/models/transformers_audio.py create mode 100644 tests/generate/test_integration_transformers_audio.py diff --git a/outlines/generate/api.py b/outlines/generate/api.py index 4919f2090..a248162b1 100644 --- a/outlines/generate/api.py +++ b/outlines/generate/api.py @@ -621,3 +621,94 @@ def valid_types(prompts, media): ) return prompts, media + + +class AudioSequenceGeneratorAdapter(SequenceGeneratorAdapter): + def __call__( # type: ignore + self, + prompts: Union[str, List[str]], + media: Union[str, Any], + max_tokens: Optional[int] = None, + stop_at: Optional[Union[str, List[str]]] = None, + seed: Optional[int] = None, + **model_specific_params, + ): + """ + Generate text from a prompt or list of prompts. + + Media: A URI to construct media or media object itself. Used as AutoProcessor argument. + """ + prompts, media = self._validate_prompt_media_types(prompts, media) + + generation_params = self.prepare_generation_parameters( + max_tokens, stop_at, seed + ) + + completions = self.model.generate( + prompts, + media, + generation_params, + copy(self.logits_processor), + self.sampling_params, + **model_specific_params, + ) + + return self._format(completions) + + def stream( # type: ignore + self, + prompts: Union[str, List[str]], + media: List[Union[str, Any, List[Union[str, Any]]]], + max_tokens: Optional[int] = None, + stop_at: Optional[Union[str, List[str]]] = None, + seed: Optional[int] = None, + **model_specific_params, + ): + """Return a text generator from a prompt or a list of prompts.""" + prompts, media = self._validate_prompt_media_types(prompts, media) + generation_params = self.prepare_generation_parameters( + max_tokens, stop_at, seed + ) + return self.model.stream( + prompts, + media, + generation_params, + copy(self.logits_processor), + self.sampling_params, + **model_specific_params, + ) + + @classmethod + def _validate_prompt_media_types( + cls, + prompts: Union[str, List[str]], + media: Union[str, Any, List[Union[str, Any]]], + ) -> Union[Any, List[Any]]: + """ + Prepare media as np.ndarray and ensure for every prompt str there is one List[PIL.Image] + """ + + def valid_types(prompts, media): + import numpy as np # type: ignore + + if isinstance(prompts, list): + if not isinstance(media, list) or len(prompts) != len(media): + return False + for subprompt, submedia in zip(prompts, media): + if not isinstance(subprompt, str) or not all( + isinstance(m, np.ndarray) for m in submedia + ): + return False + elif isinstance(prompts, str): + if not all(isinstance(m, np.ndarray) for m in media): + return False + return True + + if not valid_types(prompts, media): + raise TypeError( + "Expected (prompts, media) to be of type " + "(str, List[np.ndarray])), or (List[str], List[List[np.ndarray]]) " + f"instead got prompts={prompts}, media={media}" + ) + + return prompts, 
media diff --git a/outlines/generate/cfg.py b/outlines/generate/cfg.py index b677040d5..0d11b9c76 100644 --- a/outlines/generate/cfg.py +++ b/outlines/generate/cfg.py @@ -1,10 +1,11 @@ from functools import singledispatch from outlines.generate.api import ( + AudioSequenceGeneratorAdapter, SequenceGeneratorAdapter, VisionSequenceGeneratorAdapter, ) -from outlines.models import LlamaCpp, OpenAI, TransformersVision +from outlines.models import LlamaCpp, OpenAI, TransformersAudio, TransformersVision from outlines.samplers import Sampler, multinomial @@ -33,6 +34,14 @@ def cfg( return SequenceGeneratorAdapter(model, logits_processor, sampler) +@cfg.register(TransformersAudio) +def cfg_audio(model, cfg_str: str, sampler: Sampler = multinomial()): + from outlines.processors import CFGLogitsProcessor + + logits_processor = CFGLogitsProcessor(cfg_str, tokenizer=model.tokenizer) + return AudioSequenceGeneratorAdapter(model, logits_processor, sampler) + + @cfg.register(TransformersVision) def cfg_vision(model, cfg_str: str, sampler: Sampler = multinomial()): from outlines.processors import CFGLogitsProcessor diff --git a/outlines/generate/fsm.py b/outlines/generate/fsm.py index 1950812d2..a73bd5aaa 100644 --- a/outlines/generate/fsm.py +++ b/outlines/generate/fsm.py @@ -4,10 +4,11 @@ from outlines.fsm.guide import RegexGuide from outlines.generate.api import ( + AudioSequenceGeneratorAdapter, SequenceGeneratorAdapter, VisionSequenceGeneratorAdapter, ) -from outlines.models import TransformersVision +from outlines.models import TransformersAudio, TransformersVision from outlines.samplers import Sampler, multinomial @@ -22,6 +23,15 @@ def fsm( return SequenceGeneratorAdapter(model, logits_processor, sampler) +@fsm.register(TransformersAudio) +def fsm_audio(model, fsm: interegular.fsm.FSM, sampler: Sampler = multinomial()): + from outlines.processors import GuideLogitsProcessor + + guide = RegexGuide.from_interegular_fsm(fsm, model.tokenizer) + logits_processor = GuideLogitsProcessor(tokenizer=model.tokenizer, guide=guide) + return AudioSequenceGeneratorAdapter(model, logits_processor, sampler) + + @fsm.register(TransformersVision) def fsm_vision(model, fsm: interegular.fsm.FSM, sampler: Sampler = multinomial()): from outlines.processors import GuideLogitsProcessor diff --git a/outlines/generate/regex.py b/outlines/generate/regex.py index 673880e49..ab6267c5e 100644 --- a/outlines/generate/regex.py +++ b/outlines/generate/regex.py @@ -1,10 +1,11 @@ from functools import singledispatch from outlines.generate.api import ( + AudioSequenceGeneratorAdapter, SequenceGeneratorAdapter, VisionSequenceGeneratorAdapter, ) -from outlines.models import OpenAI, TransformersVision +from outlines.models import OpenAI, TransformersAudio, TransformersVision from outlines.samplers import Sampler, multinomial @@ -35,6 +36,18 @@ def regex(model, regex_str: str, sampler: Sampler = multinomial()): return SequenceGeneratorAdapter(model, logits_processor, sampler) +@regex.register(TransformersAudio) +def regex_audio( + model, + regex_str: str, + sampler: Sampler = multinomial(), +): + from outlines.processors import RegexLogitsProcessor + + logits_processor = RegexLogitsProcessor(regex_str, tokenizer=model.tokenizer) + return AudioSequenceGeneratorAdapter(model, logits_processor, sampler) + + @regex.register(TransformersVision) def regex_vision( model, diff --git a/outlines/generate/text.py b/outlines/generate/text.py index 32530d0c4..06fa20ee0 100644 --- a/outlines/generate/text.py +++ b/outlines/generate/text.py @@ -1,10 
+1,11 @@ from functools import singledispatch from outlines.generate.api import ( + AudioSequenceGeneratorAdapter, SequenceGeneratorAdapter, VisionSequenceGeneratorAdapter, ) -from outlines.models import OpenAI, TransformersVision +from outlines.models import OpenAI, TransformersAudio, TransformersVision from outlines.samplers import Sampler, multinomial @@ -34,6 +35,11 @@ def text(model, sampler: Sampler = multinomial()) -> SequenceGeneratorAdapter: return SequenceGeneratorAdapter(model, None, sampler) +@text.register(TransformersAudio) +def text_audio(model, sampler: Sampler = multinomial()): + return AudioSequenceGeneratorAdapter(model, None, sampler) + + @text.register(TransformersVision) def text_vision(model, sampler: Sampler = multinomial()): return VisionSequenceGeneratorAdapter(model, None, sampler) diff --git a/outlines/models/__init__.py b/outlines/models/__init__.py index fe6f861ac..1845d6d8f 100644 --- a/outlines/models/__init__.py +++ b/outlines/models/__init__.py @@ -13,6 +13,7 @@ from .mlxlm import MLXLM, mlxlm from .openai import OpenAI, azure_openai, openai from .transformers import Transformers, TransformerTokenizer, mamba, transformers +from .transformers_audio import TransformersAudio, transformers_audio from .transformers_vision import TransformersVision, transformers_vision from .vllm import VLLM, vllm diff --git a/outlines/models/transformers_audio.py b/outlines/models/transformers_audio.py new file mode 100644 index 000000000..bcfa8d848 --- /dev/null +++ b/outlines/models/transformers_audio.py @@ -0,0 +1,136 @@ +from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Union + +from outlines.generate.api import GenerationParameters, SamplingParameters +from outlines.models import Transformers + +if TYPE_CHECKING: + from outlines.processors import OutlinesLogitsProcessor + + +class TransformersAudio(Transformers): + def __init__(self, model, tokenizer, processor): + super().__init__(model, tokenizer) + self.processor = processor + + def generate( # type: ignore + self, + prompts: Union[str, List[str]], + media: Union[List[Any], List[List[Any]]], + generation_parameters: GenerationParameters, + logits_processor: Optional["OutlinesLogitsProcessor"], + sampling_parameters: SamplingParameters, + ) -> Union[str, List[str], List[List[str]]]: + """Generate text using `transformers`. + + Arguments + --------- + prompts + A prompt or list of prompts. + media + A List[numpy.ndarray] or List[List[numpy.ndarray]] + generation_parameters + An instance of `GenerationParameters` that contains the prompt, + the maximum number of tokens, stop sequences and seed. All the + arguments to `SequenceGeneratorAdapter`'s `__cal__` method. + logits_processor + The logits processor to use when generating text. + sampling_parameters + An instance of `SamplingParameters`, a dataclass that contains + the name of the sampler to use and related parameters as available + in Outlines. 
+ + Returns + ------- + The generated text + """ + inputs = self.processor( + text=prompts, audios=media, padding=True, return_tensors="pt" + ).to(self.model.device) + + generation_kwargs = self._get_generation_kwargs( + prompts, + generation_parameters, + logits_processor, + sampling_parameters, + ) + generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs) + + # if single str input and single sample per input, convert to a 1D output + if isinstance(prompts, str): + # Should always be true until NotImplementedError above is fixed + generated_ids = generated_ids.squeeze(0) + + return self._decode_generation(generated_ids) + + def stream( # type: ignore + self, + prompts: Union[str, List[str]], + media: Union[Any, List[Any]], # TODO: docstring + generation_parameters: GenerationParameters, + logits_processor: Optional["OutlinesLogitsProcessor"], + sampling_parameters: SamplingParameters, + ) -> Iterator[Union[str, List[str]]]: + raise NotImplementedError + + +def transformers_audio( + model_name: str, + model_class, + device: Optional[str] = None, + model_kwargs: dict = {}, + processor_kwargs: dict = {}, + tokenizer_class=None, + processor_class=None, +): + """Instantiate a model from the `transformers` library and its tokenizer. + + Parameters + ---------- + model_name + The name of the model as listed on Hugging Face's model page. + model_class + The `PreTrainedModel` class from transformers to use in initializing the vision model from `model_name`. + https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel + device + The device(s) on which the model should be loaded. This overrides + the `device_map` entry in `model_kwargs` when provided. + model_kwargs + A dictionary that contains the keyword arguments to pass to the + `from_pretrained` method when loading the model. + processor_kwargs + A dictionary that contains the keyword arguments to pass to the + `from_pretrained` method when loading the processor. + + Returns + ------- + A `TransformersModel` model instance. + + """ + if processor_class is None or tokenizer_class is None: + try: + from transformers import AutoProcessor, AutoTokenizer + except ImportError: + raise ImportError( + "The `transformers` library needs to be installed in order to use `transformers` models." 
+ ) + if processor_class is None: + processor_class = AutoProcessor + + if device is not None: + model_kwargs["device_map"] = device + + model = model_class.from_pretrained(model_name, **model_kwargs) + + processor_kwargs.setdefault("padding_side", "left") + processor_kwargs.setdefault("pad_token", "[PAD]") + processor = processor_class.from_pretrained(model_name, **processor_kwargs) + + if tokenizer_class is None: + if getattr(processor, "tokenizer", None): + tokenizer = processor.tokenizer + else: + tokenizer = AutoTokenizer.from_pretrained(model_name, **processor_kwargs) + else: + tokenizer = tokenizer_class.from_pretrained(model_name, **processor_kwargs) + + return TransformersAudio(model, tokenizer, processor) diff --git a/pyproject.toml b/pyproject.toml index 896e8aadd..19deb264e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,8 @@ test = [ "transformers", "pillow", "exllamav2", - "jax" + "jax", + "librosa", ] serve = [ "vllm>=0.3.0", @@ -147,6 +148,7 @@ module = [ "pycountry.*", "airportsdata.*", "outlines_core.*", + "librosa", ] ignore_missing_imports = true diff --git a/tests/generate/test_api.py b/tests/generate/test_api.py index 7188022f5..69d39cd47 100644 --- a/tests/generate/test_api.py +++ b/tests/generate/test_api.py @@ -1,13 +1,18 @@ from io import BytesIO from urllib.request import urlopen +import numpy as np import pytest from PIL import Image # type: ignore -from outlines.generate.api import VisionSequenceGeneratorAdapter +from outlines.generate.api import ( + AudioSequenceGeneratorAdapter, + VisionSequenceGeneratorAdapter, +) IMG_URI = "https://upload.wikimedia.org/wikipedia/en/a/a9/Example.jpg" PIL_IMG = Image.open(BytesIO(urlopen(IMG_URI).read())).convert("RGB") +AUDIO_ARRAY = np.array([1, 2]) @pytest.mark.parametrize( @@ -31,3 +36,26 @@ def test_vision_sequence_generator_validate_types(prompts, media, type_error): VisionSequenceGeneratorAdapter._validate_prompt_media_types(prompts, media) else: VisionSequenceGeneratorAdapter._validate_prompt_media_types(prompts, media) + + +@pytest.mark.parametrize( + "prompts,media,type_error", + [ + ("single prompt", [AUDIO_ARRAY], False), + (["prompt0", "prompt1"], [[AUDIO_ARRAY], [AUDIO_ARRAY]], False), + ("single prompt", [AUDIO_ARRAY, AUDIO_ARRAY], False), + (["prompt0", "prompt1"], [[AUDIO_ARRAY, AUDIO_ARRAY], [AUDIO_ARRAY]], False), + ("single prompt", "this isn't an audio, it's a string", True), + ("single prompt", AUDIO_ARRAY, True), + (["prompt0", "prompt1"], [AUDIO_ARRAY], True), + (["prompt0", "prompt1"], [[AUDIO_ARRAY]], True), + (["prompt0", "prompt1"], [[[AUDIO_ARRAY]], [[AUDIO_ARRAY]]], True), + ], +) +def test_audio_sequence_generator_validate_types(prompts, media, type_error): + """Ensure inputs are validated correctly""" + if type_error: + with pytest.raises(TypeError): + AudioSequenceGeneratorAdapter._validate_prompt_media_types(prompts, media) + else: + AudioSequenceGeneratorAdapter._validate_prompt_media_types(prompts, media) diff --git a/tests/generate/test_generate.py b/tests/generate/test_generate.py index f91bc8653..9f7d56b6e 100644 --- a/tests/generate/test_generate.py +++ b/tests/generate/test_generate.py @@ -2,6 +2,7 @@ import re from enum import Enum +import numpy as np import pytest import outlines.generate as generate @@ -72,6 +73,23 @@ def model_bart(tmp_path_factory): ) +@pytest.fixture(scope="session") +def model_transformers_audio(tmp_path_factory): + import torch + from transformers import Qwen2AudioForConditionalGeneration + + return models.transformers_audio( + 
"Qwen/Qwen2-Audio-7B-Instruct", + model_class=Qwen2AudioForConditionalGeneration, + device="cuda", + model_kwargs=dict( + torch_dtype=torch.bfloat16, + load_in_4bit=True, + low_cpu_mem_usage=True, + ), + ) + + @pytest.fixture(scope="session") def model_transformers_vision(tmp_path_factory): import torch @@ -125,6 +143,7 @@ def model_t5(tmp_path_factory): "model_bart", "model_transformers_vision", "model_vllm", + "model_transformers_audio", ) @@ -191,7 +210,11 @@ def enforce_not_implemented(model_fixture, *task_names): assert an NotImplementedError is raised. Otherwise, run normally """ NOT_IMPLEMENTED = { - "stream": ["model_transformers_vision", "model_vllm"], + "stream": [ + "model_transformers_vision", + "model_vllm", + "model_transformers_audio", + ], "batch": ["model_llamacpp", "model_mlxlm", "model_mlxlm_phi3"], "beam_search": ["model_llamacpp", "model_mlxlm", "model_mlxlm_phi3"], "multiple_samples": ["model_llamacpp", "model_mlxlm", "model_mlxlm_phi3"], @@ -226,6 +249,18 @@ def get_inputs(fixture_name, batch_size=None): "media": [[img] for _ in range(batch_size)], } + elif fixture_name.endswith("_audio"): + instruct_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n" + audio = np.random.random(20000) + + if batch_size is None: + return {"prompts": f"{instruct_prompt}{prompts}", "media": [audio]} + else: + return { + "prompts": [f"{instruct_prompt}{p}" for p in prompts], + "media": [[audio] for _ in range(batch_size)], + } + else: return {"prompts": prompts} diff --git a/tests/generate/test_integration_transformers_audio.py b/tests/generate/test_integration_transformers_audio.py new file mode 100644 index 000000000..3c98eef2e --- /dev/null +++ b/tests/generate/test_integration_transformers_audio.py @@ -0,0 +1,152 @@ +from io import BytesIO +from urllib.request import urlopen + +import librosa +import pytest +import torch +from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration + +import outlines +from outlines.models.transformers_audio import transformers_audio + +AUDIO_URLS = [ + "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3", + "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav", +] +QWEN2_AUDIO_SAMPLING_RATE = 16000 + +pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") + + +def audio_from_url(url): + audio_byte_stream = BytesIO(urlopen(url).read()) + return librosa.load(audio_byte_stream, sr=QWEN2_AUDIO_SAMPLING_RATE)[0] + + +@pytest.fixture(scope="session") +def model(tmp_path_factory): + return transformers_audio( + "Qwen/Qwen2-Audio-7B-Instruct", + model_class=Qwen2AudioForConditionalGeneration, + device="cuda", + model_kwargs=dict( + torch_dtype=torch.bfloat16, + load_in_4bit=True, + low_cpu_mem_usage=True, + ), + ) + + +@pytest.fixture(scope="session") +def processor(tmp_path_factory): + return AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct") + + +def test_single_audio_text_gen(model, processor): + conversation = [ + { + "role": "user", + "content": [ + {"audio"}, + {"type": "text", "text": "What's that sound?"}, + ], + }, + ] + generator = outlines.generate.text(model) + sequence = generator( + processor.apply_chat_template(conversation), + [audio_from_url(AUDIO_URLS[0])], + seed=10000, + max_tokens=10, + ) + assert isinstance(sequence, str) + + +def test_multi_audio_text_gen(model, processor): + """If the length of audio tags and number of audios we 
pass are > 1 and equal, + we should yield a successful generation. + """ + conversation = [ + { + "role": "user", + "content": [{"audio"} for _ in range(len(AUDIO_URLS))] + + [ + { + "type": "text", + "text": "Did a human make one of the audio recordings?", + } + ], + }, + ] + generator = outlines.generate.text(model) + sequence = generator( + processor.apply_chat_template(conversation), + [audio_from_url(url) for url in AUDIO_URLS], + seed=10000, + max_tokens=10, + ) + assert isinstance(sequence, str) + + +def test_mismatched_audio_text_gen(model, processor): + """If the length of audio tags and number of audios we pass are unequal, + we should raise an error. + """ + generator = outlines.generate.text(model) + + conversation = [ + { + "role": "user", + "content": [ + {"audio"}, + {"type": "text", "text": "I'm passing 2 audios, but only 1 audio tag"}, + ], + }, + ] + with pytest.raises(RuntimeError): + _ = generator( + processor.apply_chat_template(conversation), + [audio_from_url(i) for i in AUDIO_URLS], + seed=10000, + max_tokens=10, + ) + + conversation = [ + { + "role": "user", + "content": [ + {"audio"}, + {"audio"}, + {"type": "text", "text": "I'm passing 2 audio tags, but only 1 audio"}, + ], + }, + ] + with pytest.raises(ValueError): + _ = generator( + processor.apply_chat_template(conversation), + [audio_from_url(AUDIO_URLS[0])], + seed=10000, + max_tokens=10, + ) + + +def test_single_audio_choice(model, processor): + conversation = [ + { + "role": "user", + "content": [ + {"audio"}, + {"type": "text", "text": "What is this?"}, + ], + }, + ] + choices = ["dog barking", "glass breaking"] + generator = outlines.generate.choice(model, choices) + sequence = generator( + processor.apply_chat_template(conversation), + [audio_from_url(AUDIO_URLS[0])], + seed=10000, + max_tokens=10, + ) + assert isinstance(sequence, str) + assert sequence in choices From ee5a17678c740bf8df27a6d65571a3de556d82ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl?= Date: Tue, 3 Dec 2024 21:59:01 +0000 Subject: [PATCH 2/6] fix(test): tests for audio transformers --- tests/generate/test_generate.py | 8 +------- tests/generate/test_integration_transformers_audio.py | 8 +------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/tests/generate/test_generate.py b/tests/generate/test_generate.py index 9f7d56b6e..29153c581 100644 --- a/tests/generate/test_generate.py +++ b/tests/generate/test_generate.py @@ -75,18 +75,12 @@ def model_bart(tmp_path_factory): @pytest.fixture(scope="session") def model_transformers_audio(tmp_path_factory): - import torch from transformers import Qwen2AudioForConditionalGeneration return models.transformers_audio( "Qwen/Qwen2-Audio-7B-Instruct", model_class=Qwen2AudioForConditionalGeneration, - device="cuda", - model_kwargs=dict( - torch_dtype=torch.bfloat16, - load_in_4bit=True, - low_cpu_mem_usage=True, - ), + device="cpu", ) diff --git a/tests/generate/test_integration_transformers_audio.py b/tests/generate/test_integration_transformers_audio.py index 3c98eef2e..50d022089 100644 --- a/tests/generate/test_integration_transformers_audio.py +++ b/tests/generate/test_integration_transformers_audio.py @@ -3,7 +3,6 @@ import librosa import pytest -import torch from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration import outlines @@ -28,12 +27,7 @@ def model(tmp_path_factory): return transformers_audio( "Qwen/Qwen2-Audio-7B-Instruct", model_class=Qwen2AudioForConditionalGeneration, - device="cuda", - model_kwargs=dict( - torch_dtype=torch.bfloat16, - 
load_in_4bit=True, - low_cpu_mem_usage=True, - ), + device="cpu", ) From 69ec78714c9e88334393439076977cbf962c67b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl?= Date: Wed, 4 Dec 2024 17:13:25 +0000 Subject: [PATCH 3/6] fix(test): use tiny model for audio transformers --- outlines/generate/api.py | 19 +++++-------- outlines/models/transformers_audio.py | 6 ++++- tests/generate/test_api.py | 7 ++--- tests/generate/test_generate.py | 27 ++++++++++++++----- .../test_integration_transformers_audio.py | 6 ++--- 5 files changed, 39 insertions(+), 26 deletions(-) diff --git a/outlines/generate/api.py b/outlines/generate/api.py index a248162b1..396166622 100644 --- a/outlines/generate/api.py +++ b/outlines/generate/api.py @@ -691,23 +691,18 @@ def _validate_prompt_media_types( def valid_types(prompts, media): import numpy as np # type: ignore - if isinstance(prompts, list): - if not isinstance(media, list) or len(prompts) != len(media): - return False - for subprompt, submedia in zip(prompts, media): - if not isinstance(subprompt, str) or not all( - isinstance(m, np.ndarray) for m in submedia - ): - return False - elif isinstance(prompts, str): - if not all(isinstance(m, np.ndarray) for m in media): - return False + if not isinstance(prompts, (str, list)): + return False + if not isinstance(media, list): + return False + if not all(isinstance(m, np.ndarray) for m in media): + return False return True if not valid_types(prompts, media): raise TypeError( "Expected (prompts, media) to be of type " - "(str, List[np.ndarray])), or (List[str], List[List[np.ndarray]]) " + "(str, List[np.ndarray])), or (List[str], List[np.ndarray]]) " f"instead got prompts={prompts}, media={media}" ) diff --git a/outlines/models/transformers_audio.py b/outlines/models/transformers_audio.py index bcfa8d848..39b55b8d1 100644 --- a/outlines/models/transformers_audio.py +++ b/outlines/models/transformers_audio.py @@ -44,7 +44,11 @@ def generate( # type: ignore The generated text """ inputs = self.processor( - text=prompts, audios=media, padding=True, return_tensors="pt" + text=prompts, + audios=media, + padding=True, + return_tensors="pt", + sampling_rate=self.processor.feature_extractor.sampling_rate, ).to(self.model.device) generation_kwargs = self._get_generation_kwargs( diff --git a/tests/generate/test_api.py b/tests/generate/test_api.py index 69d39cd47..881da04ed 100644 --- a/tests/generate/test_api.py +++ b/tests/generate/test_api.py @@ -42,12 +42,13 @@ def test_vision_sequence_generator_validate_types(prompts, media, type_error): "prompts,media,type_error", [ ("single prompt", [AUDIO_ARRAY], False), - (["prompt0", "prompt1"], [[AUDIO_ARRAY], [AUDIO_ARRAY]], False), + (["single prompt"], [AUDIO_ARRAY], False), + (["prompt0", "prompt1"], [AUDIO_ARRAY, AUDIO_ARRAY], False), ("single prompt", [AUDIO_ARRAY, AUDIO_ARRAY], False), - (["prompt0", "prompt1"], [[AUDIO_ARRAY, AUDIO_ARRAY], [AUDIO_ARRAY]], False), ("single prompt", "this isn't an audio, it's a string", True), ("single prompt", AUDIO_ARRAY, True), - (["prompt0", "prompt1"], [AUDIO_ARRAY], True), + (["prompt0", "prompt1"], [AUDIO_ARRAY], False), + ("prompt0", [[AUDIO_ARRAY]], True), (["prompt0", "prompt1"], [[AUDIO_ARRAY]], True), (["prompt0", "prompt1"], [[[AUDIO_ARRAY]], [[AUDIO_ARRAY]]], True), ], diff --git a/tests/generate/test_generate.py b/tests/generate/test_generate.py index 29153c581..7f5f108d1 100644 --- a/tests/generate/test_generate.py +++ b/tests/generate/test_generate.py @@ -78,7 +78,7 @@ def model_transformers_audio(tmp_path_factory): from 
transformers import Qwen2AudioForConditionalGeneration return models.transformers_audio( - "Qwen/Qwen2-Audio-7B-Instruct", + "yujiepan/qwen2-audio-tiny-random", model_class=Qwen2AudioForConditionalGeneration, device="cpu", ) @@ -210,7 +210,12 @@ def enforce_not_implemented(model_fixture, *task_names): "model_transformers_audio", ], "batch": ["model_llamacpp", "model_mlxlm", "model_mlxlm_phi3"], - "beam_search": ["model_llamacpp", "model_mlxlm", "model_mlxlm_phi3"], + "beam_search": [ + "model_llamacpp", + "model_mlxlm", + "model_mlxlm_phi3", + "model_transformers_audio", + ], "multiple_samples": ["model_llamacpp", "model_mlxlm", "model_mlxlm_phi3"], "cfg": ["model_llamacpp"], # TODO: fix llama_cpp tokenizer } @@ -245,14 +250,20 @@ def get_inputs(fixture_name, batch_size=None): elif fixture_name.endswith("_audio"): instruct_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n" - audio = np.random.random(20000) + batch_prompt = "<|im_start|>assistant\n" + audio = np.random.random(20000).astype(np.float32) if batch_size is None: - return {"prompts": f"{instruct_prompt}{prompts}", "media": [audio]} + return { + "prompts": f"{instruct_prompt}{prompts}<|im_end|>\n", + "media": [audio], + } else: return { - "prompts": [f"{instruct_prompt}{p}" for p in prompts], - "media": [[audio] for _ in range(batch_size)], + "prompts": [ + f"{instruct_prompt}{p}<|im_end|>\n{batch_prompt}" for p in prompts + ], + "media": [audio for _ in range(batch_size)], } else: @@ -445,7 +456,9 @@ def test_generate_regex_batch_multi_sample( generator = generate.regex( model, pattern, sampler=getattr(samplers, sampler_name)(4) ) - with enforce_not_implemented(model_fixture, "batch", "multiple_samples"): + with enforce_not_implemented( + model_fixture, "batch", "multiple_samples", sampler_name + ): output_batch_groups = generator(**get_inputs(model_fixture, 4), max_tokens=40) for output_sample_groups in output_batch_groups: for output in output_sample_groups: diff --git a/tests/generate/test_integration_transformers_audio.py b/tests/generate/test_integration_transformers_audio.py index 50d022089..d9fe0921c 100644 --- a/tests/generate/test_integration_transformers_audio.py +++ b/tests/generate/test_integration_transformers_audio.py @@ -25,7 +25,7 @@ def audio_from_url(url): @pytest.fixture(scope="session") def model(tmp_path_factory): return transformers_audio( - "Qwen/Qwen2-Audio-7B-Instruct", + "yujiepan/qwen2-audio-tiny-random", model_class=Qwen2AudioForConditionalGeneration, device="cpu", ) @@ -33,7 +33,7 @@ def model(tmp_path_factory): @pytest.fixture(scope="session") def processor(tmp_path_factory): - return AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct") + return AutoProcessor.from_pretrained("yujiepan/qwen2-audio-tiny-random") def test_single_audio_text_gen(model, processor): @@ -130,7 +130,7 @@ def test_single_audio_choice(model, processor): "role": "user", "content": [ {"audio"}, - {"type": "text", "text": "What is this?"}, + {"type": "text", "text": "What's that sound?"}, ], }, ] From 5d3142d6ed1287c30da7534f24d00564f92f13ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl?= Date: Wed, 11 Dec 2024 10:29:31 +0000 Subject: [PATCH 4/6] fix(test): correctly handle beam_search in generate text --- tests/generate/test_generate.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/generate/test_generate.py b/tests/generate/test_generate.py index 7f5f108d1..3615a3605 100644 --- 
a/tests/generate/test_generate.py +++ b/tests/generate/test_generate.py @@ -279,10 +279,19 @@ def get_inputs(fixture_name, batch_size=None): @pytest.mark.parametrize("model_fixture", ALL_MODEL_FIXTURES) def test_generate_text(request, model_fixture, sampler_name): model = request.getfixturevalue(model_fixture) - generator = generate.text(model, getattr(samplers, sampler_name)()) with enforce_not_implemented(model_fixture, sampler_name): - res = generator(**get_inputs(model_fixture), max_tokens=10) - assert isinstance(res, str) + if sampler_name == "beam_search": + num_head = 2 + generator = generate.text(model, getattr(samplers, sampler_name)(num_head)) + res = generator(**get_inputs(model_fixture), max_tokens=10) + assert isinstance(res, list) + assert len(res) == num_head + for elt in res: + assert isinstance(elt, str) + else: + generator = generate.text(model, getattr(samplers, sampler_name)()) + res = generator(**get_inputs(model_fixture), max_tokens=10) + assert isinstance(res, str) @pytest.mark.parametrize("pattern", REGEX_PATTERNS) From d7d6b652aba75cc90799c2852ff5a9250b55b35b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl?= Date: Wed, 11 Dec 2024 11:35:03 +0000 Subject: [PATCH 5/6] feat(audio): add cookbook for audio transformers integration --- docs/cookbook/audio_understanding.md | 200 +++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 docs/cookbook/audio_understanding.md diff --git a/docs/cookbook/audio_understanding.md b/docs/cookbook/audio_understanding.md new file mode 100644 index 000000000..7d601f81f --- /dev/null +++ b/docs/cookbook/audio_understanding.md @@ -0,0 +1,200 @@ +# Generate structured output for audio understanding + +Even though audio-LM models for audio-text-to-text tasks are still pretty niche, they are still useful (and fun) to analyse, extract informations, translate or transcript speeches. + +This cookbook highlights the new integration of audio-LM and has been tested with `Qwen/Qwen2-Audio-7B-Instruct` ([HF link](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)). + +## Setup + +As usual let's have the right packages + +```bash +pip install outlines torch==2.4.0 transformers accelerate librosa +``` + +So that you can import as follow: + +```python +# LLM stuff +import outlines +from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration + +# Audio stuff +import librosa +from io import BytesIO +from urllib.request import urlopen + +# Some ooo stuff +from enum import Enum +from pydantic import BaseModel +from typing import Optional +``` + +## Load the model and processor + +To achieve audio analysis we will need a model and its processor to pre-process prompts and audio. 
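+
+Before downloading the full 7B checkpoint, you can optionally smoke-test the audio integration itself. The sketch below is only meant to confirm that your environment is wired up correctly: it relies on the tiny random checkpoint used in the test suite for this integration, so its output is meaningless, and the hand-written chat scaffold and question are just placeholders.
+
+```python
+import numpy as np
+import outlines
+from transformers import Qwen2AudioForConditionalGeneration
+
+tiny_model = outlines.models.transformers_audio(
+    "yujiepan/qwen2-audio-tiny-random",  # random weights, smoke test only
+    model_class=Qwen2AudioForConditionalGeneration,
+    device="cpu",
+)
+
+# The prompt must contain one <|AUDIO|> placeholder per audio array passed to the generator.
+prompt = (
+    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+    "<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
+    "What do you hear?<|im_end|>\n<|im_start|>assistant\n"
+)
+audio = np.random.random(20000).astype(np.float32)  # 1.25 s of noise at 16 kHz
+
+generator = outlines.generate.text(tiny_model)
+print(generator(prompt, [audio], max_tokens=10))
+```
+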
+Now, let's load the full `Qwen/Qwen2-Audio-7B-Instruct` model and its processor:
+
+```python
+import torch
+
+qwen2_audio = outlines.models.transformers_audio(
+    "Qwen/Qwen2-Audio-7B-Instruct",
+    model_class=Qwen2AudioForConditionalGeneration,
+    model_kwargs={
+        "device_map": "auto",
+        "torch_dtype": torch.bfloat16,
+    },
+    processor_kwargs={
+        "device": "cuda",  # set to "cpu" if you don't have a GPU
+    },
+)
+
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
+```
+
+Let's also define a helper that extracts the audio clips from a conversational prompt:
+
+```python
+def audio_extractor(conversation):
+    audios = []
+    for message in conversation:
+        if isinstance(message["content"], list):
+            for elt in message["content"]:
+                if elt["type"] == "audio":
+                    audios.append(
+                        librosa.load(
+                            BytesIO(urlopen(elt['audio_url']).read()),
+                            sr=processor.feature_extractor.sampling_rate
+                        )[0]
+                    )
+    return audios
+```
+
+## Question answering
+
+Let's say we want to analyse the speech in this [audio](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav) and answer the speaker's question.
+
+### Data structure
+
+To get structured output, we can define the following data model:
+
+```python
+class Age(int, Enum):
+    twenties = 20
+    fifties = 50
+
+class Gender(str, Enum):
+    male = "male"
+    female = "female"
+
+class Person(BaseModel):
+    gender: Gender
+    age: Age
+    language: Optional[str]
+```
+
+### Prompting
+
+Let's use the following prompt to query our model:
+
+```python
+audio_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"
+
+conversation = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": [
+        {"type": "audio", "audio_url": audio_url},
+        {
+            "type": "text",
+            "text": f"""As asked in the audio, what is the gender and the age of the speaker?
+
+            Return the information in the following JSON schema:
+            {Person.model_json_schema()}
+            """
+        },
+    ]},
+]
+```
+
+But we cannot pass it in raw! We first need to pre-process the prompt and load the audio file.
+
+```python
+audios = audio_extractor(conversation)
+
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+```
+
+Now we're ready to ask our model!
+
+### Run the model
+
+As usual with the Outlines framework, we instantiate a generator that structures the output according to our data model:
+
+```python
+person_generator = outlines.generate.json(
+    qwen2_audio,
+    Person,
+    sampler=outlines.samplers.greedy()
+)
+```
+
+Then we simply run it:
+
+```python
+result = person_generator(prompt, audios)
+```
+
+And you should get a result like the following:
+```
+Person(
+    gender=<Gender...>,
+    age=<Age...>,
+    language='English'
+)
+```
+
+## Classification
+
+Now let's focus on this [audio](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3) of glass breaking.
+
+The audio transformers integration lets you use all the functionality of the Outlines API, such as the `choice` method. We can proceed as follows:
+
+### Prompting
+
+Let's consider the following prompt and pre-process our audio:
+
+```python
+audio_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"
+
+conversation = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": [
+        {"type": "audio", "audio_url": audio_url},
+        {
+            "type": "text",
+            "text": "Do you hear a dog barking or a glass breaking?"
+ }, + ]}, +] + +audios = audio_extractor(conversation) + +prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) +``` + +### Run the model + +As mentioned, we will use the `choice` method to generate our structured output: + +```python +choice_generator = outlines.generate.choice( + qwen2_audio, + ["dog barking", "glass breaking"], +) + +result = choice_generator(prompt, audios) +``` + +And you are expected to have: +```python +print(result) +# "glass breaking" +``` From b529821a4c8d26970653383feefb580453de31cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl?= Date: Wed, 11 Dec 2024 11:47:15 +0000 Subject: [PATCH 6/6] test(audio): improve coverage of validate prompt and media --- outlines/generate/api.py | 7 +++++-- tests/generate/test_api.py | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/outlines/generate/api.py b/outlines/generate/api.py index 396166622..2c8a0d831 100644 --- a/outlines/generate/api.py +++ b/outlines/generate/api.py @@ -691,8 +691,11 @@ def _validate_prompt_media_types( def valid_types(prompts, media): import numpy as np # type: ignore - if not isinstance(prompts, (str, list)): - return False + if not isinstance(prompts, str): + if not isinstance(prompts, list): + return False + if not all(isinstance(p, str) for p in prompts): + return False if not isinstance(media, list): return False if not all(isinstance(m, np.ndarray) for m in media): diff --git a/tests/generate/test_api.py b/tests/generate/test_api.py index 881da04ed..4b162a147 100644 --- a/tests/generate/test_api.py +++ b/tests/generate/test_api.py @@ -42,6 +42,8 @@ def test_vision_sequence_generator_validate_types(prompts, media, type_error): "prompts,media,type_error", [ ("single prompt", [AUDIO_ARRAY], False), + (0, [AUDIO_ARRAY], True), + ([AUDIO_ARRAY], "single prompt", True), (["single prompt"], [AUDIO_ARRAY], False), (["prompt0", "prompt1"], [AUDIO_ARRAY, AUDIO_ARRAY], False), ("single prompt", [AUDIO_ARRAY, AUDIO_ARRAY], False),