From 2b30a915d2491476cad7aea6cf43bf4fb11e832b Mon Sep 17 00:00:00 2001
From: Yu xing
Date: Tue, 20 Aug 2024 23:21:06 -0700
Subject: [PATCH 01/31] update ci

---
 .github/workflows/ci.yaml     | 37 +++++++++++++++++++++++++++++++++++
 tests/test_text_generation.py |  4 ++--
 tests/test_vlm.py             |  3 ++-
 3 files changed, 41 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/ci.yaml

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 00000000..35568ec0
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,37 @@
+name: Python CI
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive # This will clone the repository with all its submodules
+          fetch-depth: 0 # This fetches all history so you can access any version of the submodules
+
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10' # Specify the Python version you want
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install build pytest
+      - name: Build DLL
+        run: |
+          pip install -e .
+      - name: Run tests
+        run: |
+          python -m pytest tests
\ No newline at end of file
diff --git a/tests/test_text_generation.py b/tests/test_text_generation.py
index f37a4781..04782a21 100644
--- a/tests/test_text_generation.py
+++ b/tests/test_text_generation.py
@@ -1,7 +1,7 @@
 import os
 from nexa.gguf.llama import llama
 from tests.utils import download_model
-
+from nexa.gguf.lib_utils import is_gpu_available
 # Constants
 TINY_LLAMA_URL = "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
 OUTPUT_DIR = os.getcwd()
@@ -12,7 +12,7 @@ def init_llama_model(verbose=False, n_gpu_layers=-1, chat_format=None, embedding
     return llama.Llama(
         model_path=MODEL_PATH,
         verbose=verbose,
-        n_gpu_layers=n_gpu_layers,
+        n_gpu_layers=n_gpu_layers if is_gpu_available() else 0,
         chat_format=chat_format,
         embedding=embedding,
     )
diff --git a/tests/test_vlm.py b/tests/test_vlm.py
index b70389be..25d81d56 100644
--- a/tests/test_vlm.py
+++ b/tests/test_vlm.py
@@ -4,6 +4,7 @@
 from nexa.gguf.llama import llama
 from nexa.gguf.llama.llama_chat_format import NanoLlavaChatHandler
 from tests.utils import download_model
+from nexa.gguf.lib_utils import is_gpu_available
 
 def image_to_base64_data_uri(file_path):
     """
@@ -31,7 +32,7 @@ def test_image_generation():
         model_path=model_path,
         chat_handler=chat_handler,
         n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
-        n_gpu_layers=-1, # Uncomment to use GPU acceleration
+        n_gpu_layers=-1 if is_gpu_available() else 0, # Uncomment to use GPU acceleration
         verbose=False,
     )
     output = llm.create_chat_completion(

From 220dbc07bcb47332df974e5a7b5e42d38b427536 Mon Sep 17 00:00:00 2001
From: Yu xing
Date: Tue, 20 Aug 2024 23:54:40 -0700
Subject: [PATCH 02/31] use tempfile and try to fix ci

---
 tests/test_vlm.py | 85 +++++++++++++++++++++++------------------------
 1 file changed, 42 insertions(+), 43 deletions(-)

diff --git a/tests/test_vlm.py b/tests/test_vlm.py
index 25d81d56..17400bd4 100644
--- a/tests/test_vlm.py
+++ b/tests/test_vlm.py
@@ -5,6 +5,7 @@
 from nexa.gguf.llama.llama_chat_format import NanoLlavaChatHandler
 from tests.utils import download_model
 from nexa.gguf.lib_utils import is_gpu_available
+import tempfile
 
 def image_to_base64_data_uri(file_path):
     """
@@ -15,53 +16,51 @@ def image_to_base64_data_uri(file_path):
         base64_data = base64.b64encode(img_file.read()).decode("utf-8")
         return f"data:image/png;base64,{base64_data}"
 
-model_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/model-fp16.gguf"
-mmproj_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/projector-fp16.gguf"
-# Download paths
-output_dir = os.getcwd()
-model_path = download_model(model_url, output_dir)
-mmproj_path = download_model(mmproj_url, output_dir)
-print("Model downloaded to:", model_path)
-print("MMProj downloaded to:", mmproj_path)
+def test_image_generation():
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_dir = os.path.dirname(os.path.abspath(__file__))
+        model_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/model-fp16.gguf"
+        mmproj_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/projector-fp16.gguf"
 
-chat_handler = NanoLlavaChatHandler(clip_model_path=mmproj_path)
+        model_path = download_model(model_url, temp_dir)
+        mmproj_path = download_model(mmproj_url, temp_dir)
+        chat_handler = NanoLlavaChatHandler(clip_model_path=mmproj_path)
 
-def test_image_generation():
-    llm = llama.Llama(
-        model_path=model_path,
-        chat_handler=chat_handler,
-        n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
-        n_gpu_layers=-1 if is_gpu_available() else 0, # Uncomment to use GPU acceleration
-        verbose=False,
-    )
-    output = llm.create_chat_completion(
-        messages=[
-            {
-                "role": "system",
-                "content": "You are an assistant who perfectly describes images.",
-            },
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "What's in this image?"},
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-                        },
-                    },
-                ],
-            },
-        ],
-        stream=True,
-    )
-    for chunk in output:
-        delta = chunk["choices"][0]["delta"]
-        if "role" in delta:
-            print(delta["role"], end=": ")
-        elif "content" in delta:
-            print(delta["content"], end="")
+        llm = llama.Llama(
+            model_path=model_path,
+            chat_handler=chat_handler,
+            n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
+            n_gpu_layers=-1 if is_gpu_available() else 0, # Uncomment to use GPU acceleration
+            verbose=False,
+        )
+        output = llm.create_chat_completion(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are an assistant who perfectly describes images.",
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What's in this image?"},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+                            },
+                        },
+                    ],
+                },
+            ],
+            stream=True,
+        )
+        for chunk in output:
+            delta = chunk["choices"][0]["delta"]
+            if "role" in delta:
+                print(delta["role"], end=": ")
+            elif "content" in delta:
+                print(delta["content"], end="")
 
 
 # if __name__ == "__main__":

From 7047e1f91848c38f0091b138a1e631ada644da7c Mon Sep 17 00:00:00 2001
From: Yu xing
Date: Wed, 21 Aug 2024 00:38:27 -0700
Subject: [PATCH 03/31] update ci

---
 .github/workflows/ci.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 35568ec0..bde784ed 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -28,10 +28,11 @@ jobs:
       - name: 
Install dependencies run: | python -m pip install --upgrade pip - pip install build pytest + python -m pip install numpy --upgrade + python -m pip install build pytest - name: Build DLL run: | - pip install -e . + python -m pip install -e . - name: Run tests run: | python -m pytest tests \ No newline at end of file From 45751159c3b3c43776adaf286a73a372d7636ec2 Mon Sep 17 00:00:00 2001 From: Zack Li Date: Wed, 21 Aug 2024 17:33:31 +0000 Subject: [PATCH 04/31] wip --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 12c9232e..d290b249 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "nexaai" -version = "0.0.1.dev" +version = "0.0.2.dev" description = "Nexa AI SDK" readme = "README.md" license = { text = "MIT" } From 663f20269fee0e5f763421450f7047e62642f0d5 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 12:57:37 -0700 Subject: [PATCH 05/31] update ci --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index bde784ed..0e17e44d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -10,7 +10,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: macos-latest steps: - name: Checkout code From dd49f40a98805ff3092f008a2e563dc997bdb7ba Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 13:26:48 -0700 Subject: [PATCH 06/31] expose gguf interface --- nexa/gguf/nexa_inference_image.py | 124 +++++++++++++++++------------- nexa/gguf/nexa_inference_text.py | 65 +++++++++++++--- nexa/gguf/nexa_inference_vlm.py | 25 ++++++ nexa/gguf/nexa_inference_voice.py | 101 +++++++++++++++++++++--- 4 files changed, 238 insertions(+), 77 deletions(-) diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 21714fad..473b9f2f 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -29,21 +29,22 @@ class NexaImageInference: A class used for loading image models and running image generation. Methods: - run_txt2img: Run the text-to-image generation loop. - run_img2img: Run the image-to-image generation loop. - run_streamlit: Run the Streamlit UI. + run_txt2img: Run the text-to-image generation loop. + run_img2img: Run the image-to-image generation loop. + run_streamlit: Run the Streamlit UI. Args: - model_path (str): Path or identifier for the model in Nexa Model Hub. - num_inference_steps (int): Number of inference steps. - width (int): Width of the output image. - height (int): Height of the output image. - guidance_scale (float): Guidance scale for diffusion. - output_path (str): Output path for the generated image. - random_seed (int): Random seed for image generation. - streamlit (bool): Run the inference in Streamlit UI. + model_path (str): Path or identifier for the model in Nexa Model Hub. + num_inference_steps (int): Number of inference steps. + width (int): Width of the output image. + height (int): Height of the output image. + guidance_scale (float): Guidance scale for diffusion. + output_path (str): Output path for the generated image. + random_seed (int): Random seed for image generation. + streamlit (bool): Run the inference in Streamlit UI. 
""" + from nexa.gguf.sd.stable_diffusion import StableDiffusion def __init__(self, model_path, **kwargs): self.model_path = None @@ -107,63 +108,75 @@ def _save_images(self, images): file_path = os.path.join(output_dir, file_name) image.save(file_path) logging.info(f"\nImage {i+1} saved to: {file_path}") + + def txt2img(self, prompt, negative_prompt): + """ + Used for SDK. Generate images from text. + + Args: + prompt (str): Prompt for the image generation. + negative_prompt (str): Negative prompt for the image generation. - def loop_txt2img(self): + Returns: + list: List of generated images. + """ + images = self.model.txt_to_img( + prompt=prompt, + negative_prompt=negative_prompt if negative_prompt else "", + cfg_scale=self.params["guidance_scale"], + width=self.params["width"], + height=self.params["height"], + sample_steps=self.params["num_inference_steps"], + seed=self.params["random_seed"], + control_cond=self.params.get("control_image_path", ""), + control_strength=self.params.get("control_strength", 0.9), + ) + return images + def run_txt2img(self): while True: try: prompt = nexa_prompt("Enter your prompt: ") negative_prompt = nexa_prompt( "Enter your negative prompt (press Enter to skip): " ) - self._txt2img(prompt, negative_prompt) + try: + images = self.txt2img(prompt, negative_prompt) + self._save_images(images) + except Exception as e: + logging.error(f"Error during text to image generation: {e}") except KeyboardInterrupt: print(EXIT_REMINDER) except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) - def _txt2img(self, prompt: str, negative_prompt: str): + def img2img(self, image_path, prompt, negative_prompt): """ - Generate images based on the given prompt, negative prompt, and parameters. - """ - try: - images = self.model.txt_to_img( - prompt=prompt, - negative_prompt=negative_prompt if negative_prompt else "", - cfg_scale=self.params["guidance_scale"], - width=self.params["width"], - height=self.params["height"], - sample_steps=self.params["num_inference_steps"], - seed=self.params["random_seed"], - control_cond=self.params.get("control_image_path", ""), - control_strength=self.params.get("control_strength", 0.9), - ) - self._save_images(images) - except Exception as e: - logging.error(f"Error during image generation: {e}") + Used for SDK. Generate images from an image. - def loop_img2img(self): - def _generate_images(image_path, prompt, negative_prompt): - """ - Generate images based on the given prompt, negative prompt, and parameters. - """ - try: - images = self.model.img_to_img( - image=image_path, - prompt=prompt, - negative_prompt=negative_prompt if negative_prompt else "", - cfg_scale=self.params["guidance_scale"], - width=self.params["width"], - height=self.params["height"], - sample_steps=self.params["num_inference_steps"], - seed=self.params["random_seed"], - control_cond=self.params.get("control_image_path", ""), - control_strength=self.params.get("control_strength", 0.9), - ) - self._save_images(images) - except Exception as e: - logging.error(f"Error during image generation: {e}") + Args: + image_path (str): Path to the input image. + prompt (str): Prompt for the image generation. + negative_prompt (str): Negative prompt for the image generation. + Returns: + list: List of generated images. 
+ """ + images = self.model.img_to_img( + image=image_path, + prompt=prompt, + negative_prompt=negative_prompt if negative_prompt else "", + cfg_scale=self.params["guidance_scale"], + width=self.params["width"], + height=self.params["height"], + sample_steps=self.params["num_inference_steps"], + seed=self.params["random_seed"], + control_cond=self.params.get("control_image_path", ""), + control_strength=self.params.get("control_strength", 0.9), + ) + return images + + def run_img2img(self): while True: try: image_path = nexa_prompt("Enter the path to your image: ") @@ -171,7 +184,8 @@ def _generate_images(image_path, prompt, negative_prompt): negative_prompt = nexa_prompt( "Enter your negative prompt (press Enter to skip): " ) - _generate_images(image_path, prompt, negative_prompt) + images = self.img2img(image_path, prompt, negative_prompt) + self._save_images(images) except KeyboardInterrupt: print(EXIT_REMINDER) except Exception as e: @@ -257,6 +271,6 @@ def run_streamlit(self, model_path: str): inference.run_streamlit(model_path) else: if args.img2img: - inference.loop_img2img() + inference.run_img2img() else: - inference.loop_txt2img() + inference.run_txt2img() \ No newline at end of file diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index 210e0267..4f3e933f 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -14,7 +14,6 @@ ) from nexa.general import pull_model from nexa.gguf.lib_utils import is_gpu_available -from nexa.gguf.llama.llama import Llama from nexa.utils import SpinningCursorAnimation, nexa_prompt, suppress_stdout_stderr logging.basicConfig( @@ -27,19 +26,20 @@ class NexaTextInference: A class used for load text models and run text generation. Methods: - run: Run the text generation loop. - run_streamlit: Run the Streamlit UI. + run: Run the text generation loop. + run_streamlit: Run the Streamlit UI. Args: - model_path (str): Path or identifier for the model in Nexa Model Hub. - stop_words (list): List of stop words for early stopping. - profiling (bool): Enable timing measurements for the generation process. - streamlit (bool): Run the inference in Streamlit UI. - temperature (float): Temperature for sampling. - max_new_tokens (int): Maximum number of new tokens to generate. - top_k (int): Top-k sampling parameter. - top_p (float): Top-p sampling parameter + model_path (str): Path or identifier for the model in Nexa Model Hub. + stop_words (list): List of stop words for early stopping. + profiling (bool): Enable timing measurements for the generation process. + streamlit (bool): Run the inference in Streamlit UI. + temperature (float): Temperature for sampling. + max_new_tokens (int): Maximum number of new tokens to generate. + top_k (int): Top-k sampling parameter. + top_p (float): Top-p sampling parameter """ + from nexa.gguf.llama import Llama def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS self.params.update(kwargs) @@ -132,6 +132,9 @@ def _load_model(self): self.conversation_history = [] if self.chat_format else None def run(self): + """ + CLI interactive session. Not for SDK. + """ while True: generated_text = "" try: @@ -177,6 +180,44 @@ def run(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) print("\n") + + def create_chat_completion(self, messages, temperature=0.7, max_tokens=2048, top_k=50, top_p=1.0, stream=False, stop=None): + """ + Used for SDK. Generate completion for a chat conversation. 
+ + Args: + messages (list): List of messages in the conversation. + temperature (float): Temperature for sampling. + max_tokens (int): Maximum number of new tokens to generate. + top_k (int): Top-k sampling parameter. + top_p (float): Top-p sampling parameter. + stream (bool): Stream the output. + stop (list): List of stop words for early stopping. + + Returns: + Iterator: Iterator for the completion. + """ + return self.model.create_chat_completion(messages=messages, temperature=temperature, max_tokens=max_tokens, top_k=top_k, top_p=top_p, stream=stream, stop=stop) + + def create_completion(self, prompt, temperature=0.7, max_tokens=2048, top_k=50, top_p=1.0, echo=False, stream=False, stop=None): + """ + Used for SDK. Generate completion for a given prompt. + + Args: + prompt (str): Prompt for the completion. + temperature (float): Temperature for sampling. + max_tokens (int): Maximum number of new tokens to generate. + top_k (int): Top-k sampling parameter. + top_p (float): Top-p sampling parameter. + echo (bool): Echo the prompt back in the output. + stream (bool): Stream the output. + stop (list): List of stop words for early stopping. + + Returns: + Iterator: Iterator for the completion. + """ + return self.model.create_completion(prompt=prompt, temperature=temperature, max_tokens=max_tokens, top_k=top_k, top_p=top_p, echo=echo, stream=stream, stop=stop) + def _chat(self, user_input: str) -> Iterator: self.conversation_history.append({"role": "user", "content": user_input}) @@ -209,7 +250,7 @@ def _complete(self, user_input: str) -> Iterator: def run_streamlit(self, model_path: str): """ - Run the Streamlit UI. + Used for CLI. Run the Streamlit UI. """ logging.info("Running Streamlit UI...") diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 2d9e39e1..157bf28e 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -79,6 +79,8 @@ class NexaVLMInference: top_k (int): Top-k sampling parameter. top_p (float): Top-p sampling parameter """ + from nexa.gguf.llama.llama import Llama + def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS self.params.update(kwargs) @@ -216,6 +218,29 @@ def run(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) print("\n") + + def create_chat_completion(self, messages, temperature, max_tokens, top_k, top_p, stream, stop): + """ + Generate text completion for a given chat prompt. + + Args: + messages (list): List of messages in the chat prompt. + temperature (float): Temperature for sampling. + max_tokens (int): Maximum number of tokens to generate. + top_k (int): Top-k sampling parameter. + top_p (float): Top-p sampling parameter. + stream (bool): Stream the output. + stop (list): List of stop words for early stopping. + """ + return self.model.create_chat_completion( + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + top_k=top_k, + top_p=top_p, + stream=stream, + stop=stop, + ) def _chat(self, user_input: str, image_path: str = None) -> Iterator: data_uri = image_to_base64_data_uri(image_path) if image_path else None diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index 7822344d..3ab103d6 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -19,18 +19,18 @@ class NexaVoiceInference: A class used for loading voice models and running voice transcription. Methods: - run: Run the voice transcription loop. 
- run_streamlit: Run the Streamlit UI. + run: Run the voice transcription loop. + run_streamlit: Run the Streamlit UI. Args: - model_path (str): Path or identifier for the model in Nexa Model Hub. - output_dir (str): Output directory for transcriptions. - beam_size (int): Beam size to use for transcription. - language (str): The language spoken in the audio. - task (str): Task to execute (transcribe or translate). - temperature (float): Temperature for sampling. - compute_type (str): Type to use for computation (e.g., float16, int8, int8_float16). - output_dir (str): Output directory for transcriptions. + model_path (str): Path or identifier for the model in Nexa Model Hub. + output_dir (str): Output directory for transcriptions. + beam_size (int): Beam size to use for transcription. + language (str): The language spoken in the audio. + task (str): Task to execute (transcribe or translate). + temperature (float): Temperature for sampling. + compute_type (str): Type to use for computation (e.g., float16, int8, int8_float16). + output_dir (str): Output directory for transcriptions. """ def __init__(self, model_path, **kwargs): @@ -87,6 +87,87 @@ def run(self): print(EXIT_REMINDER) except Exception as e: logging.error(f"Error during text generation: {e}", exc_info=True) + + def transcribe(self, audio, **kwargs): + """ + Transcribe the audio file. + + Arguments: + audio: Path to the input file (or a file-like object), or the audio waveform. + language: The language spoken in the audio. It should be a language code such + as "en" or "fr". If not set, the language will be detected in the first 30 seconds + of audio. + task: Task to execute (transcribe or translate). + beam_size: Beam size to use for decoding. + best_of: Number of candidates when sampling with non-zero temperature. + patience: Beam search patience factor. + length_penalty: Exponential length penalty constant. + repetition_penalty: Penalty applied to the score of previously generated tokens + (set > 1 to penalize). + no_repeat_ngram_size: Prevent repetitions of ngrams with this size (set 0 to disable). + temperature: Temperature for sampling. It can be a tuple of temperatures, + which will be successively used upon failures according to either + `compression_ratio_threshold` or `log_prob_threshold`. + compression_ratio_threshold: If the gzip compression ratio is above this value, + treat as failed. + log_prob_threshold: If the average log probability over sampled tokens is + below this value, treat as failed. + no_speech_threshold: If the no_speech probability is higher than this value AND + the average log probability over sampled tokens is below `log_prob_threshold`, + consider the segment as silent. + condition_on_previous_text: If True, the previous output of the model is provided + as a prompt for the next window; disabling may make the text inconsistent across + windows, but the model becomes less prone to getting stuck in a failure loop, + such as repetition looping or timestamps going out of sync. + prompt_reset_on_temperature: Resets prompt if temperature is above this value. + Arg has effect only if condition_on_previous_text is True. + initial_prompt: Optional text string or iterable of token ids to provide as a + prompt for the first window. + prefix: Optional text to provide as a prefix for the first window. + suppress_blank: Suppress blank outputs at the beginning of the sampling. + suppress_tokens: List of token IDs to suppress. -1 will suppress a default set + of symbols as defined in the model config.json file. 
+ without_timestamps: Only sample text tokens. + max_initial_timestamp: The initial timestamp cannot be later than this. + word_timestamps: Extract word-level timestamps using the cross-attention pattern + and dynamic time warping, and include the timestamps for each word in each segment. + prepend_punctuations: If word_timestamps is True, merge these punctuation symbols + with the next word + append_punctuations: If word_timestamps is True, merge these punctuation symbols + with the previous word + vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio + without speech. This step is using the Silero VAD model + https://github.com/snakers4/silero-vad. + vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available + parameters and default values in the class `VadOptions`). + max_new_tokens: Maximum number of new tokens to generate per-chunk. If not set, + the maximum will be set by the default max_length. + chunk_length: The length of audio segments. If it is not None, it will overwrite the + default chunk_length of the FeatureExtractor. + clip_timestamps: + Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to + process. The last end timestamp defaults to the end of the file. + vad_filter will be ignored if clip_timestamps is used. + hallucination_silence_threshold: + When word_timestamps is True, skip silent periods longer than this threshold + (in seconds) when a possible hallucination is detected + hotwords: + Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None. + language_detection_threshold: If the maximum probability of the language tokens is higher + than this value, the language is detected. + language_detection_segments: Number of segments to consider for the language detection. + + Returns: + A tuple with: + + - a generator over transcribed segments + - an instance of TranscriptionInfo + """ + return self.model.transcribe( + audio, + **kwargs, + ) + def _transcribe_audio(self, audio_path): logging.debug(f"Transcribing audio from: {audio_path}") From a0b2358de852af578c56c05cf1961ed7bb09425f Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 13:54:01 -0700 Subject: [PATCH 07/31] update onnx interface --- nexa/gguf/nexa_inference_text.py | 2 +- nexa/onnx/nexa_inference_image.py | 54 +++++++++++++++++-------------- nexa/onnx/nexa_inference_text.py | 20 ++++++------ nexa/onnx/nexa_inference_tts.py | 35 +++++++++++++------- 4 files changed, 64 insertions(+), 47 deletions(-) diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index 4f3e933f..23804abf 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -39,7 +39,7 @@ class NexaTextInference: top_k (int): Top-k sampling parameter. 
top_p (float): Top-p sampling parameter """ - from nexa.gguf.llama import Llama + from nexa.gguf.llama.llama import Llama def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS self.params.update(kwargs) diff --git a/nexa/onnx/nexa_inference_image.py b/nexa/onnx/nexa_inference_image.py index 0a6e5f54..cf676255 100644 --- a/nexa/onnx/nexa_inference_image.py +++ b/nexa/onnx/nexa_inference_image.py @@ -105,15 +105,23 @@ def _dialogue_mode(self): negative_prompt = nexa_prompt( "Enter your negative prompt (press Enter to skip): " ) - self._generate_images(prompt, negative_prompt) + images = self.generate_images(prompt, negative_prompt) + self._save_images(images) except KeyboardInterrupt: print(EXIT_REMINDER) except Exception as e: logging.error(f"Error during text generation: {e}", exc_info=True) - def _generate_images(self, prompt, negative_prompt): + def generate_images(self, prompt, negative_prompt): """ - Generate images based on the given prompt, negative prompt, and parameters. + Used for SDK. Generate images based on the given prompt, negative prompt, and parameters. + + Arg: + prompt (str): Prompt for the image generation. + negative_prompt (str): Negative prompt for the image generation. + + Returns: + list: List of generated images. """ if self.pipeline is None: logging.error("Model not loaded. Exiting.") @@ -121,28 +129,26 @@ def _generate_images(self, prompt, negative_prompt): generator = np.random.RandomState(self.params["random_seed"]) - try: - is_lcm_pipeline = isinstance( - self.pipeline, ORTLatentConsistencyModelPipeline - ) + is_lcm_pipeline = isinstance( + self.pipeline, ORTLatentConsistencyModelPipeline + ) - pipeline_kwargs = { - "prompt": prompt, - "num_inference_steps": self.params["num_inference_steps"], - "num_images_per_prompt": self.params["num_images_per_prompt"], - "height": self.params["height"], - "width": self.params["width"], - "generator": generator, - "guidance_scale": self.params["guidance_scale"], - } - if not is_lcm_pipeline and negative_prompt: - pipeline_kwargs["negative_prompt"] = negative_prompt - - images = self.pipeline(**pipeline_kwargs).images - - self._save_images(images) - except Exception as e: - logging.error(f"Error during image generation: {e}") + pipeline_kwargs = { + "prompt": prompt, + "num_inference_steps": self.params["num_inference_steps"], + "num_images_per_prompt": self.params["num_images_per_prompt"], + "height": self.params["height"], + "width": self.params["width"], + "generator": generator, + "guidance_scale": self.params["guidance_scale"], + } + if not is_lcm_pipeline and negative_prompt: + pipeline_kwargs["negative_prompt"] = negative_prompt + + images = self.pipeline(**pipeline_kwargs).images + return images + + def _save_images(self, images): """ diff --git a/nexa/onnx/nexa_inference_text.py b/nexa/onnx/nexa_inference_text.py index f2f94a3c..56f5c09b 100644 --- a/nexa/onnx/nexa_inference_text.py +++ b/nexa/onnx/nexa_inference_text.py @@ -20,18 +20,18 @@ class NexaTextInference: A class used for load text models and run text generation. Methods: - run: Run the text generation loop. - run_streamlit: Run the Streamlit UI. + run: Run the text generation loop. + run_streamlit: Run the Streamlit UI. Args: - model_path (str): Path or identifier for the model in Nexa Model Hub. - profiling (bool): Enable timing measurements for the generation process. - streamlit (bool): Run the inference in Streamlit UI. - temperature (float): Temperature for sampling. 
- min_new_tokens (int): Minimum number of new tokens to generate. - max_new_tokens (int): Maximum number of new tokens to generate. - top_k (int): Top-k sampling parameter. - top_p (float): Top-p sampling parameter + model_path (str): Path or identifier for the model in Nexa Model Hub. + profiling (bool): Enable timing measurements for the generation process. + streamlit (bool): Run the inference in Streamlit UI. + temperature (float): Temperature for sampling. + min_new_tokens (int): Minimum number of new tokens to generate. + max_new_tokens (int): Maximum number of new tokens to generate. + top_k (int): Top-k sampling parameter. + top_p (float): Top-p sampling parameter """ def __init__(self, model_path, **kwargs): diff --git a/nexa/onnx/nexa_inference_tts.py b/nexa/onnx/nexa_inference_tts.py index e7167ee6..ff7093d6 100644 --- a/nexa/onnx/nexa_inference_tts.py +++ b/nexa/onnx/nexa_inference_tts.py @@ -23,14 +23,14 @@ class NexaTTSInference: A class used for loading text-to-speech models and running text-to-speech generation. Methods: - run: Run the text-to-speech generation loop. - run_streamlit: Run the Streamlit UI. + run: Run the text-to-speech generation loop. + run_streamlit: Run the Streamlit UI. Args: - model_path (str): Path or identifier for the model in Nexa Model Hub. - output_dir (str): Output directory for tts. - sampling_rate (int): Sampling rate for audio processing. - streamlit (bool): Run the inference in Streamlit UI. + model_path (str): Path or identifier for the model in Nexa Model Hub. + output_dir (str): Output directory for tts. + sampling_rate (int): Sampling rate for audio processing. + streamlit (bool): Run the inference in Streamlit UI. """ def __init__(self, model_path, **kwargs): @@ -71,19 +71,30 @@ def run(self): while True: try: user_input = nexa_prompt("Enter text to generate audio: ") - self._audio_generation(user_input) + outputs = self.audio_generation(user_input) + self._save_audio( + outputs[0], self.params["sampling_rate"], self.params["output_path"] + ) + logging.info(f"Audio saved to {self.params['output_path']}") except KeyboardInterrupt: print(EXIT_REMINDER) except Exception as e: logging.error(f"Error during text generation: {e}", exc_info=True) - def _audio_generation(self, user_input): + def audio_generation(self, user_input): + """ + Used for SDK. Generate audio from the user input. + + Args: + user_input (str): User input for audio generation. + + Returns: + np.array: Audio data. 
+ """ inputs = self.tokenizer(user_input) outputs = self.model.run(None, {"text": inputs}) - self._save_audio( - outputs[0], self.params["sampling_rate"], self.params["output_path"] - ) - logging.info(f"Audio saved to {self.params['output_path']}") + return outputs + def _save_audio(self, audio_data, sampling_rate, output_path): os.makedirs(output_path, exist_ok=True) From c9f3d4fd8d232d124818d7f95986588f474a0652 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 13:59:30 -0700 Subject: [PATCH 08/31] move import position --- nexa/gguf/nexa_inference_image.py | 4 ++-- nexa/gguf/nexa_inference_text.py | 3 ++- nexa/gguf/nexa_inference_vlm.py | 4 ++-- nexa/gguf/nexa_inference_voice.py | 3 +-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 473b9f2f..3e2123ad 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -6,7 +6,6 @@ import time from pathlib import Path -from nexa.gguf.sd.stable_diffusion import StableDiffusion from nexa.general import pull_model from nexa.constants import ( DEFAULT_IMG_GEN_PARAMS, @@ -44,7 +43,7 @@ class NexaImageInference: streamlit (bool): Run the inference in Streamlit UI. """ - from nexa.gguf.sd.stable_diffusion import StableDiffusion + def __init__(self, model_path, **kwargs): self.model_path = None @@ -85,6 +84,7 @@ def __init__(self, model_path, **kwargs): @SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): + from nexa.gguf.sd.stable_diffusion import StableDiffusion self.model = StableDiffusion( model_path=self.downloaded_path, lora_model_dir=self.params.get("lora_dir", ""), diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index 23804abf..9c9a3c32 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -39,7 +39,7 @@ class NexaTextInference: top_k (int): Top-k sampling parameter. top_p (float): Top-p sampling parameter """ - from nexa.gguf.llama.llama import Llama + def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS self.params.update(kwargs) @@ -110,6 +110,7 @@ def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() with suppress_stdout_stderr(): + from nexa.gguf.llama.llama import Llama self.model = Llama( model_path=self.downloaded_path, verbose=self.profiling, diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 157bf28e..1e2ab005 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -19,7 +19,6 @@ ) from nexa.general import pull_model from nexa.gguf.lib_utils import is_gpu_available -from nexa.gguf.llama.llama import Llama from nexa.gguf.llama.llama_chat_format import ( Llava15ChatHandler, Llava16ChatHandler, @@ -79,7 +78,7 @@ class NexaVLMInference: top_k (int): Top-k sampling parameter. 
top_p (float): Top-p sampling parameter """ - from nexa.gguf.llama.llama import Llama + def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS @@ -151,6 +150,7 @@ def _load_model(self): if self.projector_downloaded_path else None ) + from nexa.gguf.llama.llama import Llama self.model = Llama( model_path=self.downloaded_path, chat_handler=self.projector, diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index 3ab103d6..372a72f6 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -7,9 +7,8 @@ from nexa.constants import EXIT_REMINDER, NEXA_RUN_MODEL_MAP_VOICE, DEFAULT_VOICE_GEN_PARAMS from nexa.general import pull_model -from nexa.utils import nexa_prompt from faster_whisper import WhisperModel -from nexaai.utils import nexa_prompt, SpinningCursorAnimation, suppress_stdout_stderr +from nexa.utils import nexa_prompt, SpinningCursorAnimation, suppress_stdout_stderr logging.basicConfig(level=logging.INFO) From 5bc20f81687aacafab9ab7592a59576fb380884f Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 14:26:07 -0700 Subject: [PATCH 09/31] update vlm test --- nexa/gguf/__init__.py | 20 +++++++++--------- nexa/gguf/nexa_inference_vlm.py | 37 ++++++++++++++++++++++++++++++++- tests/test_vlm.py | 17 ++++----------- 3 files changed, 50 insertions(+), 24 deletions(-) diff --git a/nexa/gguf/__init__.py b/nexa/gguf/__init__.py index 0001ab54..6ab29ece 100644 --- a/nexa/gguf/__init__.py +++ b/nexa/gguf/__init__.py @@ -1,11 +1,11 @@ -# from .nexa_inference_image import NexaImageInference -# from .nexa_inference_text import NexaTextInference -# from .nexa_inference_vlm import NexaVLMInference -# from .nexa_inference_voice import NexaVoiceInference +from .nexa_inference_image import NexaImageInference +from .nexa_inference_text import NexaTextInference +from .nexa_inference_vlm import NexaVLMInference +from .nexa_inference_voice import NexaVoiceInference -# __all__ = [ -# "NexaImageInference", -# "NexaTextInference", -# "NexaVLMInference", -# "NexaVoiceInference", -# ] \ No newline at end of file +__all__ = [ + "NexaImageInference", + "NexaTextInference", + "NexaVLMInference", + "NexaVoiceInference", +] \ No newline at end of file diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 1e2ab005..63061852 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -219,7 +219,14 @@ def run(self): logging.error(f"Error during generation: {e}", exc_info=True) print("\n") - def create_chat_completion(self, messages, temperature, max_tokens, top_k, top_p, stream, stop): + def create_chat_completion(self, + messages, + max_tokens:int = 2048, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + stream=False, + stop=[]): """ Generate text completion for a given chat prompt. @@ -231,6 +238,34 @@ def create_chat_completion(self, messages, temperature, max_tokens, top_k, top_p top_p (float): Top-p sampling parameter. stream (bool): Stream the output. stop (list): List of stop words for early stopping. 
+ + Returns: + Iterator: An iterator of the generated text completion + return format: + { + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": "The 2020 World Series was played in Texas at Globe Life Field in Arlington.", + "role": "assistant" + }, + "logprobs": null + } + ], + "created": 1677664795, + "id": "chatcmpl-7QyqpwdfhqwajicIEznoc6Q47XAyW", + "model": "gpt-4o-mini", + "object": "chat.completion", + "usage": { + "completion_tokens": 17, + "prompt_tokens": 57, + "total_tokens": 74 + } + } + usage: message = completion.choices[0].message.content + """ return self.model.create_chat_completion( messages=messages, diff --git a/tests/test_vlm.py b/tests/test_vlm.py index 17400bd4..d8977a68 100644 --- a/tests/test_vlm.py +++ b/tests/test_vlm.py @@ -1,8 +1,7 @@ import base64 import os -from nexa.gguf.llama import llama -from nexa.gguf.llama.llama_chat_format import NanoLlavaChatHandler +from nexa.gguf import NexaVLMInference from tests.utils import download_model from nexa.gguf.lib_utils import is_gpu_available import tempfile @@ -23,18 +22,10 @@ def test_image_generation(): model_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/model-fp16.gguf" mmproj_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/projector-fp16.gguf" - model_path = download_model(model_url, temp_dir) - mmproj_path = download_model(mmproj_url, temp_dir) - chat_handler = NanoLlavaChatHandler(clip_model_path=mmproj_path) - - llm = llama.Llama( - model_path=model_path, - chat_handler=chat_handler, - n_ctx=2048, # n_ctx should be increased to accommodate the image embedding - n_gpu_layers=-1 if is_gpu_available() else 0, # Uncomment to use GPU acceleration - verbose=False, + model = NexaVLMInference( + model_path="nanollava", ) - output = llm.create_chat_completion( + output = model.create_chat_completion( messages=[ { "role": "system", From 85d6ed3b48e6ebd185a5926dd543278e50e5e1b1 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 14:57:50 -0700 Subject: [PATCH 10/31] update tests to use sdk interface instead of from scratch --- nexa/gguf/nexa_inference_image.py | 84 +++++++++++++++++++++++-------- nexa/gguf/nexa_inference_text.py | 14 ++---- nexa/gguf/nexa_inference_vlm.py | 2 +- tests/test_image_generation.py | 35 ++++--------- tests/test_text_generation.py | 44 ++++++++-------- tests/test_vlm.py | 19 ------- 6 files changed, 100 insertions(+), 98 deletions(-) diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 3e2123ad..d9d38bfa 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -28,8 +28,8 @@ class NexaImageInference: A class used for loading image models and running image generation. Methods: - run_txt2img: Run the text-to-image generation loop. - run_img2img: Run the image-to-image generation loop. + txt2img: (Used for SDK) Run the text-to-image generation loop. + img2img: (Used for SDK) Run the image-to-image generation loop. run_streamlit: Run the Streamlit UI. Args: @@ -109,7 +109,16 @@ def _save_images(self, images): image.save(file_path) logging.info(f"\nImage {i+1} saved to: {file_path}") - def txt2img(self, prompt, negative_prompt): + def txt2img(self, + prompt, + negative_prompt="", + cfg_scale=7.5, + width=512, + height=512, + sample_steps=20, + seed=0, + control_cond="", + control_strength=0.9): """ Used for SDK. Generate images from text. 
@@ -122,14 +131,14 @@ def txt2img(self, prompt, negative_prompt): """ images = self.model.txt_to_img( prompt=prompt, - negative_prompt=negative_prompt if negative_prompt else "", - cfg_scale=self.params["guidance_scale"], - width=self.params["width"], - height=self.params["height"], - sample_steps=self.params["num_inference_steps"], - seed=self.params["random_seed"], - control_cond=self.params.get("control_image_path", ""), - control_strength=self.params.get("control_strength", 0.9), + negative_prompt=negative_prompt, + cfg_scale=cfg_scale, + width=width, + height=height, + sample_steps=sample_steps, + seed=seed, + control_cond=control_cond, + control_strength=control_strength, ) return images @@ -141,7 +150,17 @@ def run_txt2img(self): "Enter your negative prompt (press Enter to skip): " ) try: - images = self.txt2img(prompt, negative_prompt) + images = self.txt2img( + prompt, + negative_prompt, + cfg_scale=self.params["guidance_scale"], + width=self.params["width"], + height=self.params["height"], + sample_steps=self.params["num_inference_steps"], + seed=self.params["random_seed"], + control_cond=self.params.get("control_image_path", ""), + control_strength=self.params.get("control_strength", 0.9), + ) self._save_images(images) except Exception as e: logging.error(f"Error during text to image generation: {e}") @@ -150,7 +169,17 @@ def run_txt2img(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) - def img2img(self, image_path, prompt, negative_prompt): + def img2img(self, + image_path, + prompt, + negative_prompt="", + cfg_scale=7.5, + width=512, + height=512, + sample_steps=20, + seed=0, + control_cond="", + control_strength=0.9): """ Used for SDK. Generate images from an image. @@ -165,14 +194,14 @@ def img2img(self, image_path, prompt, negative_prompt): images = self.model.img_to_img( image=image_path, prompt=prompt, - negative_prompt=negative_prompt if negative_prompt else "", - cfg_scale=self.params["guidance_scale"], - width=self.params["width"], - height=self.params["height"], - sample_steps=self.params["num_inference_steps"], - seed=self.params["random_seed"], - control_cond=self.params.get("control_image_path", ""), - control_strength=self.params.get("control_strength", 0.9), + negative_prompt=negative_prompt, + cfg_scale=cfg_scale, + width=width, + height=height, + sample_steps=sample_steps, + seed=seed, + control_cond=control_cond, + control_strength=control_strength, ) return images @@ -184,7 +213,18 @@ def run_img2img(self): negative_prompt = nexa_prompt( "Enter your negative prompt (press Enter to skip): " ) - images = self.img2img(image_path, prompt, negative_prompt) + images = self.img2img(image_path, + prompt, + negative_prompt, + cfg_scale=self.params["guidance_scale"], + width=self.params["width"], + height=self.params["height"], + sample_steps=self.params["num_inference_steps"], + seed=self.params["random_seed"], + control_cond=self.params.get("control_image_path", ""), + control_strength=self.params.get("control_strength", 0.9), + ) + self._save_images(images) except KeyboardInterrupt: print(EXIT_REMINDER) diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index 9c9a3c32..324f0811 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -31,6 +31,7 @@ class NexaTextInference: Args: model_path (str): Path or identifier for the model in Nexa Model Hub. + embedding (bool): Enable embedding generation. stop_words (list): List of stop words for early stopping. 
profiling (bool): Enable timing measurements for the generation process. streamlit (bool): Run the inference in Streamlit UI. @@ -83,27 +84,19 @@ def __init__(self, model_path, stop_words=None, **kwargs): "Failed to load model or tokenizer. Exiting.", exc_info=True ) exit(1) - def embed( + def create_embedding( self, input: Union[str, List[str]], - normalize: bool = False, - truncate: bool = True, - return_count: bool = False, ): """Embed a string. Args: input: The utf-8 encoded string or a list of string to embed. - normalize: whether to normalize embedding in embedding dimension. - trunca - truncate: whether to truncate tokens to window length before generating embedding. - return count: if true, return (embedding, count) tuple. else return embedding only. - Returns: A list of embeddings """ - return self.model.embed(input, normalize, truncate, return_count) + return self.model.create_embedding(input) @SpinningCursorAnimation() def _load_model(self): @@ -112,6 +105,7 @@ def _load_model(self): with suppress_stdout_stderr(): from nexa.gguf.llama.llama import Llama self.model = Llama( + embedding=self.params.get("embedding", False), model_path=self.downloaded_path, verbose=self.profiling, chat_format=self.chat_format, diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 63061852..b4cd0f5c 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -138,7 +138,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/tests/test_image_generation.py b/tests/test_image_generation.py index 6c9d5b21..7e749dc6 100644 --- a/tests/test_image_generation.py +++ b/tests/test_image_generation.py @@ -1,47 +1,34 @@ -import os -from nexa.gguf.sd import stable_diffusion -from tests.utils import download_model +from nexa.gguf import NexaImageInference from tempfile import TemporaryDirectory +from .utils import download_model -# Constants -STABLE_DIFFUSION_URL = "https://huggingface.co/second-state/stable-diffusion-v-1-4-GGUF/resolve/main/stable-diffusion-v1-4-Q4_0.gguf" -IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" -OUTPUT_DIR = os.getcwd() -MODEL_PATH = download_model(STABLE_DIFFUSION_URL, OUTPUT_DIR) +sd = NexaImageInference( + model_path="sd1-4", + wtype="q4_0", +) -# Print the model path -print("Model downloaded to:", MODEL_PATH) - -# Helper function for Stable Diffusion initialization -def init_stable_diffusion(): - return stable_diffusion.StableDiffusion( - model_path=MODEL_PATH, - wtype="q4_0" # Weight type (options: default, f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0) - ) - # Test text-to-image generation def test_txt_to_img(): - sd = init_stable_diffusion() - output = sd.txt_to_img("a lovely cat", width=128, height=128, sample_steps=2) + global sd + output = sd.txt2img("a lovely cat", width=128, height=128, sample_steps=2) output[0].save("output_txt_to_img.png") # Test image-to-image generation def test_img_to_img(): - sd = init_stable_diffusion() + global sd img_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" with TemporaryDirectory() as temp_dir: img_path = download_model(img_url, temp_dir) - output = 
sd.img_to_img( - image=img_path, + output = sd.img2img( + image_path=img_path, prompt="blue sky", width=128, height=128, negative_prompt="black soil", sample_steps=2 ) - output[0].save("output_img_to_img.png") # Main execution # if __name__ == "__main__": diff --git a/tests/test_text_generation.py b/tests/test_text_generation.py index 04782a21..e3ceed30 100644 --- a/tests/test_text_generation.py +++ b/tests/test_text_generation.py @@ -1,36 +1,28 @@ -import os -from nexa.gguf.llama import llama -from tests.utils import download_model +from nexa.gguf import NexaTextInference from nexa.gguf.lib_utils import is_gpu_available -# Constants -TINY_LLAMA_URL = "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_0.gguf" -OUTPUT_DIR = os.getcwd() -MODEL_PATH = download_model(TINY_LLAMA_URL, OUTPUT_DIR) -# Initialize Llama model -def init_llama_model(verbose=False, n_gpu_layers=-1, chat_format=None, embedding=False): - return llama.Llama( - model_path=MODEL_PATH, - verbose=verbose, - n_gpu_layers=n_gpu_layers if is_gpu_available() else 0, - chat_format=chat_format, - embedding=embedding, - ) +model = NexaTextInference( + model_path="gemma", + verbose=False, + n_gpu_layers=-1 if is_gpu_available() else 0, + chat_format="llama-2", +) # Test text generation from a prompt def test_text_generation(): - model = init_llama_model() - output = model( + global model + output = model.create_completion( "Q: Name the planets in the solar system? A: ", max_tokens=512, stop=["Q:", "\n"], echo=True, ) - print(output) + # print(output) + # TODO: add assertions here # Test chat completion in streaming mode def test_streaming(): - model = init_llama_model() + global model output = model.create_completion( "Q: Name the planets in the solar system? 
A: ", max_tokens=512, @@ -40,10 +32,12 @@ def test_streaming(): for chunk in output: if "choices" in chunk: print(chunk["choices"][0]["text"], end="", flush=True) + # TODO: add assertions here # Test conversation mode with chat format def test_create_chat_completion(): - model = init_llama_model(chat_format="llama-2") + global model + output = model.create_chat_completion( messages=[ {"role": "user", "content": "write a long 1000 word story about a detective"} @@ -58,7 +52,13 @@ def test_create_chat_completion(): print(delta["content"], end="", flush=True) def test_create_embedding(): - model = init_llama_model(embedding=True) + model = NexaTextInference( + model_path="gemma", + verbose=False, + n_gpu_layers=-1 if is_gpu_available() else 0, + chat_format="llama-2", + embedding=True, + ) embeddings = model.create_embedding("Hello, world!") print("Embeddings:\n", embeddings) diff --git a/tests/test_vlm.py b/tests/test_vlm.py index d8977a68..2c863146 100644 --- a/tests/test_vlm.py +++ b/tests/test_vlm.py @@ -1,27 +1,8 @@ -import base64 -import os - from nexa.gguf import NexaVLMInference -from tests.utils import download_model -from nexa.gguf.lib_utils import is_gpu_available import tempfile -def image_to_base64_data_uri(file_path): - """ - file_path = 'file_path.png' - data_uri = image_to_base64_data_uri(file_path) - """ - with open(file_path, "rb") as img_file: - base64_data = base64.b64encode(img_file.read()).decode("utf-8") - return f"data:image/png;base64,{base64_data}" - - def test_image_generation(): with tempfile.TemporaryDirectory() as temp_dir: - temp_dir = os.path.dirname(os.path.abspath(__file__)) - model_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/model-fp16.gguf" - mmproj_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/projector-fp16.gguf" - model = NexaVLMInference( model_path="nanollava", ) From be4520d6e91bedb995c1353b09807df0bb08bbe8 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 14:58:05 -0700 Subject: [PATCH 11/31] revert vlm --- nexa/gguf/nexa_inference_vlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index b4cd0f5c..63061852 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -138,7 +138,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() From fee9553cb6f3f356972dcdb73af31b4696d5d105 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 15:05:04 -0700 Subject: [PATCH 12/31] use ubuntu instead --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0e17e44d..bde784ed 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -10,7 +10,7 @@ on: jobs: build: - runs-on: macos-latest + runs-on: ubuntu-latest steps: - name: Checkout code From 1bae4e6b545b110530af94299e1007958bb19079 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 15:14:47 -0700 Subject: [PATCH 13/31] remove cursor to try --- .github/workflows/ci.yaml | 1 - nexa/gguf/nexa_inference_image.py | 2 +- nexa/gguf/nexa_inference_text.py | 2 +- nexa/gguf/nexa_inference_vlm.py | 2 +- nexa/gguf/nexa_inference_voice.py | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) diff --git 
a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index bde784ed..3bdcd138 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -28,7 +28,6 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install numpy --upgrade python -m pip install build pytest - name: Build DLL run: | diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index d9d38bfa..494374d7 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs): logging.error("Failed to load the model or pipeline.") exit(1) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): from nexa.gguf.sd.stable_diffusion import StableDiffusion diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index 324f0811..e707cf1a 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -98,7 +98,7 @@ def create_embedding( """ return self.model.create_embedding(input) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 63061852..b4cd0f5c 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -138,7 +138,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index 372a72f6..88856fa8 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -66,7 +66,7 @@ def __init__(self, model_path, **kwargs): exit(1) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from: {self.downloaded_path}") with suppress_stdout_stderr(): From 5ddc656c3dbf75c56e3350775e76c6ef246b8587 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 15:33:26 -0700 Subject: [PATCH 14/31] fix --- nexa/gguf/nexa_inference_text.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index c050a465..fa59e7ee 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -106,6 +106,7 @@ def _load_model(self): try: from nexa.gguf.llama.llama import Llama self.model = Llama( + embedding=self.params.get("embedding", False), model_path=self.downloaded_path, verbose=self.profiling, chat_format=self.chat_format, From d2b119c0fafab3721fb24ae609fa4e0dcb4a60c7 Mon Sep 17 00:00:00 2001 From: Yu Xing Date: Wed, 21 Aug 2024 23:07:44 +0000 Subject: [PATCH 15/31] png should use smaller one --- tests/test_vlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_vlm.py b/tests/test_vlm.py index 2c863146..57dff975 100644 --- a/tests/test_vlm.py +++ b/tests/test_vlm.py @@ -19,7 +19,7 @@ def test_image_generation(): { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "url": 
"https://www.wikipedia.org/portal/wikipedia.org/assets/img/Wikipedia-logo-v2.png" }, }, ], From 4205236733bd6bec10d8843d0ac2f80c86e4f474 Mon Sep 17 00:00:00 2001 From: Yu Xing Date: Wed, 21 Aug 2024 23:26:31 +0000 Subject: [PATCH 16/31] use engine interface to test --- tests/test_vlm.py | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/tests/test_vlm.py b/tests/test_vlm.py index 57dff975..b7a450f2 100644 --- a/tests/test_vlm.py +++ b/tests/test_vlm.py @@ -1,32 +1,17 @@ from nexa.gguf import NexaVLMInference import tempfile +from .utils import download_model def test_image_generation(): with tempfile.TemporaryDirectory() as temp_dir: model = NexaVLMInference( model_path="nanollava", ) - output = model.create_chat_completion( - messages=[ - { - "role": "system", - "content": "You are an assistant who perfectly describes images.", - }, - { - "role": "user", - "content": [ - {"type": "text", "text": "What's in this image?"}, - { - "type": "image_url", - "image_url": { - "url": "https://www.wikipedia.org/portal/wikipedia.org/assets/img/Wikipedia-logo-v2.png" - }, - }, - ], - }, - ], - stream=True, + image_path = download_model( + "https://www.wikipedia.org/portal/wikipedia.org/assets/img/Wikipedia-logo-v2.png", + temp_dir, ) + output = model._chat("what's in this image?", image_path) for chunk in output: delta = chunk["choices"][0]["delta"] if "role" in delta: From 9a78395f6d8816afc9a0a8643463e8c49fe4e60c Mon Sep 17 00:00:00 2001 From: Yu Xing Date: Wed, 21 Aug 2024 23:27:52 +0000 Subject: [PATCH 17/31] revert cursor and use interface --- nexa/gguf/nexa_inference_image.py | 2 +- nexa/gguf/nexa_inference_text.py | 2 +- nexa/gguf/nexa_inference_vlm.py | 2 +- nexa/gguf/nexa_inference_voice.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 494374d7..d9d38bfa 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs): logging.error("Failed to load the model or pipeline.") exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): from nexa.gguf.sd.stable_diffusion import StableDiffusion diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index fa59e7ee..c93d3519 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -98,7 +98,7 @@ def create_embedding( """ return self.model.create_embedding(input) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 27c057be..47e54786 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -146,7 +146,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index fc8034e3..b6437442 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -69,7 +69,7 @@ def __init__(self, model_path, **kwargs): exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): 
from faster_whisper import WhisperModel From 5a2e00a384e7395e2eaca4b3a8a81d559157d750 Mon Sep 17 00:00:00 2001 From: Yu Xing Date: Wed, 21 Aug 2024 23:37:19 +0000 Subject: [PATCH 18/31] remove spin --- nexa/gguf/nexa_inference_image.py | 2 +- nexa/gguf/nexa_inference_text.py | 2 +- nexa/gguf/nexa_inference_vlm.py | 2 +- nexa/gguf/nexa_inference_voice.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index d9d38bfa..279923a0 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs): logging.error("Failed to load the model or pipeline.") exit(1) - @SpinningCursorAnimation() + #@SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): from nexa.gguf.sd.stable_diffusion import StableDiffusion diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index c93d3519..7395093e 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -98,7 +98,7 @@ def create_embedding( """ return self.model.create_embedding(input) - @SpinningCursorAnimation() + #@SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 47e54786..63b8f091 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -146,7 +146,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - @SpinningCursorAnimation() + #@SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index b6437442..92eeaee5 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -69,7 +69,7 @@ def __init__(self, model_path, **kwargs): exit(1) - @SpinningCursorAnimation() + #@SpinningCursorAnimation() def _load_model(self): from faster_whisper import WhisperModel From c1809a3380edc10edb93ed8b9bb29744b9ec5fcf Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 16:49:44 -0700 Subject: [PATCH 19/31] around it now --- nexa/gguf/nexa_inference_image.py | 2 +- nexa/gguf/nexa_inference_text.py | 2 +- nexa/gguf/nexa_inference_vlm.py | 2 +- nexa/gguf/nexa_inference_voice.py | 2 +- tests/{test_vlm.py => vlm_test.py} | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename tests/{test_vlm.py => vlm_test.py} (100%) diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 279923a0..d9d38bfa 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs): logging.error("Failed to load the model or pipeline.") exit(1) - #@SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): from nexa.gguf.sd.stable_diffusion import StableDiffusion diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index 7395093e..c93d3519 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -98,7 +98,7 @@ def create_embedding( """ return self.model.create_embedding(input) - #@SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from 
{self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 63b8f091..47e54786 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -146,7 +146,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - #@SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index 92eeaee5..b6437442 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -69,7 +69,7 @@ def __init__(self, model_path, **kwargs): exit(1) - #@SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): from faster_whisper import WhisperModel diff --git a/tests/test_vlm.py b/tests/vlm_test.py similarity index 100% rename from tests/test_vlm.py rename to tests/vlm_test.py From 48625897b917ebecce6b4d1bfd600376e4e4b49b Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 17:03:07 -0700 Subject: [PATCH 20/31] remove cursor for now --- nexa/gguf/nexa_inference_image.py | 2 +- nexa/gguf/nexa_inference_text.py | 2 +- nexa/gguf/nexa_inference_vlm.py | 2 +- nexa/gguf/nexa_inference_voice.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index d9d38bfa..494374d7 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs): logging.error("Failed to load the model or pipeline.") exit(1) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): from nexa.gguf.sd.stable_diffusion import StableDiffusion diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index c93d3519..fa59e7ee 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -98,7 +98,7 @@ def create_embedding( """ return self.model.create_embedding(input) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 47e54786..27c057be 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -146,7 +146,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index b6437442..fc8034e3 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -69,7 +69,7 @@ def __init__(self, model_path, **kwargs): exit(1) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self): from faster_whisper import WhisperModel From 64d9b62e26259b5873a3bc55b1a4c5ce08841c37 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 17:16:48 -0700 Subject: [PATCH 21/31] remove vlm test --- tests/vlm_test.py | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 tests/vlm_test.py diff --git a/tests/vlm_test.py b/tests/vlm_test.py deleted file mode 100644 index 
b7a450f2..00000000 --- a/tests/vlm_test.py +++ /dev/null @@ -1,25 +0,0 @@ -from nexa.gguf import NexaVLMInference -import tempfile -from .utils import download_model - -def test_image_generation(): - with tempfile.TemporaryDirectory() as temp_dir: - model = NexaVLMInference( - model_path="nanollava", - ) - image_path = download_model( - "https://www.wikipedia.org/portal/wikipedia.org/assets/img/Wikipedia-logo-v2.png", - temp_dir, - ) - output = model._chat("what's in this image?", image_path) - for chunk in output: - delta = chunk["choices"][0]["delta"] - if "role" in delta: - print(delta["role"], end=": ") - elif "content" in delta: - print(delta["content"], end="") - - -# if __name__ == "__main__": -# print("=== Testing 1 ===") -# test1() From 7ee0ca205e735588b31665fcfb543e033b31f268 Mon Sep 17 00:00:00 2001 From: Zack Zhiyuan Li Date: Wed, 21 Aug 2024 21:07:20 -0700 Subject: [PATCH 22/31] wip --- tomls/pyproject_cuda.toml | 8 ++++++-- tomls/pyproject_metal.toml | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tomls/pyproject_cuda.toml b/tomls/pyproject_cuda.toml index 008ea1a0..4233cd49 100644 --- a/tomls/pyproject_cuda.toml +++ b/tomls/pyproject_cuda.toml @@ -81,9 +81,13 @@ build.verbose = true cmake.build-type = "Release" cmake.version = ">=3.16" cmake.args = [ - "-DGGML_CUDA=ON -DSD_CUBLAS=ON -DCMAKE_CUDA_ARCHITECTURES=all", + "-DGGML_CUDA=ON", + "-DSD_CUBLAS=ON", + "-DCMAKE_CUDA_ARCHITECTURES=all", "-DGGML_CUDA_FORCE_MMQ=ON", - "-DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF", + "-DGGML_AVX2=OFF", + "-DGGML_FMA=OFF", + "-DGGML_F16C=OFF" ] [tool.pytest.ini_options] diff --git a/tomls/pyproject_metal.toml b/tomls/pyproject_metal.toml index c895998d..6154b613 100644 --- a/tomls/pyproject_metal.toml +++ b/tomls/pyproject_metal.toml @@ -81,8 +81,10 @@ build.verbose = true cmake.build-type = "Release" cmake.version = ">=3.16" cmake.args = [ - "-DGGML_METAL=ON -DSD_METAL=ON", - "-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64", + "-DGGML_METAL=ON" + "-DSD_METAL=ON", + "-DCMAKE_OSX_ARCHITECTURES=arm64", + "-DCMAKE_APPLE_SILICON_PROCESSOR=arm64" ] [tool.pytest.ini_options] From b826e2b622bf44dda961caccf7a4cedf97ee726e Mon Sep 17 00:00:00 2001 From: Zack Zhiyuan Li Date: Wed, 21 Aug 2024 21:11:00 -0700 Subject: [PATCH 23/31] wip --- tomls/pyproject_metal.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tomls/pyproject_metal.toml b/tomls/pyproject_metal.toml index 6154b613..584ebf07 100644 --- a/tomls/pyproject_metal.toml +++ b/tomls/pyproject_metal.toml @@ -81,7 +81,7 @@ build.verbose = true cmake.build-type = "Release" cmake.version = ">=3.16" cmake.args = [ - "-DGGML_METAL=ON" + "-DGGML_METAL=ON", "-DSD_METAL=ON", "-DCMAKE_OSX_ARCHITECTURES=arm64", "-DCMAKE_APPLE_SILICON_PROCESSOR=arm64" From 726c1277278cdbf093be3a9c54a08bcb2895305e Mon Sep 17 00:00:00 2001 From: Ethan Wang Date: Wed, 21 Aug 2024 22:30:55 +0000 Subject: [PATCH 24/31] fix: SpinningCursorAnimation can work on windows now --- nexa/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nexa/utils.py b/nexa/utils.py index 33499a27..1e2a5b64 100644 --- a/nexa/utils.py +++ b/nexa/utils.py @@ -132,7 +132,10 @@ def _spin(self): def __enter__(self): if self._use_alternate_stream: - self.stream = open("/dev/tty", "w") + if sys.platform == "win32": # Windows + self.stream = open('CONOUT$', "w") + else: + self.stream = open('/dev/tty', "w") self.thread = threading.Thread(target=self._spin) self.thread.start() return self From 
32e763acf5ec3b63a67c4f44e969130b289667b2 Mon Sep 17 00:00:00 2001 From: Ethan Wang Date: Thu, 22 Aug 2024 03:13:32 +0000 Subject: [PATCH 25/31] fix: /dev/tty -> /dev/stdout --- nexa/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nexa/utils.py b/nexa/utils.py index 1e2a5b64..985dea56 100644 --- a/nexa/utils.py +++ b/nexa/utils.py @@ -135,7 +135,10 @@ def __enter__(self): if sys.platform == "win32": # Windows self.stream = open('CONOUT$', "w") else: - self.stream = open('/dev/tty', "w") + try: + self.stream = open('/dev/tty', "w") + except FileNotFoundError: + self.stream = open('/dev/stdout', "w") self.thread = threading.Thread(target=self._spin) self.thread.start() return self From 4085284b58fdec8596a341c9ef23836e50df415c Mon Sep 17 00:00:00 2001 From: Ethan Wang Date: Thu, 22 Aug 2024 03:15:57 +0000 Subject: [PATCH 26/31] fix: make sd and llama can use CUDA at the same time --- CMakeLists.txt | 104 ++++++++++++++++++------------------- tomls/pyproject_cuda.toml | 2 +- tomls/pyproject_metal.toml | 2 +- 3 files changed, 54 insertions(+), 54 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5de11a78..8e2be6d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,57 @@ cmake_minimum_required(VERSION 3.16) +# Project: stable_diffusion_cpp +project(stable_diffusion_cpp) + +option(STABLE_DIFFUSION_BUILD "Build stable-diffusion.cpp shared library and install alongside python package" ON) + +if (STABLE_DIFFUSION_BUILD) + set(BUILD_SHARED_LIBS "ON") + option(SD_BUILD_SHARED_LIBS "" "ON") + + # Building llama + if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") + # Need to disable these llama.cpp flags on Apple x86_64, + # otherwise users may encounter invalid instruction errors + set(GGML_AVX "Off" CACHE BOOL "ggml: enable AVX" FORCE) + set(GGML_AVX2 "Off" CACHE BOOL "ggml: enable AVX2" FORCE) + set(GGML_FMA "Off" CACHE BOOL "ggml: enable FMA" FORCE) + set(GGML_F16C "Off" CACHE BOOL "ggml: enable F16C" FORCE) + endif() + + add_subdirectory(dependency/stable-diffusion.cpp) + install( + TARGETS stable-diffusion + LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + ) + + message(STATUS "SKBUILD_PLATLIB_DIR: ${SKBUILD_PLATLIB_DIR}") + # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374 + install( + TARGETS stable-diffusion + LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + ) + # Workaround for Windows + CUDA + if (WIN32) + install( + FILES $ + DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + ) + install( + FILES $ + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + ) + endif() +endif() + # Project: llama_cpp project(llama_cpp) @@ -122,55 +174,3 @@ if (LLAMA_BUILD) endif() endif() endif() - -# Project: stable_diffusion_cpp -project(stable_diffusion_cpp) - -option(STABLE_DIFFUSION_BUILD "Build stable-diffusion.cpp shared library and install alongside python package" ON) - -if (STABLE_DIFFUSION_BUILD) - set(BUILD_SHARED_LIBS "ON") - option(SD_BUILD_SHARED_LIBS "" "ON") - - # Building llama - if (APPLE 
AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") - # Need to disable these llama.cpp flags on Apple x86_64, - # otherwise users may encounter invalid instruction errors - set(GGML_AVX "Off" CACHE BOOL "ggml: enable AVX" FORCE) - set(GGML_AVX2 "Off" CACHE BOOL "ggml: enable AVX2" FORCE) - set(GGML_FMA "Off" CACHE BOOL "ggml: enable FMA" FORCE) - set(GGML_F16C "Off" CACHE BOOL "ggml: enable F16C" FORCE) - endif() - - add_subdirectory(dependency/stable-diffusion.cpp) - install( - TARGETS stable-diffusion - LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib - RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib - ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib - FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib - RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib - ) - - message(STATUS "SKBUILD_PLATLIB_DIR: ${SKBUILD_PLATLIB_DIR}") - # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374 - install( - TARGETS stable-diffusion - LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib - RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib - ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib - FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib - RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib - ) - # Workaround for Windows + CUDA - if (WIN32) - install( - FILES $ - DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib - ) - install( - FILES $ - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib - ) - endif() -endif() \ No newline at end of file diff --git a/tomls/pyproject_cuda.toml b/tomls/pyproject_cuda.toml index 4233cd49..69c93f58 100644 --- a/tomls/pyproject_cuda.toml +++ b/tomls/pyproject_cuda.toml @@ -81,8 +81,8 @@ build.verbose = true cmake.build-type = "Release" cmake.version = ">=3.16" cmake.args = [ - "-DGGML_CUDA=ON", "-DSD_CUBLAS=ON", + "-DGGML_CUDA=ON", "-DCMAKE_CUDA_ARCHITECTURES=all", "-DGGML_CUDA_FORCE_MMQ=ON", "-DGGML_AVX2=OFF", diff --git a/tomls/pyproject_metal.toml b/tomls/pyproject_metal.toml index 584ebf07..a14b1155 100644 --- a/tomls/pyproject_metal.toml +++ b/tomls/pyproject_metal.toml @@ -81,8 +81,8 @@ build.verbose = true cmake.build-type = "Release" cmake.version = ">=3.16" cmake.args = [ - "-DGGML_METAL=ON", "-DSD_METAL=ON", + "-DGGML_METAL=ON", "-DCMAKE_OSX_ARCHITECTURES=arm64", "-DCMAKE_APPLE_SILICON_PROCESSOR=arm64" ] From 9a01788e1f94b226dd768a55b7d304b842103434 Mon Sep 17 00:00:00 2001 From: Ethan Wang Date: Thu, 22 Aug 2024 03:16:19 +0000 Subject: [PATCH 27/31] add --index-url for installing pre-built wheels --- README.md | 19 ++++++++++++++++--- nexa/gguf/nexa_inference_image.py | 24 ++++++++++++------------ nexa/gguf/nexa_inference_text.py | 10 +++++----- nexa/gguf/nexa_inference_vlm.py | 24 ++++++++++++------------ nexa/gguf/nexa_inference_voice.py | 6 +++--- 5 files changed, 48 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index b1716169..939d93db 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Detailed API documentation is available [here](docs/index.html). ## Installation -**GPU version(optional)** +**GPU version(optional)** check if you have GPU acceleration (torch required)
@@ -40,16 +40,24 @@ check if you have GPU acceleration (torch required) ``` CMAKE_ARGS="-DGGML_CUDA=on -DSD_CUBLAS=ON" pip install nexaai-gpu ``` + Or you prefer to install our pre-built wheel: + ```bash + pip install nexaai-cuda --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple + ```
Apple M Chip: Apple icon -> about this mac -> Graphics - + if True: ``` CMAKE_ARGS="-DGGML_METAL=on -DSD_METAL=ON" pip install nexaai-gpu ``` + Or you prefer to install our pre-built wheel: + ```bash + pip install nexaai-metal --index-url https://nexaai.github.io/nexa-sdk/whl/metal --extra-index-url https://pypi.org/simple + ```
@@ -77,7 +85,12 @@ check if you have GPU acceleration (torch required) ``` pip install nexaai ``` -
+
+ +Or you prefer to install the pre-built wheel: +```bash +pip install nexaai --index-url https://nexaai.github.io/nexa-sdk/whl/cpu --extra-index-url https://pypi.org/simple +``` ## Nexa CLI commands diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 494374d7..8c85645a 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -43,7 +43,7 @@ class NexaImageInference: streamlit (bool): Run the inference in Streamlit UI. """ - + def __init__(self, model_path, **kwargs): self.model_path = None @@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs): logging.error("Failed to load the model or pipeline.") exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): from nexa.gguf.sd.stable_diffusion import StableDiffusion @@ -108,9 +108,9 @@ def _save_images(self, images): file_path = os.path.join(output_dir, file_name) image.save(file_path) logging.info(f"\nImage {i+1} saved to: {file_path}") - - def txt2img(self, - prompt, + + def txt2img(self, + prompt, negative_prompt="", cfg_scale=7.5, width=512, @@ -151,7 +151,7 @@ def run_txt2img(self): ) try: images = self.txt2img( - prompt, + prompt, negative_prompt, cfg_scale=self.params["guidance_scale"], width=self.params["width"], @@ -169,9 +169,9 @@ def run_txt2img(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) - def img2img(self, - image_path, - prompt, + def img2img(self, + image_path, + prompt, negative_prompt="", cfg_scale=7.5, width=512, @@ -213,8 +213,8 @@ def run_img2img(self): negative_prompt = nexa_prompt( "Enter your negative prompt (press Enter to skip): " ) - images = self.img2img(image_path, - prompt, + images = self.img2img(image_path, + prompt, negative_prompt, cfg_scale=self.params["guidance_scale"], width=self.params["width"], @@ -224,7 +224,7 @@ def run_img2img(self): control_cond=self.params.get("control_image_path", ""), control_strength=self.params.get("control_strength", 0.9), ) - + self._save_images(images) except KeyboardInterrupt: print(EXIT_REMINDER) diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index fa59e7ee..2760d5d1 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -40,7 +40,7 @@ class NexaTextInference: top_k (int): Top-k sampling parameter. top_p (float): Top-p sampling parameter """ - + def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS self.params.update(kwargs) @@ -98,7 +98,7 @@ def create_embedding( """ return self.model.create_embedding(input) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() @@ -140,7 +140,7 @@ def _load_model(self): def run(self): """ - CLI interactive session. Not for SDK. + CLI interactive session. Not for SDK. """ while True: generated_text = "" @@ -189,7 +189,7 @@ def run(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) print("\n") - + def create_chat_completion(self, messages, temperature=0.7, max_tokens=2048, top_k=50, top_p=1.0, stream=False, stop=None): """ Used for SDK. Generate completion for a chat conversation. @@ -207,7 +207,7 @@ def create_chat_completion(self, messages, temperature=0.7, max_tokens=2048, top Iterator: Iterator for the completion. 
""" return self.model.create_chat_completion(messages=messages, temperature=temperature, max_tokens=max_tokens, top_k=top_k, top_p=top_p, stream=stream, stop=stop) - + def create_completion(self, prompt, temperature=0.7, max_tokens=2048, top_k=50, top_p=1.0, echo=False, stream=False, stop=None): """ Used for SDK. Generate completion for a given prompt. diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 27c057be..e5627ffc 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -86,7 +86,7 @@ class NexaVLMInference: top_k (int): Top-k sampling parameter. top_p (float): Top-p sampling parameter """ - + def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS @@ -146,7 +146,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() @@ -240,18 +240,18 @@ def run(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) print("\n") - - def create_chat_completion(self, - messages, - max_tokens:int = 2048, + + def create_chat_completion(self, + messages, + max_tokens:int = 2048, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, - stream=False, + stream=False, stop=[]): """ Generate text completion for a given chat prompt. - + Args: messages (list): List of messages in the chat prompt. temperature (float): Temperature for sampling. @@ -260,7 +260,7 @@ def create_chat_completion(self, top_p (float): Top-p sampling parameter. stream (bool): Stream the output. stop (list): List of stop words for early stopping. - + Returns: Iterator: An iterator of the generated text completion return format: @@ -285,9 +285,9 @@ def create_chat_completion(self, "prompt_tokens": 57, "total_tokens": 74 } - } - usage: message = completion.choices[0].message.content - + } + usage: message = completion.choices[0].message.content + """ return self.model.create_chat_completion( messages=messages, diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index fc8034e3..f61f872c 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -69,7 +69,7 @@ def __init__(self, model_path, **kwargs): exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): from faster_whisper import WhisperModel @@ -91,7 +91,7 @@ def run(self): print(EXIT_REMINDER) except Exception as e: logging.error(f"Error during text generation: {e}", exc_info=True) - + def transcribe(self, audio, **kwargs): """ Transcribe the audio file. 
@@ -171,7 +171,7 @@ def transcribe(self, audio, **kwargs): audio, **kwargs, ) - + def _transcribe_audio(self, audio_path): logging.debug(f"Transcribing audio from: {audio_path}") From e8f19c34b5b0a1e7420fe9866b6119d0fc380fd3 Mon Sep 17 00:00:00 2001 From: Ethan Wang Date: Thu, 22 Aug 2024 04:47:11 +0000 Subject: [PATCH 28/31] last try --- nexa/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexa/utils.py b/nexa/utils.py index 985dea56..2483582f 100644 --- a/nexa/utils.py +++ b/nexa/utils.py @@ -137,7 +137,7 @@ def __enter__(self): else: try: self.stream = open('/dev/tty', "w") - except FileNotFoundError: + except (FileNotFoundError, OSError): self.stream = open('/dev/stdout', "w") self.thread = threading.Thread(target=self._spin) self.thread.start() From b2ba114d20618d7ababf464b4c3f4eae61e73551 Mon Sep 17 00:00:00 2001 From: Yu Xing Date: Thu, 22 Aug 2024 05:00:18 +0000 Subject: [PATCH 29/31] unified cpu and gpu --- CMakeLists.txt | 10 +++++ nexa/gguf/lib_utils.py | 4 +- tomls/pyproject_cuda.toml | 90 -------------------------------------- tomls/pyproject_metal.toml | 89 ------------------------------------- 4 files changed, 13 insertions(+), 180 deletions(-) delete mode 100644 tomls/pyproject_cuda.toml delete mode 100644 tomls/pyproject_metal.toml diff --git a/CMakeLists.txt b/CMakeLists.txt index 5de11a78..7ea5f139 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,15 @@ cmake_minimum_required(VERSION 3.16) +if (GGML_CUDA OR GGML_METAL) + set(EMPTY_FILE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib/empty_file.txt") + add_custom_command( + OUTPUT ${EMPTY_FILE_PATH} + COMMAND ${CMAKE_COMMAND} -E touch ${EMPTY_FILE_PATH} + COMMENT "Creating an empty file because MY_FEATURE is ON" + ) + add_custom_target(create_empty_file ALL DEPENDS ${EMPTY_FILE_PATH}) +endif() + # Project: llama_cpp project(llama_cpp) diff --git a/nexa/gguf/lib_utils.py b/nexa/gguf/lib_utils.py index fe1fa7bc..ff2d887f 100644 --- a/nexa/gguf/lib_utils.py +++ b/nexa/gguf/lib_utils.py @@ -15,7 +15,9 @@ def is_gpu_available(): - return is_nexa_cuda_installed() or is_nexa_metal_installed() + current_dir = os.path.dirname(os.path.abspath(__file__)) + sentinel_file_exists = os.path.exists(os.path.join(current_dir, "lib", "empty_file.txt")) + return sentinel_file_exists # Load the library def load_library(lib_base_name: str): diff --git a/tomls/pyproject_cuda.toml b/tomls/pyproject_cuda.toml deleted file mode 100644 index 008ea1a0..00000000 --- a/tomls/pyproject_cuda.toml +++ /dev/null @@ -1,90 +0,0 @@ -[build-system] -requires = ["scikit-build-core"] -build-backend = "scikit_build_core.build" - -[project] -name = "nexaai-cuda" -version = "0.0.1" -description = "Nexa AI SDK" -readme = "README.md" -license = { text = "MIT" } -authors = [{ name = "Nexa AI", email = "octopus@nexa4ai.com" }] -dependencies = [ - "faster_whisper", - "typing-extensions>=4.5.0", # For ggml - "numpy>=1.20.0", - "diskcache>=5.6.1", - "jinja2>=2.11.3", - "librosa>=0.8.0", - "boto3>=1.34.148", - "botocore>=1.34.148", - "fastapi", - "uvicorn", - "pydantic", - "pillow", - "prompt_toolkit", - "tqdm", # Shared dependencies - "tabulate", - "streamlit" -] -classifiers = [ - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", -] - -[project.optional-dependencies] -onnx = [ - "librosa", - "optimum[onnxruntime]>=1.7.3", # for CPU version - "diffusers", # required for 
image generation - "optuna", - "pydantic", - "PyYAML", - "requests", - "setuptools", - "soundfile", - "streamlit_audiorec", - "transformers", - "ttstokenizer" -] - -[project.urls] -Homepage = "https://github.com/NexaAI/nexaai-sdk-cpp" -Issues = "https://github.com/NexaAI/nexaai-sdk-cpp/issues" -Documentation = "https://docs-test.nexa4ai.com/" - -[project.scripts] -nexa-cli = "nexa.cli.entry:main" -nexa = "nexa.cli.entry:main" -nexaai = "nexa.cli.entry:main" -nexai = "nexa.cli.entry:main" - -[tool.scikit-build] -wheel.packages = [ - "nexa", - "nexa.cli", - "nexa.gguf", - "nexa.gguf.llama", - "nexa.gguf.sd", - "nexa.gguf.streamlit", - "nexa.gguf.server", - "nexa.onnx", - "nexa.onnx.streamlit", - "nexa.onnx.server" -] -sdist.include = ["CMakeLists.txt", "dependency/llama.cpp/*", "dependency/stable-diffusion.cpp/*"] -sdist.exclude = [".github", "build", "dist", "nexa.egg-info", "dependency/llama.cpp/build", "dependency/stable-diffusion.cpp/build"] -build.verbose = true -cmake.build-type = "Release" -cmake.version = ">=3.16" -cmake.args = [ - "-DGGML_CUDA=ON -DSD_CUBLAS=ON -DCMAKE_CUDA_ARCHITECTURES=all", - "-DGGML_CUDA_FORCE_MMQ=ON", - "-DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF", -] - -[tool.pytest.ini_options] -testpaths = ["tests"] \ No newline at end of file diff --git a/tomls/pyproject_metal.toml b/tomls/pyproject_metal.toml deleted file mode 100644 index c895998d..00000000 --- a/tomls/pyproject_metal.toml +++ /dev/null @@ -1,89 +0,0 @@ -[build-system] -requires = ["scikit-build-core"] -build-backend = "scikit_build_core.build" - -[project] -name = "nexaai-metal" -version = "0.0.1" -description = "Nexa AI SDK" -readme = "README.md" -license = { text = "MIT" } -authors = [{ name = "Nexa AI", email = "octopus@nexa4ai.com" }] -dependencies = [ - "faster_whisper", - "typing-extensions>=4.5.0", # For ggml - "numpy>=1.20.0", - "diskcache>=5.6.1", - "jinja2>=2.11.3", - "librosa>=0.8.0", - "boto3>=1.34.148", - "botocore>=1.34.148", - "fastapi", - "uvicorn", - "pydantic", - "pillow", - "prompt_toolkit", - "tqdm", # Shared dependencies - "tabulate", - "streamlit" -] -classifiers = [ - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", -] - -[project.optional-dependencies] -onnx = [ - "librosa", - "optimum[onnxruntime]>=1.7.3", # for CPU version - "diffusers", # required for image generation - "optuna", - "pydantic", - "PyYAML", - "requests", - "setuptools", - "soundfile", - "streamlit_audiorec", - "transformers", - "ttstokenizer" -] - -[project.urls] -Homepage = "https://github.com/NexaAI/nexaai-sdk-cpp" -Issues = "https://github.com/NexaAI/nexaai-sdk-cpp/issues" -Documentation = "https://docs-test.nexa4ai.com/" - -[project.scripts] -nexa-cli = "nexa.cli.entry:main" -nexa = "nexa.cli.entry:main" -nexaai = "nexa.cli.entry:main" -nexai = "nexa.cli.entry:main" - -[tool.scikit-build] -wheel.packages = [ - "nexa", - "nexa.cli", - "nexa.gguf", - "nexa.gguf.llama", - "nexa.gguf.sd", - "nexa.gguf.streamlit", - "nexa.gguf.server", - "nexa.onnx", - "nexa.onnx.streamlit", - "nexa.onnx.server" -] -sdist.include = ["CMakeLists.txt", "dependency/llama.cpp/*", "dependency/stable-diffusion.cpp/*"] -sdist.exclude = [".github", "build", "dist", "nexa.egg-info", "dependency/llama.cpp/build", "dependency/stable-diffusion.cpp/build"] -build.verbose = true -cmake.build-type = "Release" -cmake.version = ">=3.16" -cmake.args = [ - "-DGGML_METAL=ON 
-DSD_METAL=ON", - "-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64", -] - -[tool.pytest.ini_options] -testpaths = ["tests"] \ No newline at end of file From 89244553fa96098aa63fc700756b21810e3ebaa9 Mon Sep 17 00:00:00 2001 From: Yu Xing Date: Thu, 22 Aug 2024 05:03:11 +0000 Subject: [PATCH 30/31] resolve conflicts --- README.md | 19 ++++++++++++++++--- nexa/gguf/nexa_inference_image.py | 24 ++++++++++++------------ nexa/gguf/nexa_inference_text.py | 10 +++++----- nexa/gguf/nexa_inference_vlm.py | 24 ++++++++++++------------ nexa/gguf/nexa_inference_voice.py | 6 +++--- nexa/utils.py | 8 +++++++- pyproject.toml | 2 +- 7 files changed, 56 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index b1716169..939d93db 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Detailed API documentation is available [here](docs/index.html). ## Installation -**GPU version(optional)** +**GPU version(optional)** check if you have GPU acceleration (torch required)
@@ -40,16 +40,24 @@ check if you have GPU acceleration (torch required) ``` CMAKE_ARGS="-DGGML_CUDA=on -DSD_CUBLAS=ON" pip install nexaai-gpu ``` + Or you prefer to install our pre-built wheel: + ```bash + pip install nexaai-cuda --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple + ```
Apple M Chip: Apple icon -> about this mac -> Graphics - + if True: ``` CMAKE_ARGS="-DGGML_METAL=on -DSD_METAL=ON" pip install nexaai-gpu ``` + Or you prefer to install our pre-built wheel: + ```bash + pip install nexaai-metal --index-url https://nexaai.github.io/nexa-sdk/whl/metal --extra-index-url https://pypi.org/simple + ```
@@ -77,7 +85,12 @@ check if you have GPU acceleration (torch required) ``` pip install nexaai ``` -
+
+ +Or you prefer to install the pre-built wheel: +```bash +pip install nexaai --index-url https://nexaai.github.io/nexa-sdk/whl/cpu --extra-index-url https://pypi.org/simple +``` ## Nexa CLI commands diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 494374d7..8c85645a 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -43,7 +43,7 @@ class NexaImageInference: streamlit (bool): Run the inference in Streamlit UI. """ - + def __init__(self, model_path, **kwargs): self.model_path = None @@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs): logging.error("Failed to load the model or pipeline.") exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): from nexa.gguf.sd.stable_diffusion import StableDiffusion @@ -108,9 +108,9 @@ def _save_images(self, images): file_path = os.path.join(output_dir, file_name) image.save(file_path) logging.info(f"\nImage {i+1} saved to: {file_path}") - - def txt2img(self, - prompt, + + def txt2img(self, + prompt, negative_prompt="", cfg_scale=7.5, width=512, @@ -151,7 +151,7 @@ def run_txt2img(self): ) try: images = self.txt2img( - prompt, + prompt, negative_prompt, cfg_scale=self.params["guidance_scale"], width=self.params["width"], @@ -169,9 +169,9 @@ def run_txt2img(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) - def img2img(self, - image_path, - prompt, + def img2img(self, + image_path, + prompt, negative_prompt="", cfg_scale=7.5, width=512, @@ -213,8 +213,8 @@ def run_img2img(self): negative_prompt = nexa_prompt( "Enter your negative prompt (press Enter to skip): " ) - images = self.img2img(image_path, - prompt, + images = self.img2img(image_path, + prompt, negative_prompt, cfg_scale=self.params["guidance_scale"], width=self.params["width"], @@ -224,7 +224,7 @@ def run_img2img(self): control_cond=self.params.get("control_image_path", ""), control_strength=self.params.get("control_strength", 0.9), ) - + self._save_images(images) except KeyboardInterrupt: print(EXIT_REMINDER) diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index fa59e7ee..2760d5d1 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -40,7 +40,7 @@ class NexaTextInference: top_k (int): Top-k sampling parameter. top_p (float): Top-p sampling parameter """ - + def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS self.params.update(kwargs) @@ -98,7 +98,7 @@ def create_embedding( """ return self.model.create_embedding(input) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() @@ -140,7 +140,7 @@ def _load_model(self): def run(self): """ - CLI interactive session. Not for SDK. + CLI interactive session. Not for SDK. """ while True: generated_text = "" @@ -189,7 +189,7 @@ def run(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) print("\n") - + def create_chat_completion(self, messages, temperature=0.7, max_tokens=2048, top_k=50, top_p=1.0, stream=False, stop=None): """ Used for SDK. Generate completion for a chat conversation. @@ -207,7 +207,7 @@ def create_chat_completion(self, messages, temperature=0.7, max_tokens=2048, top Iterator: Iterator for the completion. 
""" return self.model.create_chat_completion(messages=messages, temperature=temperature, max_tokens=max_tokens, top_k=top_k, top_p=top_p, stream=stream, stop=stop) - + def create_completion(self, prompt, temperature=0.7, max_tokens=2048, top_k=50, top_p=1.0, echo=False, stream=False, stop=None): """ Used for SDK. Generate completion for a given prompt. diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 27c057be..e5627ffc 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -86,7 +86,7 @@ class NexaVLMInference: top_k (int): Top-k sampling parameter. top_p (float): Top-p sampling parameter """ - + def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS @@ -146,7 +146,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() @@ -240,18 +240,18 @@ def run(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) print("\n") - - def create_chat_completion(self, - messages, - max_tokens:int = 2048, + + def create_chat_completion(self, + messages, + max_tokens:int = 2048, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, - stream=False, + stream=False, stop=[]): """ Generate text completion for a given chat prompt. - + Args: messages (list): List of messages in the chat prompt. temperature (float): Temperature for sampling. @@ -260,7 +260,7 @@ def create_chat_completion(self, top_p (float): Top-p sampling parameter. stream (bool): Stream the output. stop (list): List of stop words for early stopping. - + Returns: Iterator: An iterator of the generated text completion return format: @@ -285,9 +285,9 @@ def create_chat_completion(self, "prompt_tokens": 57, "total_tokens": 74 } - } - usage: message = completion.choices[0].message.content - + } + usage: message = completion.choices[0].message.content + """ return self.model.create_chat_completion( messages=messages, diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index fc8034e3..f61f872c 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -69,7 +69,7 @@ def __init__(self, model_path, **kwargs): exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): from faster_whisper import WhisperModel @@ -91,7 +91,7 @@ def run(self): print(EXIT_REMINDER) except Exception as e: logging.error(f"Error during text generation: {e}", exc_info=True) - + def transcribe(self, audio, **kwargs): """ Transcribe the audio file. 
@@ -171,7 +171,7 @@ def transcribe(self, audio, **kwargs): audio, **kwargs, ) - + def _transcribe_audio(self, audio_path): logging.debug(f"Transcribing audio from: {audio_path}") diff --git a/nexa/utils.py b/nexa/utils.py index 33499a27..2483582f 100644 --- a/nexa/utils.py +++ b/nexa/utils.py @@ -132,7 +132,13 @@ def _spin(self): def __enter__(self): if self._use_alternate_stream: - self.stream = open("/dev/tty", "w") + if sys.platform == "win32": # Windows + self.stream = open('CONOUT$', "w") + else: + try: + self.stream = open('/dev/tty', "w") + except (FileNotFoundError, OSError): + self.stream = open('/dev/stdout', "w") self.thread = threading.Thread(target=self._spin) self.thread.start() return self diff --git a/pyproject.toml b/pyproject.toml index 681d4f04..65a3c414 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "nexaai" -version = "0.0.1" +version = "0.0.2.dev" description = "Nexa AI SDK" readme = "README.md" license = { text = "MIT" } From 1944e97122b27a4bfce02793c8ef2a26aaf720d2 Mon Sep 17 00:00:00 2001 From: Yu Xing Date: Thu, 22 Aug 2024 05:04:56 +0000 Subject: [PATCH 31/31] revert --- CMakeLists.txt | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ffee1ccc..8b4264fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,58 @@ if (GGML_CUDA OR GGML_METAL) add_custom_target(create_empty_file ALL DEPENDS ${EMPTY_FILE_PATH}) endif() +# Project: stable_diffusion_cpp +project(stable_diffusion_cpp) + +option(STABLE_DIFFUSION_BUILD "Build stable-diffusion.cpp shared library and install alongside python package" ON) + +if (STABLE_DIFFUSION_BUILD) + set(BUILD_SHARED_LIBS "ON") + option(SD_BUILD_SHARED_LIBS "" "ON") + + # Building llama + if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") + # Need to disable these llama.cpp flags on Apple x86_64, + # otherwise users may encounter invalid instruction errors + set(GGML_AVX "Off" CACHE BOOL "ggml: enable AVX" FORCE) + set(GGML_AVX2 "Off" CACHE BOOL "ggml: enable AVX2" FORCE) + set(GGML_FMA "Off" CACHE BOOL "ggml: enable FMA" FORCE) + set(GGML_F16C "Off" CACHE BOOL "ggml: enable F16C" FORCE) + endif() + + add_subdirectory(dependency/stable-diffusion.cpp) + install( + TARGETS stable-diffusion + LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + ) + + message(STATUS "SKBUILD_PLATLIB_DIR: ${SKBUILD_PLATLIB_DIR}") + # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374 + install( + TARGETS stable-diffusion + LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + ) + # Workaround for Windows + CUDA + if (WIN32) + install( + FILES $ + DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + ) + install( + FILES $ + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + ) + endif() +endif() + # Project: llama_cpp project(llama_cpp) @@ -132,3 +184,4 @@ if (LLAMA_BUILD) endif() endif() endif() +
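Note on the GPU-detection change above: patches 29–31 replace the old "separate cuda/metal wheels" detection with a build-time sentinel file. CMake touches `nexa/gguf/lib/empty_file.txt` only when `GGML_CUDA` or `GGML_METAL` is enabled, and `is_gpu_available()` simply checks for that file at runtime. The following is a condensed sketch of that mechanism as it appears in the diffs (paths and the `n_gpu_layers` usage are taken from the patches; the standalone module name is illustrative only):

```python
# Sketch of the sentinel-file GPU check introduced in patches 29-31.
# Assumes the package layout from the diffs: CMake creates
# nexa/gguf/lib/empty_file.txt only for CUDA/Metal builds.
import os

def is_gpu_available() -> bool:
    # Resolve the lib directory next to this module and test for the sentinel.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    sentinel = os.path.join(current_dir, "lib", "empty_file.txt")
    return os.path.exists(sentinel)

# Callers then choose GPU offload accordingly, as the tests do:
# n_gpu_layers = -1 if is_gpu_available() else 0
```

This keeps a single `nexaai` package for CPU and GPU builds: the wheel built with `-DGGML_CUDA=ON` or `-DGGML_METAL=ON` ships the sentinel alongside the shared libraries, so no separate `nexaai-cuda`/`nexaai-metal` detection logic is needed at import time.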