From 0f05d20ac9a4c4397fa66d09e7df01861d5d40ba Mon Sep 17 00:00:00 2001 From: Davidqian123 Date: Thu, 19 Dec 2024 23:35:22 +0000 Subject: [PATCH] update load_model function to support loading both model and whisper_model --- nexa/constants.py | 2 + nexa/gguf/server/nexa_service.py | 223 +++++++++++++++++-------------- 2 files changed, 125 insertions(+), 100 deletions(-) diff --git a/nexa/constants.py b/nexa/constants.py index 5f6beb69..fe3c051e 100644 --- a/nexa/constants.py +++ b/nexa/constants.py @@ -66,6 +66,7 @@ class ModelType(Enum): "phi2": "Phi-2:q4_0", "phi3": "Phi-3-mini-128k-instruct:q4_0", "phi3.5": "Phi-3.5-mini-instruct:q4_0", + "phi4": "Phi:q4_0", "llama2-uncensored": "Llama2-7b-chat-uncensored:q4_0", "llama3-uncensored": "Llama3-8B-Lexi-Uncensored:q4_K_M", "openelm": "OpenELM-3B:q4_K_M", @@ -413,6 +414,7 @@ class ModelType(Enum): "Phi-3-mini-128k-instruct": ModelType.NLP, "Phi-3-mini-4k-instruct": ModelType.NLP, "Phi-3.5-mini-instruct": ModelType.NLP, + "Phi-4": ModelType.NLP, "CodeQwen1.5-7B-Instruct": ModelType.NLP, "Qwen2-0.5B-Instruct": ModelType.NLP, "Qwen2-1.5B-Instruct": ModelType.NLP, diff --git a/nexa/gguf/server/nexa_service.py b/nexa/gguf/server/nexa_service.py index 45b26d5a..6b354daa 100644 --- a/nexa/gguf/server/nexa_service.py +++ b/nexa/gguf/server/nexa_service.py @@ -26,6 +26,8 @@ NEXA_RUN_PROJECTOR_MAP, NEXA_RUN_OMNI_VLM_MAP, NEXA_RUN_OMNI_VLM_PROJECTOR_MAP, + NEXA_RUN_MODEL_MAP_AUDIO_LM, + NEXA_RUN_AUDIO_LM_PROJECTOR_MAP, NEXA_RUN_COMPLETION_TEMPLATE_MAP, NEXA_RUN_MODEL_PRECISION_MAP, NEXA_RUN_MODEL_MAP_FUNCTION_CALLING, @@ -80,12 +82,14 @@ ) model = None +whisper_model = None chat_format = None completion_template = None hostname = socket.gethostname() chat_completion_system_prompt = [{"role": "system", "content": "You are a helpful assistant"}] function_call_system_prompt = [{"role": "system", "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary"}] model_path = None +whisper_model_path = "faster-whisper-tiny" # by default, use tiny whisper model n_ctx = None is_local_path = False model_type = None @@ -215,6 +219,8 @@ class LoadModelRequest(BaseModel): model_config = { "protected_namespaces": () } +class LoadWhisperModelRequest(BaseModel): + whisper_model_path: str = "faster-whisper-tiny" class DownloadModelRequest(BaseModel): model_path: str = "llama3.2" @@ -295,16 +301,16 @@ async def load_model(): raise ValueError("Multimodal and Audio models are not supported for Hugging Face") downloaded_path, _ = pull_model(model_path, hf=is_huggingface, ms=is_modelscope) else: - if model_path in NEXA_RUN_MODEL_MAP_VLM or model_path in NEXA_RUN_OMNI_VLM_MAP: + if model_path in NEXA_RUN_MODEL_MAP_VLM or model_path in NEXA_RUN_OMNI_VLM_MAP or model_path in NEXA_RUN_MODEL_MAP_AUDIO_LM: if model_path in NEXA_RUN_OMNI_VLM_MAP: - logging.info(f"Path is OmniVLM model: {model_path}") - downloaded_path, _ = pull_model(NEXA_RUN_OMNI_VLM_MAP[model_path]) + downloaded_path, model_type = pull_model(NEXA_RUN_OMNI_VLM_MAP[model_path]) projector_downloaded_path, _ = pull_model(NEXA_RUN_OMNI_VLM_PROJECTOR_MAP[model_path]) - else: - logging.info(f"Path is in NEXA_RUN_MODEL_MAP_VLM: {model_path}") - downloaded_path, _ = pull_model(NEXA_RUN_MODEL_MAP_VLM[model_path]) + elif model_path in NEXA_RUN_MODEL_MAP_VLM: + downloaded_path, model_type = pull_model(NEXA_RUN_MODEL_MAP_VLM[model_path]) projector_downloaded_path, _ = pull_model(NEXA_RUN_PROJECTOR_MAP[model_path]) - model_type = "Multimodal" + elif model_path in NEXA_RUN_MODEL_MAP_AUDIO_LM: + downloaded_path, model_type = pull_model(NEXA_RUN_MODEL_MAP_AUDIO_LM[model_path]) + projector_downloaded_path, _ = pull_model(NEXA_RUN_AUDIO_LM_PROJECTOR_MAP[model_path]) else: downloaded_path, model_type = pull_model(model_path) @@ -436,14 +442,6 @@ async def load_model(): n_gpu_layers=0, # hardcode to use CPU ) logging.info(f"Model loaded as {model}") - elif model_type == "Audio": - with suppress_stdout_stderr(): - model = WhisperModel( - downloaded_path, - device="cpu", # only support cpu for now because cuDNN needs to be installed on user's machine - compute_type="default" - ) - logging.info(f"model loaded as {model}") elif model_type == "AudioLM": with suppress_stdout_stderr(): try: @@ -463,7 +461,24 @@ async def load_model(): logging.info(f"model loaded as {model}") else: raise ValueError(f"Model {model_path} not found in Model Hub. 
If you are using local path, be sure to add --local_path and --model_type flags.") - + +async def load_whisper_model(custom_whisper_model_path=None): + global whisper_model, whisper_model_path + try: + if custom_whisper_model_path: + whisper_model_path = custom_whisper_model_path + downloaded_path, _ = pull_model(whisper_model_path) + with suppress_stdout_stderr(): + whisper_model = WhisperModel( + downloaded_path, + device="cpu", # only support cpu for now because cuDNN needs to be installed on user's machine + compute_type="default" + ) + logging.info(f"whisper model loaded as {whisper_model}") + except Exception as e: + logging.error(f"Error loading Whisper model: {e}") + raise ValueError(f"Failed to load Whisper model: {str(e)}") + def nexa_run_text_generation( prompt, temperature, stop_words, max_new_tokens, top_k, top_p, logprobs=None, stream=False, is_chat_completion=True ) -> Dict[str, Any]: @@ -710,16 +725,23 @@ def _resp_async_generator(streamer): async def download_model(request: DownloadModelRequest): """Download a model from the model hub""" try: - if request.model_path in NEXA_RUN_MODEL_MAP_VLM: # for Multimodal models - downloaded_path, _ = pull_model(NEXA_RUN_MODEL_MAP_VLM[request.model_path]) - projector_downloaded_path, _ = pull_model(NEXA_RUN_PROJECTOR_MAP[request.model_path]) + if request.model_path in NEXA_RUN_MODEL_MAP_VLM or request.model_path in NEXA_RUN_OMNI_VLM_MAP or request.model_path in NEXA_RUN_MODEL_MAP_AUDIO_LM: # models and projectors + if request.model_path in NEXA_RUN_MODEL_MAP_VLM: + downloaded_path, model_type = pull_model(NEXA_RUN_MODEL_MAP_VLM[request.model_path]) + projector_downloaded_path, _ = pull_model(NEXA_RUN_PROJECTOR_MAP[request.model_path]) + elif request.model_path in NEXA_RUN_OMNI_VLM_MAP: + downloaded_path, model_type = pull_model(NEXA_RUN_OMNI_VLM_MAP[request.model_path]) + projector_downloaded_path, _ = pull_model(NEXA_RUN_OMNI_VLM_PROJECTOR_MAP[request.model_path]) + elif request.model_path in NEXA_RUN_MODEL_MAP_AUDIO_LM: + downloaded_path, model_type = pull_model(NEXA_RUN_MODEL_MAP_AUDIO_LM[request.model_path]) + projector_downloaded_path, _ = pull_model(NEXA_RUN_AUDIO_LM_PROJECTOR_MAP[request.model_path]) return { "status": "success", - "message": "Successfully downloaded multimodal model and projector", + "message": "Successfully downloaded model and projector", "model_path": request.model_path, "model_local_path": downloaded_path, "projector_local_path": projector_downloaded_path, - "model_type": "Multimodal" + "model_type": model_type } else: downloaded_path, model_type = pull_model(request.model_path) @@ -768,6 +790,26 @@ async def load_different_model(request: LoadModelRequest): detail=f"Failed to load model: {str(e)}" ) +@app.post("/v1/load_whisper_model", tags=["Model"]) +async def load_different_whisper_model(request: LoadWhisperModelRequest): + """Load a different Whisper model while maintaining the global model state""" + try: + global whisper_model_path + whisper_model_path = request.whisper_model_path + await load_whisper_model(custom_whisper_model_path=whisper_model_path) + + return { + "status": "success", + "message": f"Successfully loaded Whisper model: {whisper_model_path}", + "model_type": "Audio", + } + except Exception as e: + logging.error(f"Error loading Whisper model: {e}") + raise HTTPException( + status_code=500, + detail=f"Failed to load Whisper model: {str(e)}" + ) + @app.get("/v1/list_models", tags=["Model"]) async def list_models(): """List all models available in the model hub""" @@ -1123,10 +1165,10 @@ 
async def process_audio( temperature: Optional[float] = Query(0.0, description="Temperature for sampling.") ): try: - if model_type != "Audio": + if not whisper_model: raise HTTPException( status_code=400, - detail="The model that is loaded is not an Audio model. Please use an Audio model." + detail="Whisper model is not loaded. Please load a Whisper model first." ) with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_audio: @@ -1146,7 +1188,7 @@ async def process_audio( if task == "transcribe" and language: task_params["language"] = language - segments, _ = model.transcribe(temp_audio_path, **task_params) + segments, _ = whisper_model.transcribe(temp_audio_path, **task_params) result_text = "".join(segment.text for segment in segments) return JSONResponse(content={"text": result_text}) @@ -1166,94 +1208,75 @@ async def processing_stream_audio( language: Optional[str] = Query("auto", description="Language code (e.g., 'en', 'fr')"), min_chunk: Optional[float] = Query(1.0, description="Minimum chunk duration for streaming"), ): - # Read the entire file into memory - audio_bytes = await file.read() - a_full = load_audio_from_bytes(audio_bytes) - duration = len(a_full) / SAMPLING_RATE - - # Only include language parameter if task is "transcribe" - # For "translate", the language is always defined as "en" - if task == "transcribe" and language != "auto": - used_language = language - else: - used_language = None - - warmup_audio = a_full[:SAMPLING_RATE] # first second - model.transcribe(warmup_audio) - - streamer = StreamASRProcessor(model, task, used_language) - - start = time.time() - beg = 0.0 - - def stream_generator(): - nonlocal beg - while beg < duration: - now = time.time() - start - if now < beg + min_chunk: - time.sleep((beg + min_chunk) - now) - end = time.time() - start - if end > duration: - end = duration + try: + if not whisper_model: + raise HTTPException( + status_code=400, + detail="Whisper model is not loaded. Please load a Whisper model first." 
+ ) - chunk_samples = int((end - beg)*SAMPLING_RATE) - chunk_audio = a_full[int(beg*SAMPLING_RATE):int(beg*SAMPLING_RATE)+chunk_samples] - beg = end + # Read the entire file into memory + audio_bytes = await file.read() + a_full = load_audio_from_bytes(audio_bytes) + duration = len(a_full) / SAMPLING_RATE - streamer.insert_audio_chunk(chunk_audio) - o = streamer.process_iter() + # Only include language parameter if task is "transcribe" + # For "translate", the language is always defined as "en" + if task == "transcribe" and language != "auto": + used_language = language + else: + used_language = None + + warmup_audio = a_full[:SAMPLING_RATE] # first second + whisper_model.transcribe(warmup_audio) + + streamer = StreamASRProcessor(whisper_model, task, used_language) + + start = time.time() + beg = 0.0 + + def stream_generator(): + nonlocal beg + while beg < duration: + now = time.time() - start + if now < beg + min_chunk: + time.sleep((beg + min_chunk) - now) + end = time.time() - start + if end > duration: + end = duration + + chunk_samples = int((end - beg)*SAMPLING_RATE) + chunk_audio = a_full[int(beg*SAMPLING_RATE):int(beg*SAMPLING_RATE)+chunk_samples] + beg = end + + streamer.insert_audio_chunk(chunk_audio) + o = streamer.process_iter() + if o[0] is not None: + data = { + "emission_time_ms": (time.time()-start)*1000, + "segment_start_ms": o[0]*1000, + "segment_end_ms": o[1]*1000, + "text": o[2] + } + yield f"data: {json.dumps(data)}\n\n".encode("utf-8") + + # Final flush + o = streamer.finish() if o[0] is not None: data = { "emission_time_ms": (time.time()-start)*1000, "segment_start_ms": o[0]*1000, "segment_end_ms": o[1]*1000, - "text": o[2] + "text": o[2], + "final": True } yield f"data: {json.dumps(data)}\n\n".encode("utf-8") - # Final flush - o = streamer.finish() - if o[0] is not None: - data = { - "emission_time_ms": (time.time()-start)*1000, - "segment_start_ms": o[0]*1000, - "segment_end_ms": o[1]*1000, - "text": o[2], - "final": True - } - yield f"data: {json.dumps(data)}\n\n".encode("utf-8") - - return StreamingResponse(stream_generator(), media_type="application/x-ndjson") - -@app.post("/v1/audio/translations", tags=["Audio"]) -async def translate_audio( - file: UploadFile = File(...), - beam_size: Optional[int] = Query(5, description="Beam size for translation"), - temperature: Optional[float] = Query(0.0, description="Temperature for sampling"), -): - try: - if model_type != "Audio": - raise HTTPException( - status_code=400, - detail="The model that is loaded is not an Audio model. Please use an Audio model for audio translation." - ) - with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_audio: - temp_audio.write(await file.read()) - temp_audio_path = temp_audio.name + return StreamingResponse(stream_generator(), media_type="application/x-ndjson") - translate_params = { - "beam_size": beam_size, - "task": "translate", - "temperature": temperature, - "vad_filter": True - } - segments, _ = model.transcribe(temp_audio_path, **translate_params) - translation = "".join(segment.text for segment in segments) - return JSONResponse(content={"text": translation}) except Exception as e: - raise HTTPException(status_code=500, detail=f"Error during translation: {str(e)}") - finally: - os.unlink(temp_audio_path) + logging.error(f"Error in audio processing stream: {e}") + raise HTTPException(status_code=500, detail=str(e)) @app.post("/v1/audiolm/chat/completions", tags=["AudioLM"]) async def audio_chat_completions(
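
Example client usage for the new whisper-model flow. This is a minimal sketch, not part of the patch: the base URL and the /v1/audio/transcriptions route are assumptions for illustration (the route decorator for process_audio sits outside this hunk), while the /v1/load_whisper_model path, the whisper_model_path field, and its "faster-whisper-tiny" default come directly from LoadWhisperModelRequest and load_different_whisper_model above.

    # Hypothetical client sketch. Assumes the server runs locally on port 8000 and
    # that process_audio is mounted at /v1/audio/transcriptions; adjust as needed.
    import requests

    BASE_URL = "http://localhost:8000"  # assumption: default local server address

    # Load a Whisper model independently of the main model (new endpoint in this patch).
    resp = requests.post(
        f"{BASE_URL}/v1/load_whisper_model",
        json={"whisper_model_path": "faster-whisper-tiny"},  # default from LoadWhisperModelRequest
    )
    resp.raise_for_status()
    print(resp.json()["message"])  # "Successfully loaded Whisper model: ..."

    # Transcribe an audio file; with this patch the endpoint checks the global
    # whisper_model instead of requiring model_type == "Audio".
    with open("sample.wav", "rb") as f:
        resp = requests.post(
            f"{BASE_URL}/v1/audio/transcriptions",  # assumed route for process_audio
            files={"file": ("sample.wav", f, "audio/wav")},
            params={"task": "transcribe", "language": "en", "temperature": 0.0},
        )
    resp.raise_for_status()
    print(resp.json()["text"])

With the Whisper model held in its own global (whisper_model, loaded lazily with the "faster-whisper-tiny" default), transcription can run alongside an NLP or multimodal model instead of requiring the single loaded model to be of type "Audio".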
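
For the streaming path, processing_stream_audio returns an application/x-ndjson StreamingResponse whose chunks are "data: {...}" lines carrying emission_time_ms, segment_start_ms, segment_end_ms, and text (plus "final": true on the closing flush). A minimal consumer sketch follows; the route name /v1/audio/streaming_transcription is an assumption, since the decorator for this handler is outside the hunk.

    # Hypothetical streaming consumer; route and base URL are assumptions.
    import json
    import requests

    BASE_URL = "http://localhost:8000"  # assumption: default local server address

    with open("sample.wav", "rb") as f:
        resp = requests.post(
            f"{BASE_URL}/v1/audio/streaming_transcription",  # assumed route for processing_stream_audio
            files={"file": ("sample.wav", f, "audio/wav")},
            params={"task": "transcribe", "language": "en", "min_chunk": 1.0},
            stream=True,  # read segments as the server emits them
        )
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line or not line.startswith(b"data: "):
                continue
            segment = json.loads(line[len(b"data: "):])
            print(f"[{segment['segment_start_ms']:.0f}-{segment['segment_end_ms']:.0f} ms] {segment['text']}")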