From 03be5cc0c95cfc18f7bf9c873658861d597fe03b Mon Sep 17 00:00:00 2001
From: "Hung-Han (Henry) Chen"
Date: Wed, 13 Sep 2023 18:04:13 +0300
Subject: [PATCH 1/5] Add copilot example

Signed-off-by: Hung-Han (Henry) Chen
---
 examples/copilot.py | 208 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 208 insertions(+)
 create mode 100644 examples/copilot.py

diff --git a/examples/copilot.py b/examples/copilot.py
new file mode 100644
index 00000000..2c244b6e
--- /dev/null
+++ b/examples/copilot.py
@@ -0,0 +1,208 @@
+# This is an example of an HTTP streaming server that supports the GitHub Copilot VSCode extension
+# The server depends on fastapi/uvicorn/huggingface-hub (for auto-downloading the model files). To run the server:
+# 1. `pip install uvicorn fastapi huggingface-hub`
+# 2. `uvicorn copilot:app --reload --host 0.0.0.0 --port 9999`
+# 3. Configure VSCode copilot extension:
+# ```json
+# "github.copilot.advanced": {
+# "debug.overrideEngine": "engine", # can be any string.
+# "debug.testOverrideProxyUrl": "http://localhost:9999",
+# "debug.overrideProxyUrl": "http://localhost:9999"
+# }
+# ```
+
+import sys, os
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from time import time
+import logging
+import json
+
+from exllamav2 import (
+    ExLlamaV2,
+    ExLlamaV2Config,
+    ExLlamaV2Cache,
+    ExLlamaV2Tokenizer,
+)
+
+from exllamav2.generator import ExLlamaV2StreamingGenerator, ExLlamaV2Sampler
+
+from fastapi import FastAPI, Request
+from fastapi.responses import StreamingResponse
+from huggingface_hub import snapshot_download
+from typing import (
+    Any,
+    List,
+    Optional,
+)
+from pydantic import BaseModel, Field
+
+log = logging.getLogger("uvicorn")
+log.setLevel("DEBUG")
+app = FastAPI()
+
+# Find one here https://huggingface.co/turboderp
+MODEL_HG_REPO_ID = "TheBloke/CodeLlama-34B-Python-GPTQ"
+
+
+@app.on_event("startup")
+async def startup_event():
+    """
+    Starts up the server, setting the log level and downloading the default model if necessary.
+
+    Adapted from https://github.com/chenhunghan/ialacol/blob/main/main.py
+    """
+    log.info("Starting up...")
+    log.info(
+        "Downloading repo %s to %s/models",
+        MODEL_HG_REPO_ID,
+        os.getcwd(),
+    )
+    snapshot_download(
+        repo_id=MODEL_HG_REPO_ID,
+        cache_dir="models/.cache",
+        local_dir="models",
+        resume_download=True,
+    )
+    log.debug("Creating generator instance...")
+    model_directory = f"{os.getcwd()}/models"
+    config = ExLlamaV2Config()
+    config.model_dir = model_directory
+    config.prepare()
+    tokenizer = ExLlamaV2Tokenizer(config)
+    model = ExLlamaV2(config)
+    cache = ExLlamaV2Cache(model)
+
+    generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
+    # Ensure CUDA is initialized
+    generator.warmup()
+    app.state.generator = generator
+    app.state.tokenizer = tokenizer
+
+
+class CompletionRequestBody(BaseModel):
+    """
+    Adapted from https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py
+    """
+
+    prompt: str = Field(
+        default="", description="The prompt to generate completions for."
+    )
+    max_tokens: Optional[int]
+    temperature: Optional[float]
+    top_p: Optional[float]
+    stop: Optional[List[str] | str]
+    stream: bool = Field()
+    model: str = Field()
+    # llama.cpp specific parameters
+    top_k: Optional[int]
+    repetition_penalty: Optional[float]
+    last_n_tokens: Optional[int]
+    seed: Optional[int]
+    batch_size: Optional[int]
+    threads: Optional[int]
+
+    # ignored or currently unsupported
+    suffix: Any
+    presence_penalty: Any
+    frequency_penalty: Any
+    echo: Any
+    n: Any
+    logprobs: Any
+    best_of: Any
+    logit_bias: Any
+    user: Any
+
+    class Config:
+        arbitrary_types_allowed = True
+
+
+@app.post("/v1/engines/{engine}/completions")
+async def engine_completions(
+    # Can't use a typed body param because FastAPI requires a correct content-type header,
+    # but the copilot client may not send one
+    request: Request,
+    # the only request param sent by the copilot client
+    engine: str,
+):
+    """
+    Adapted from https://github.com/chenhunghan/ialacol/blob/main/main.py
+
+    Similar to https://platform.openai.com/docs/api-reference/completions
+    but with an engine path param under /v1/engines
+    Args:
+        body (CompletionRequestBody): parsed request body
+    Returns:
+        StreamingResponse: streaming response
+    """
+    req_json = await request.json()
+    log.debug("Body:%s", str(req_json))
+
+    body = CompletionRequestBody(**req_json, model=engine)
+    prompt = body.prompt
+    settings = ExLlamaV2Sampler.Settings()
+    settings.temperature = body.temperature if body.temperature else 0.85
+    settings.top_k = body.top_k if body.top_k else 50
+    settings.top_p = body.top_p if body.top_p else 0.8
+    settings.token_repetition_penalty = (
+        body.repetition_penalty if body.repetition_penalty else 1.15
+    )
+    tokenizer = app.state.tokenizer
+    settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id])
+    max_new_tokens = body.max_tokens if body.max_tokens else 1024
+
+    generator = request.app.state.generator
+    generator.set_stop_conditions([tokenizer.eos_token_id])
+
+    input_ids = tokenizer.encode(prompt)
+
+    log.debug("Streaming response from %s", engine)
+
+    def stream():
+        generator.begin_stream(input_ids, settings)
+        generated_tokens = 0
+        while True:
+            chunk, eos, _ = generator.stream()
+            log.debug("Streaming chunk %s", chunk)
+            created = int(time())
+            generated_tokens += 1
+            if eos or generated_tokens == max_new_tokens:
+                stop_data = json.dumps(
+                    {
+                        "id": "id",
+                        "object": "text_completion.chunk",
+                        "created": created,
+                        "model": engine,
+                        "choices": [
+                            {
+                                "text": "",
+                                "index": 0,
+                                "finish_reason": "stop",
+                            }
+                        ],
+                    }
+                )
+                yield f"data: {stop_data}" + "\n\n"
+                break
+            data = json.dumps(
+                {
+                    "id": "id",
+                    "object": "text_completion.chunk",
+                    "created": created,
+                    "model": engine,
+                    "choices": [
+                        {
+                            "text": chunk,
+                            "index": 0,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+            )
+            yield f"data: {data}" + "\n\n"
+
+    return StreamingResponse(
+        stream(),
+        media_type="text/event-stream",
+    )

From 5c7fbe7b752d22ec9a57e435c9d75945bf72f88b Mon Sep 17 00:00:00 2001
From: "Hung-Han (Henry) Chen"
Date: Sat, 16 Sep 2023 11:45:48 +0300
Subject: [PATCH 2/5] Add missing model.load()

Signed-off-by: Hung-Han (Henry) Chen
---
 examples/copilot.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/copilot.py b/examples/copilot.py
index 2c244b6e..4b6388ec 100644
--- a/examples/copilot.py
+++ b/examples/copilot.py
@@ -2,7 +2,7 @@ # This is an example of an HTTP streaming server that supports the GitHub Copilot VSCode extension
 # The server depends on fastapi/uvicorn/huggingface-hub (for auto-downloading the model files). To run the server:
 # 1. `pip install uvicorn fastapi huggingface-hub`
 # 2. `uvicorn copilot:app --reload --host 0.0.0.0 --port 9999`
-# 3. Configure VSCode copilot extension:
+# 3. Configure VSCode copilot extension (in VSCode's settings.json):
 # ```json
 # "github.copilot.advanced": {
 # "debug.overrideEngine": "engine", # can be any string.
@@ -72,6 +72,7 @@ async def startup_event():
     config.prepare()
     tokenizer = ExLlamaV2Tokenizer(config)
     model = ExLlamaV2(config)
+    model.load([16, 24])
     cache = ExLlamaV2Cache(model)
 
     generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)

From 590076e59a1f6b2e6625added119f5be5107e80d Mon Sep 17 00:00:00 2001
From: "Hung-Han (Henry) Chen"
Date: Sat, 16 Sep 2023 12:02:39 +0300
Subject: [PATCH 3/5] Fix cwd path

Signed-off-by: Hung-Han (Henry) Chen
---
 examples/copilot.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/copilot.py b/examples/copilot.py
index 4b6388ec..40836da6 100644
--- a/examples/copilot.py
+++ b/examples/copilot.py
@@ -61,8 +61,8 @@ async def startup_event():
     )
     snapshot_download(
         repo_id=MODEL_HG_REPO_ID,
-        cache_dir="models/.cache",
-        local_dir="models",
+        cache_dir=f"{os.getcwd()}/models/.cache",
+        local_dir=f"{os.getcwd()}/models",
         resume_download=True,
     )
     log.debug("Creating generator instance...")

From 53420723b1aebd5c8df08296fc500100640c6e34 Mon Sep 17 00:00:00 2001
From: "Hung-Han (Henry) Chen"
Date: Sun, 17 Sep 2023 11:13:01 +0300
Subject: [PATCH 4/5] Finally a working version

Signed-off-by: Hung-Han (Henry) Chen
---
 examples/copilot.py | 56 +++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 33 deletions(-)

diff --git a/examples/copilot.py b/examples/copilot.py
index 40836da6..89c8925f 100644
--- a/examples/copilot.py
+++ b/examples/copilot.py
@@ -31,19 +31,15 @@
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse
 from huggingface_hub import snapshot_download
-from typing import (
-    Any,
-    List,
-    Optional,
-)
-from pydantic import BaseModel, Field
+from typing import List, Optional
+from pydantic import BaseModel
 
 log = logging.getLogger("uvicorn")
 log.setLevel("DEBUG")
 app = FastAPI()
 
 # Find one here https://huggingface.co/turboderp
-MODEL_HG_REPO_ID = "TheBloke/CodeLlama-34B-Python-GPTQ"
+MODEL_HG_REPO_ID = "turboderp/CodeLlama-13B-instruct-2.65bpw-h6-exl2"
 
 
 @app.on_event("startup")
@@ -71,15 +67,21 @@ async def startup_event():
     config.model_dir = model_directory
     config.prepare()
     tokenizer = ExLlamaV2Tokenizer(config)
+    log.debug("Creating tokenizer instance...")
     model = ExLlamaV2(config)
+    log.debug("Loading model...")
     model.load([16, 24])
+    log.debug("Creating cache instance...")
     cache = ExLlamaV2Cache(model)
 
+    log.debug("Creating generator instance...")
     generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
     # Ensure CUDA is initialized
+    log.debug("Warming up generator instance...")
     generator.warmup()
     app.state.generator = generator
     app.state.tokenizer = tokenizer
+    log.debug("Generator instance created.")
 
 
 class CompletionRequestBody(BaseModel):
@@ -87,33 +89,17 @@ class CompletionRequestBody(BaseModel):
     Adapted from https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py
     """
 
-    prompt: str = Field(
-        default="", description="The prompt to generate completions for."
-    )
-    max_tokens: Optional[int]
-    temperature: Optional[float]
-    top_p: Optional[float]
-    stop: Optional[List[str] | str]
-    stream: bool = Field()
-    model: str = Field()
+    prompt: str = ""
+    max_tokens: Optional[int] = 99999
+    temperature: Optional[float] = 0.85
+    top_p: Optional[float] = 0.8
+    stop: Optional[List[str] | str] = ["\ndef ", "\nclass ", "\nif ", "\n\n#"]
+    stream: bool = True
+    model: str = ""
     # llama.cpp specific parameters
-    top_k: Optional[int]
-    repetition_penalty: Optional[float]
-    last_n_tokens: Optional[int]
-    seed: Optional[int]
-    batch_size: Optional[int]
-    threads: Optional[int]
-
-    # ignored or currently unsupported
-    suffix: Any
-    presence_penalty: Any
-    frequency_penalty: Any
-    echo: Any
-    n: Any
-    logprobs: Any
-    best_of: Any
-    logit_bias: Any
-    user: Any
+    top_k: Optional[int] = 50
+
+    repetition_penalty: Optional[float] = 1.15
 
     class Config:
         arbitrary_types_allowed = True
@@ -144,11 +130,15 @@ async def engine_completions(
     prompt = body.prompt
     settings = ExLlamaV2Sampler.Settings()
     settings.temperature = body.temperature if body.temperature else 0.85
+    log.debug("temperature:%s", settings.temperature)
     settings.top_k = body.top_k if body.top_k else 50
+    log.debug("top_k:%s", settings.top_k)
     settings.top_p = body.top_p if body.top_p else 0.8
+    log.debug("top_p:%s", settings.top_p)
     settings.token_repetition_penalty = (
         body.repetition_penalty if body.repetition_penalty else 1.15
     )
+    log.debug("token_repetition_penalty:%s", settings.token_repetition_penalty)
     tokenizer = app.state.tokenizer
     settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id])
     max_new_tokens = body.max_tokens if body.max_tokens else 1024

From dee6afe97b989012cc450f537c6f142e372af556 Mon Sep 17 00:00:00 2001
From: "Hung-Han (Henry) Chen"
Date: Sun, 17 Sep 2023 11:30:51 +0300
Subject: [PATCH 5/5] 34B

Signed-off-by: Hung-Han (Henry) Chen
---
 examples/copilot.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/copilot.py b/examples/copilot.py
index 89c8925f..519f96ad 100644
--- a/examples/copilot.py
+++ b/examples/copilot.py
@@ -39,7 +39,7 @@
 app = FastAPI()
 
 # Find one here https://huggingface.co/turboderp
-MODEL_HG_REPO_ID = "turboderp/CodeLlama-13B-instruct-2.65bpw-h6-exl2"
+MODEL_HG_REPO_ID = "turboderp/CodeLlama-34B-instruct-3.0bpw-h6-exl2"
 
 
 @app.on_event("startup")
@@ -96,7 +96,6 @@ class CompletionRequestBody(BaseModel):
     stop: Optional[List[str] | str] = ["\ndef ", "\nclass ", "\nif ", "\n\n#"]
     stream: bool = True
     model: str = ""
-    # llama.cpp specific parameters
     top_k: Optional[int] = 50
 
     repetition_penalty: Optional[float] = 1.15
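
As a quick smoke test of the streaming endpoint outside of VSCode, a minimal client along the following lines can be used. This is a sketch only, not part of the patch series: it assumes the server above is running at http://localhost:9999, uses "engine" as the engine path segment (any string works, mirroring `debug.overrideEngine`), and adds `requests` as an extra dependency (`pip install requests`); the file name `copilot_client.py` is hypothetical.

```python
# copilot_client.py (hypothetical name) -- minimal smoke test for the copilot.py example server.
# Assumes the server is running at http://localhost:9999 and that `requests` is installed.
import json

import requests

resp = requests.post(
    "http://localhost:9999/v1/engines/engine/completions",
    json={"prompt": "def fibonacci(n):", "max_tokens": 64, "stream": True},
    stream=True,
    timeout=600,
)
resp.raise_for_status()

for line in resp.iter_lines(decode_unicode=True):
    # The server emits Server-Sent Events: "data: {...}" lines separated by blank lines.
    if not line or not line.startswith("data: "):
        continue
    event = json.loads(line[len("data: "):])
    choice = event["choices"][0]
    print(choice["text"], end="", flush=True)
    if choice["finish_reason"] == "stop":
        break
print()
```

Each `data:` line carries the same `text_completion.chunk` JSON shape that `stream()` in the example emits, so the client simply concatenates `choices[0]["text"]` until it sees `finish_reason == "stop"`.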