From 03be5cc0c95cfc18f7bf9c873658861d597fe03b Mon Sep 17 00:00:00 2001
From: "Hung-Han (Henry) Chen"
Date: Wed, 13 Sep 2023 18:04:13 +0300
Subject: [PATCH 1/5] Add copilot example

Signed-off-by: Hung-Han (Henry) Chen
---
 examples/copilot.py | 208 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 208 insertions(+)
 create mode 100644 examples/copilot.py

diff --git a/examples/copilot.py b/examples/copilot.py
new file mode 100644
index 00000000..2c244b6e
--- /dev/null
+++ b/examples/copilot.py
@@ -0,0 +1,208 @@
+# This is an example of an HTTP streaming server that supports the GitHub Copilot VSCode extension
+# The server depends on fastapi/uvicorn/huggingface-hub (for auto-downloading the model files). To run the server:
+# 1. `pip install uvicorn fastapi huggingface-hub`
+# 2. `uvicorn copilot:app --reload --host 0.0.0.0 --port 9999`
+# 3. Configure VSCode copilot extension:
+# ```json
+# "github.copilot.advanced": {
+# "debug.overrideEngine": "engine", # can be any string.
+# "debug.testOverrideProxyUrl": "http://localhost:9999",
+# "debug.overrideProxyUrl": "http://localhost:9999"
+# }
+# ```
+
+import sys, os
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from time import time
+import logging
+import json
+
+from exllamav2 import (
+    ExLlamaV2,
+    ExLlamaV2Config,
+    ExLlamaV2Cache,
+    ExLlamaV2Tokenizer,
+)
+
+from exllamav2.generator import ExLlamaV2StreamingGenerator, ExLlamaV2Sampler
+
+from fastapi import FastAPI, Request
+from fastapi.responses import StreamingResponse
+from huggingface_hub import snapshot_download
+from typing import (
+    Any,
+    List,
+    Optional,
+)
+from pydantic import BaseModel, Field
+
+log = logging.getLogger("uvicorn")
+log.setLevel("DEBUG")
+app = FastAPI()
+
+# Find one here https://huggingface.co/turboderp
+MODEL_HG_REPO_ID = "TheBloke/CodeLlama-34B-Python-GPTQ"
+
+
+@app.on_event("startup")
+async def startup_event():
+    """
+    Starts up the server, setting the log level and downloading the default model if necessary.
+
+    Adapted from https://github.com/chenhunghan/ialacol/blob/main/main.py
+    """
+    log.info("Starting up...")
+    log.info(
+        "Downloading repo %s to %s/models",
+        MODEL_HG_REPO_ID,
+        os.getcwd(),
+    )
+    snapshot_download(
+        repo_id=MODEL_HG_REPO_ID,
+        cache_dir="models/.cache",
+        local_dir="models",
+        resume_download=True,
+    )
+    log.debug("Creating generator instance...")
+    model_directory = f"{os.getcwd()}/models"
+    config = ExLlamaV2Config()
+    config.model_dir = model_directory
+    config.prepare()
+    tokenizer = ExLlamaV2Tokenizer(config)
+    model = ExLlamaV2(config)
+    cache = ExLlamaV2Cache(model)
+
+    generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
+    # Ensure CUDA is initialized
+    generator.warmup()
+    app.state.generator = generator
+    app.state.tokenizer = tokenizer
+
+
+class CompletionRequestBody(BaseModel):
+    """
+    Adapted from https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py
+    """
+
+    prompt: str = Field(
+        default="", description="The prompt to generate completions for."
+    )
+    max_tokens: Optional[int]
+    temperature: Optional[float]
+    top_p: Optional[float]
+    stop: Optional[List[str] | str]
+    stream: bool = Field()
+    model: str = Field()
+    # llama.cpp specific parameters
+    top_k: Optional[int]
+    repetition_penalty: Optional[float]
+    last_n_tokens: Optional[int]
+    seed: Optional[int]
+    batch_size: Optional[int]
+    threads: Optional[int]
+
+    # ignored or currently unsupported
+    suffix: Any
+    presence_penalty: Any
+    frequency_penalty: Any
+    echo: Any
+    n: Any
+    logprobs: Any
+    best_of: Any
+    logit_bias: Any
+    user: Any
+
+    class Config:
+        arbitrary_types_allowed = True
+
+
+@app.post("/v1/engines/{engine}/completions")
+async def engine_completions(
+    # Can't use a typed body param because FastAPI requires a correct content-type header,
+    # but the copilot client may not send one
+    request: Request,
+    # the only request param sent by the copilot client
+    engine: str,
+):
+    """
+    Adapted from https://github.com/chenhunghan/ialacol/blob/main/main.py
+
+    Similar to https://platform.openai.com/docs/api-reference/completions
+    but with an engine path param under /v1/engines
+    Args:
+        body (CompletionRequestBody): parsed request body
+    Returns:
+        StreamingResponse: streaming response
+    """
+    req_json = await request.json()
+    log.debug("Body:%s", str(req_json))
+
+    body = CompletionRequestBody(**req_json, model=engine)
+    prompt = body.prompt
+    settings = ExLlamaV2Sampler.Settings()
+    settings.temperature = body.temperature if body.temperature else 0.85
+    settings.top_k = body.top_k if body.top_k else 50
+    settings.top_p = body.top_p if body.top_p else 0.8
+    settings.token_repetition_penalty = (
+        body.repetition_penalty if body.repetition_penalty else 1.15
+    )
+    tokenizer = app.state.tokenizer
+    settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id])
+    max_new_tokens = body.max_tokens if body.max_tokens else 1024
+
+    generator = request.app.state.generator
+    generator.set_stop_conditions([tokenizer.eos_token_id])
+
+    input_ids = tokenizer.encode(prompt)
+
+    log.debug("Streaming response from %s", engine)
+
+    def stream():
+        generator.begin_stream(input_ids, settings)
+        generated_tokens = 0
+        while True:
+            chunk, eos, _ = generator.stream()
+            log.debug("Streaming chunk %s", chunk)
+            created = int(time())
+            generated_tokens += 1
+            if eos or generated_tokens == max_new_tokens:
+                stop_data = json.dumps(
+                    {
+                        "id": "id",
+                        "object": "text_completion.chunk",
+                        "created": created,
+                        "model": engine,
+                        "choices": [
+                            {
+                                "text": "",
+                                "index": 0,
+                                "finish_reason": "stop",
+                            }
+                        ],
+                    }
+                )
+                yield f"data: {stop_data}" + "\n\n"
+                break
+            data = json.dumps(
+                {
+                    "id": "id",
+                    "object": "text_completion.chunk",
+                    "created": created,
+                    "model": engine,
+                    "choices": [
+                        {
+                            "text": chunk,
+                            "index": 0,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+            )
+            yield f"data: {data}" + "\n\n"
+
+    return StreamingResponse(
+        stream(),
+        media_type="text/event-stream",
+    )

From 5c7fbe7b752d22ec9a57e435c9d75945bf72f88b Mon Sep 17 00:00:00 2001
From: "Hung-Han (Henry) Chen"
Date: Sat, 16 Sep 2023 11:45:48 +0300
Subject: [PATCH 2/5] Add missing model.load()

Signed-off-by: Hung-Han (Henry) Chen
---
 examples/copilot.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/copilot.py b/examples/copilot.py
index 2c244b6e..4b6388ec 100644
--- a/examples/copilot.py
+++ b/examples/copilot.py
@@ -2,7 +2,7 @@ # This is an example of an HTTP streaming server that supports the GitHub Copilot VSCode extension
 # The server depends on fastapi/uvicorn/huggingface-hub (for auto-downloading the model files). To run the server:
 # 1. `pip install uvicorn fastapi huggingface-hub`
 # 2. `uvicorn copilot:app --reload --host 0.0.0.0 --port 9999`
-# 3. Configure VSCode copilot extension:
+# 3. Configure VSCode copilot extension (in VSCode's settings.json):
 # ```json
 # "github.copilot.advanced": {
 # "debug.overrideEngine": "engine", # can be any string.
@@ -72,6 +72,7 @@ async def startup_event():
     config.prepare()
     tokenizer = ExLlamaV2Tokenizer(config)
     model = ExLlamaV2(config)
+    model.load([16, 24])
     cache = ExLlamaV2Cache(model)
 
     generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)

From 590076e59a1f6b2e6625added119f5be5107e80d Mon Sep 17 00:00:00 2001
From: "Hung-Han (Henry) Chen"
Date: Sat, 16 Sep 2023 12:02:39 +0300
Subject: [PATCH 3/5] Fix cwd path

Signed-off-by: Hung-Han (Henry) Chen
---
 examples/copilot.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/copilot.py b/examples/copilot.py
index 4b6388ec..40836da6 100644
--- a/examples/copilot.py
+++ b/examples/copilot.py
@@ -61,8 +61,8 @@ async def startup_event():
     )
     snapshot_download(
         repo_id=MODEL_HG_REPO_ID,
-        cache_dir="models/.cache",
-        local_dir="models",
+        cache_dir=f"{os.getcwd()}/models/.cache",
+        local_dir=f"{os.getcwd()}/models",
         resume_download=True,
     )
     log.debug("Creating generator instance...")

From 53420723b1aebd5c8df08296fc500100640c6e34 Mon Sep 17 00:00:00 2001
From: "Hung-Han (Henry) Chen"
Date: Sun, 17 Sep 2023 11:13:01 +0300
Subject: [PATCH 4/5] Finally a working version

Signed-off-by: Hung-Han (Henry) Chen
---
 examples/copilot.py | 56 +++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 33 deletions(-)

diff --git a/examples/copilot.py b/examples/copilot.py
index 40836da6..89c8925f 100644
--- a/examples/copilot.py
+++ b/examples/copilot.py
@@ -31,19 +31,15 @@
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse
 from huggingface_hub import snapshot_download
-from typing import (
-    Any,
-    List,
-    Optional,
-)
-from pydantic import BaseModel, Field
+from typing import List, Optional
+from pydantic import BaseModel
 
 log = logging.getLogger("uvicorn")
 log.setLevel("DEBUG")
 app = FastAPI()
 
 # Find one here https://huggingface.co/turboderp
-MODEL_HG_REPO_ID = "TheBloke/CodeLlama-34B-Python-GPTQ"
+MODEL_HG_REPO_ID = "turboderp/CodeLlama-13B-instruct-2.65bpw-h6-exl2"
 
 
 @app.on_event("startup")
@@ -71,15 +67,21 @@ async def startup_event():
     config.model_dir = model_directory
     config.prepare()
     tokenizer = ExLlamaV2Tokenizer(config)
+    log.debug("Creating tokenizer instance...")
     model = ExLlamaV2(config)
+    log.debug("Loading model...")
     model.load([16, 24])
+    log.debug("Creating cache instance...")
     cache = ExLlamaV2Cache(model)
 
+    log.debug("Creating generator instance...")
     generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
     # Ensure CUDA is initialized
+    log.debug("Warming up generator instance...")
     generator.warmup()
     app.state.generator = generator
     app.state.tokenizer = tokenizer
+    log.debug("Generator instance created.")
 
 
 class CompletionRequestBody(BaseModel):
@@ -87,33 +89,17 @@ class CompletionRequestBody(BaseModel):
     Adapted from https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py
     """
 
-    prompt: str = Field(
-        default="", description="The prompt to generate completions for."
-    )
-    max_tokens: Optional[int]
-    temperature: Optional[float]
-    top_p: Optional[float]
-    stop: Optional[List[str] | str]
-    stream: bool = Field()
-    model: str = Field()
+    prompt: str = ""
+    max_tokens: Optional[int] = 99999
+    temperature: Optional[float] = 0.85
+    top_p: Optional[float] = 0.8
+    stop: Optional[List[str] | str] = ["\ndef ", "\nclass ", "\nif ", "\n\n#"]
+    stream: bool = True
+    model: str = ""
     # llama.cpp specific parameters
-    top_k: Optional[int]
-    repetition_penalty: Optional[float]
-    last_n_tokens: Optional[int]
-    seed: Optional[int]
-    batch_size: Optional[int]
-    threads: Optional[int]
-
-    # ignored or currently unsupported
-    suffix: Any
-    presence_penalty: Any
-    frequency_penalty: Any
-    echo: Any
-    n: Any
-    logprobs: Any
-    best_of: Any
-    logit_bias: Any
-    user: Any
+    top_k: Optional[int] = 50
+
+    repetition_penalty: Optional[float] = 1.15
 
     class Config:
         arbitrary_types_allowed = True
@@ -144,11 +130,15 @@ async def engine_completions(
     prompt = body.prompt
     settings = ExLlamaV2Sampler.Settings()
     settings.temperature = body.temperature if body.temperature else 0.85
+    log.debug("temperature:%s", settings.temperature)
     settings.top_k = body.top_k if body.top_k else 50
+    log.debug("top_k:%s", settings.top_k)
     settings.top_p = body.top_p if body.top_p else 0.8
+    log.debug("top_p:%s", settings.top_p)
     settings.token_repetition_penalty = (
         body.repetition_penalty if body.repetition_penalty else 1.15
     )
+    log.debug("token_repetition_penalty:%s", settings.token_repetition_penalty)
     tokenizer = app.state.tokenizer
     settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id])
     max_new_tokens = body.max_tokens if body.max_tokens else 1024

From dee6afe97b989012cc450f537c6f142e372af556 Mon Sep 17 00:00:00 2001
From: "Hung-Han (Henry) Chen"
Date: Sun, 17 Sep 2023 11:30:51 +0300
Subject: [PATCH 5/5] 34B

Signed-off-by: Hung-Han (Henry) Chen
---
 examples/copilot.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/copilot.py b/examples/copilot.py
index 89c8925f..519f96ad 100644
--- a/examples/copilot.py
+++ b/examples/copilot.py
@@ -39,7 +39,7 @@
 app = FastAPI()
 
 # Find one here https://huggingface.co/turboderp
-MODEL_HG_REPO_ID = "turboderp/CodeLlama-13B-instruct-2.65bpw-h6-exl2"
+MODEL_HG_REPO_ID = "turboderp/CodeLlama-34B-instruct-3.0bpw-h6-exl2"
 
 
 @app.on_event("startup")
@@ -96,7 +96,6 @@ class CompletionRequestBody(BaseModel):
     stop: Optional[List[str] | str] = ["\ndef ", "\nclass ", "\nif ", "\n\n#"]
     stream: bool = True
     model: str = ""
-    # llama.cpp specific parameters
     top_k: Optional[int] = 50
 
     repetition_penalty: Optional[float] = 1.15
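
As a quick smoke test of the streaming endpoint outside of VSCode, a minimal client along the following lines can be used. This is a sketch only, not part of the patch series: it assumes the server above is running at http://localhost:9999, uses "engine" as the engine path segment (any string works, mirroring `debug.overrideEngine`), and adds `requests` as an extra dependency (`pip install requests`); the file name `copilot_client.py` is hypothetical.

```python
# copilot_client.py (hypothetical name) -- minimal smoke test for the copilot.py example server.
# Assumes the server is running at http://localhost:9999 and that `requests` is installed.
import json

import requests

resp = requests.post(
    "http://localhost:9999/v1/engines/engine/completions",
    json={"prompt": "def fibonacci(n):", "max_tokens": 64, "stream": True},
    stream=True,
    timeout=600,
)
resp.raise_for_status()

for line in resp.iter_lines(decode_unicode=True):
    # The server emits Server-Sent Events: "data: {...}" lines separated by blank lines.
    if not line or not line.startswith("data: "):
        continue
    event = json.loads(line[len("data: "):])
    choice = event["choices"][0]
    print(choice["text"], end="", flush=True)
    if choice["finish_reason"] == "stop":
        break
print()
```

Each `data:` line carries the same `text_completion.chunk` JSON shape that `stream()` in the example emits, so the client simply concatenates `choices[0]["text"]` until it sees `finish_reason == "stop"`.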