Add Tabby Serve Eval V2. #2698

Open · wants to merge 19 commits into main

Commits (19)

582507e
Eval-V2-Modal: init eval tabby serve v2.
moqimoqidea Jul 14, 2024
9e0529b
Update eval v2: Centralize env var setup in image config.
moqimoqidea Jul 16, 2024
b707007
rename code completion name.
moqimoqidea Jul 16, 2024
ae35f1a
rename compte metrics
moqimoqidea Jul 16, 2024
e61d9e3
Eval-V2-Modal: rename compute py file names.
moqimoqidea Jul 16, 2024
b891e52
Eval-V2-Modal: add cpu/memory limit; use output_prediction_jsonl_file…
moqimoqidea Jul 16, 2024
5e2e39d
Eval-V2-Modal: set compute_metrics.py output evaluation jsonl file; a…
moqimoqidea Jul 16, 2024
0a3a835
Eval-V2-Modal: add 20240716-StarCoder-1B-evaluation.jsonl
moqimoqidea Jul 16, 2024
18531f7
Eval-V2-Modal: code optimization
moqimoqidea Jul 16, 2024
0302d78
Eval-V2-Modal: fix calling evaluation function parameter error
moqimoqidea Jul 17, 2024
c12632a
rename need_manager_modal to start_tabby_server_on_modal
moqimoqidea Jul 17, 2024
b8bdcde
Eval-V2-Modal: rename need_manager_modal to start_tabby_server_on_modal
moqimoqidea Jul 17, 2024
737fd91
Eval-V2-Modal: set python file 755 permissions
moqimoqidea Jul 17, 2024
b9bb4b7
Eval-V2-Modal: add transform_legacy_data.py
moqimoqidea Jul 17, 2024
350f9d9
Eval-V2-Modal: add two types requests to eval code completion.
moqimoqidea Jul 17, 2024
7cbc62f
Eval-V2-Modal: resolve a combined data error in the cross_file_data_f…
moqimoqidea Jul 17, 2024
4733c7f
Eval-V2-Modal: add pands-escape-enforce-problem.py
moqimoqidea Jul 17, 2024
c4a7809
Eval-V2-Modal: add CodeLlama-13B eval results.
moqimoqidea Jul 17, 2024
a86d0fe
Eval-V2-Modal: use segments.crossfile_context.text to compute code co…
moqimoqidea Jul 17, 2024
106 changes: 106 additions & 0 deletions python/tabby-eval/v2-modal/README.md
@@ -0,0 +1,106 @@
# Evaluating Code Completion Quality

## Introduction

This directory contains four Python scripts for evaluating code completion quality:

* `compute_code_completion.py`: Calls a Tabby server to generate completions for a JSONL dataset and evaluates their quality.
* `compute_metrics.py`: Computes evaluation metrics from a prediction JSONL file (predictions vs. ground truth).
* `avg_metrics.py`: Averages the per-example metrics in an evaluation JSONL file.
* `app.py`: A standalone Tabby server deployed on Modal.

## Usage
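
The typical flow: `compute_code_completion.py` queries the Tabby server and writes prediction and evaluation JSONL files; `compute_metrics.py` can turn an existing prediction file into an evaluation file; `avg_metrics.py` then averages the metrics in an evaluation file.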

Run the `compute_code_completion.py` script directly. Here’s an example:

```bash
python compute_code_completion.py \
--endpoint https://moqimoqidea--tabby-server-app-serve-dev.modal.run \
--token auth_f1bd0151d4ff4dc6b0ea56cfc82a8b82 \
--model TabbyML/StarCoder-1B \
--jsonl_file data/data-1.jsonl \
--output_jsonl_file_prefix data/20240717-StarCoder-1B
```

This script will call the Tabby service and evaluate the quality of code completion. The script’s parameters are as follows:

```bash
python compute_code_completion.py -h
usage: compute_code_completion.py [-h] --endpoint ENDPOINT --token TOKEN --model MODEL [--jsonl_file JSONL_FILE]
                                  [--output_jsonl_file_prefix OUTPUT_JSONL_FILE_PREFIX]
                                  [--start_tabby_server_on_modal START_TABBY_SERVER_ON_MODAL]

eval tabby code completion.

options:
  -h, --help            show this help message and exit
  --endpoint ENDPOINT   tabby server endpoint.
  --token TOKEN         tabby server token.
  --model MODEL         evaluation model.
  --jsonl_file JSONL_FILE
                        evaluation jsonl file.
  --output_jsonl_file_prefix OUTPUT_JSONL_FILE_PREFIX
                        output jsonl file prefix; generates four files: prediction, evaluation,
                        cross_file_content_prediction, cross_file_content_evaluation.
  --start_tabby_server_on_modal START_TABBY_SERVER_ON_MODAL
                        whether to start a Tabby server on Modal; accepts 1 or 0.
```
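
For reference, each prediction is produced by posting an example's prefix/suffix segments to the Tabby completion endpoint and recording the returned text. Below is a minimal sketch of a single such request against the example endpoint above; the request shape follows Tabby's `/v1/completions` API, and the exact fields `compute_code_completion.py` sends are an assumption here.

```python
import requests

# Example endpoint and token copied from the README invocation above.
ENDPOINT = "https://moqimoqidea--tabby-server-app-serve-dev.modal.run"
TOKEN = "auth_f1bd0151d4ff4dc6b0ea56cfc82a8b82"

response = requests.post(
    f"{ENDPOINT}/v1/completions",
    headers={"Authorization": f"Bearer {TOKEN}"},
    json={
        "language": "python",
        "segments": {
            # Hypothetical example; the script fills these from each JSONL record.
            "prefix": "def fib(n):\n    if n <= 1:\n        return n\n    ",
            "suffix": "\n",
        },
    },
    timeout=60,
)
response.raise_for_status()
prediction = response.json()["choices"][0]["text"]
print(prediction)
```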

If you already have a Tabby service running, you can set the `start_tabby_server_on_modal` parameter to 0 to avoid starting a standalone Tabby service. Example:

```bash
python compute_code_completion.py \
--endpoint https://moqimoqidea--tabby-server-app-serve-dev.modal.run \
--token auth_f1bd0151d4ff4dc6b0ea56cfc82a8b82 \
--model TabbyML/StarCoder-1B \
--jsonl_file data.jsonl \
--output_jsonl_file_prefix data/20240717-StarCoder-1B \
--start_tabby_server_on_modal 0
```

If you have a JSONL file with code completion results, you can use the `compute_metrics.py` script. Example:

```bash
python compute_metrics.py \
--prediction_jsonl_file data/20240717-StarCoder-1B-prediction.jsonl \
--output_evaluation_jsonl_file data/20240717-StarCoder-1B-evaluation.jsonl
```

The script’s parameters are as follows:

```bash
python compute_metrics.py -h
usage: compute_metrics.py [-h] [--prediction_jsonl_file PREDICTION_JSONL_FILE]
                          [--output_evaluation_jsonl_file OUTPUT_EVALUATION_JSONL_FILE]

eval tabby code completion jsonl.

options:
  -h, --help            show this help message and exit
  --prediction_jsonl_file PREDICTION_JSONL_FILE
                        prediction jsonl file.
  --output_evaluation_jsonl_file OUTPUT_EVALUATION_JSONL_FILE
                        output evaluation jsonl file.
```
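
The metric fields written to the evaluation file are the same columns that `avg_metrics.py` later averages: `block_accuracy`, `block_edit_distance`, `line_accuracy`, and `avg_line_edit_distance`. Below is a rough, illustrative sketch of how such block- and line-level metrics can be computed for one prediction/ground-truth pair; the exact definitions live in `compute_metrics.py` and may differ.

```python
from difflib import SequenceMatcher


def similarity(a: str, b: str) -> float:
    # Normalized similarity in [0, 1]; used here as a stand-in for an edit-distance-based score.
    return SequenceMatcher(None, a, b).ratio()


def evaluate_pair(prediction: str, groundtruth: str) -> dict:
    pred, truth = prediction.strip(), groundtruth.strip()
    pred_lines, truth_lines = pred.splitlines(), truth.splitlines()
    pairs = list(zip(pred_lines, truth_lines))

    return {
        # Block level: compare the whole completion at once.
        "block_accuracy": float(pred == truth),
        # "distance" is approximated here by a similarity ratio; the real script may use a true edit distance.
        "block_edit_distance": similarity(pred, truth),
        # Line level: compare the completion line by line against the ground truth.
        "line_accuracy": sum(p == t for p, t in pairs) / len(truth_lines) if truth_lines else 0.0,
        "avg_line_edit_distance": sum(similarity(p, t) for p, t in pairs) / len(pairs) if pairs else 0.0,
    }


print(evaluate_pair("return a + b", "return a + b"))
```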

If you have a JSONL file with evaluation results, you can use the `avg_metrics.py` script. Example:

```bash
python avg_metrics.py --evaluation_jsonl_file data/20240717-StarCoder-1B-evaluation.jsonl
```

The script’s parameters are as follows:

```bash
python avg_metrics.py -h
usage: avg_metrics.py [-h] [--evaluation_jsonl_file EVALUATION_JSONL_FILE]

avg tabby code completion metrics.

options:
  -h, --help            show this help message and exit
  --evaluation_jsonl_file EVALUATION_JSONL_FILE
                        evaluation jsonl file.
```

Feel free to reach out if you have any questions or need further assistance!
108 changes: 108 additions & 0 deletions python/tabby-eval/v2-modal/app.py
@@ -0,0 +1,108 @@
"""Usage:
modal serve app.py

To force a rebuild by pulling the latest image tag, use:
MODAL_FORCE_BUILD=1 modal serve app.py
"""

import os

from modal import Image, App, asgi_app, gpu, Volume

IMAGE_NAME = "tabbyml/tabby"
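# MODEL_ID and EMBEDDING_MODEL_ID must be set in the environment when running `modal serve app.py`;
# they are baked into the image env below and the corresponding models are downloaded at image build time.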
EMBEDDING_MODEL_ID = os.getenv("EMBEDDING_MODEL_ID")
MODEL_ID = os.getenv("MODEL_ID")

# TODO: If you're testing large datasets, consider using a better GPU and higher parallelism
GPU_CONFIG = gpu.A10G()

TABBY_BIN = "/opt/tabby/bin/tabby"


def download_model(model_id: str):
    import subprocess

    subprocess.run(
        [
            TABBY_BIN,
            "download",
            "--model",
            model_id,
        ]
    )


image = (
    Image.from_registry(
        IMAGE_NAME,
        add_python="3.11",
    )
    .env({
        "TABBY_MODEL_CACHE_ROOT": "/models",
        "MODEL_ID": MODEL_ID,
        "EMBEDDING_MODEL_ID": EMBEDDING_MODEL_ID
    })
    .dockerfile_commands("ENTRYPOINT []")
    .run_function(download_model, kwargs={"model_id": EMBEDDING_MODEL_ID})
    .run_function(download_model, kwargs={"model_id": MODEL_ID})
    .pip_install("asgi-proxy-lib")
)

app = App("tabby-server", image=image)

data_volume = Volume.from_name("tabby-data", create_if_missing=True)
data_dir = "/data"


@app.function(
    gpu=GPU_CONFIG,
    allow_concurrent_inputs=10,
    container_idle_timeout=120,
    timeout=360,
    volumes={data_dir: data_volume},
    _allow_background_volume_commits=True,
    concurrency_limit=1,
    cpu=20.0,
    memory=24 * 1024,
    keep_warm=1,
)
@asgi_app()
def app_serve():
    import socket
    import subprocess
    import time
    from asgi_proxy import asgi_proxy

    launcher = subprocess.Popen(
        [
            TABBY_BIN,
            "serve",
            "--model",
            os.getenv("MODEL_ID"),
            "--port",
            "8000",
            "--device",
            "cuda",
            "--parallelism",
            "1",
        ]
    )

    # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
    def tabby_ready():
        try:
            socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
            return True
        except (socket.timeout, ConnectionRefusedError):
            # Check if launcher webserving process has exited.
            # If so, a connection can never be made.
            retcode = launcher.poll()
            if retcode is not None:
                raise RuntimeError(f"launcher exited unexpectedly with code {retcode}")
            return False

    while not tabby_ready():
        time.sleep(1.0)

    print("Tabby server ready!")
    return asgi_proxy("http://localhost:8000")
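
Once `modal serve app.py` prints the app URL, the proxied Tabby server can be sanity-checked before running an evaluation. A small check against the example endpoint from the README; Tabby exposes a `/v1/health` endpoint, though the exact response fields may vary by version.

```python
import requests

# Example endpoint and token taken from the README above.
ENDPOINT = "https://moqimoqidea--tabby-server-app-serve-dev.modal.run"
TOKEN = "auth_f1bd0151d4ff4dc6b0ea56cfc82a8b82"

resp = requests.get(f"{ENDPOINT}/v1/health", headers={"Authorization": f"Bearer {TOKEN}"})
resp.raise_for_status()
print(resp.json())  # model, device, and version information
```
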
31 changes: 31 additions & 0 deletions python/tabby-eval/v2-modal/avg_metrics.py
@@ -0,0 +1,31 @@
import argparse
import logging

import pandas as pd

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)


def avg_compute(evaluation_jsonl_file):
    df = pd.read_json(evaluation_jsonl_file, lines=True)

    avg_results = {
        "block_accuracy": df["block_accuracy"].mean(),
        "block_edit_distance": df["block_edit_distance"].mean(),
        "line_accuracy": df["line_accuracy"].mean(),
        "avg_line_edit_distance": df["avg_line_edit_distance"].mean(),
    }

    logging.info(f"Average results: {avg_results}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="avg tabby code completion metrics.")
    parser.add_argument("--evaluation_jsonl_file", type=str, help="evaluation jsonl file.")

    args = parser.parse_args()
    avg_compute(args.evaluation_jsonl_file)