docs: Update the Tabby Deployment with Modal Tutorial (#2641)
* docs: Update the Tabby Deployment with Modal Tutorial

* Update modal tutorial: cache CHAT_MODEL_ID
moqimoqidea authored Jul 15, 2024
1 parent cc05634 commit 3362daa
Showing 3 changed files with 50 additions and 24 deletions.
4 changes: 2 additions & 2 deletions website/docs/quick-start/installation/modal/app-running.png
Binary file not shown.
31 changes: 21 additions & 10 deletions website/docs/quick-start/installation/modal/app.py
@@ -1,23 +1,30 @@
 """Usage:
 modal serve app.py
+To force a rebuild by pulling the latest image tag, use:
+MODAL_FORCE_BUILD=1 modal serve app.py
 """
 
-from modal import Image, Stub, asgi_app, gpu
+from modal import Image, App, asgi_app, gpu
 
 IMAGE_NAME = "tabbyml/tabby"
 MODEL_ID = "TabbyML/StarCoder-1B"
+CHAT_MODEL_ID = "TabbyML/Qwen2-1.5B-Instruct"
+EMBEDDING_MODEL_ID = "TabbyML/Nomic-Embed-Text"
 GPU_CONFIG = gpu.T4()
 
+TABBY_BIN = "/opt/tabby/bin/tabby"
+
 
-def download_model():
+def download_model(model_id: str):
     import subprocess
 
     subprocess.run(
         [
-            "/opt/tabby/bin/tabby-cpu",
+            TABBY_BIN,
             "download",
             "--model",
-            MODEL_ID,
+            model_id,
         ]
     )
 
@@ -28,38 +35,42 @@ def download_model():
         add_python="3.11",
     )
     .dockerfile_commands("ENTRYPOINT []")
-    .run_function(download_model)
+    .run_function(download_model, kwargs={"model_id": EMBEDDING_MODEL_ID})
+    .run_function(download_model, kwargs={"model_id": CHAT_MODEL_ID})
+    .run_function(download_model, kwargs={"model_id": MODEL_ID})
     .pip_install("asgi-proxy-lib")
 )
 
-stub = Stub("tabby-server-" + MODEL_ID.split("/")[-1], image=image)
+app = App("tabby-server", image=image)
 
 
-@stub.function(
+@app.function(
     gpu=GPU_CONFIG,
     allow_concurrent_inputs=10,
     container_idle_timeout=120,
     timeout=360,
 )
 @asgi_app()
-def app():
+def app_serve():
     import socket
     import subprocess
     import time
     from asgi_proxy import asgi_proxy
 
     launcher = subprocess.Popen(
         [
-            "/opt/tabby/bin/tabby",
+            TABBY_BIN,
             "serve",
             "--model",
             MODEL_ID,
+            "--chat-model",
+            CHAT_MODEL_ID,
             "--port",
             "8000",
             "--device",
             "cuda",
             "--parallelism",
-            "4",
+            "1",
         ]
     )
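The hunk above ends before the rest of `app_serve`: the code that waits for Tabby to come up and then proxies traffic to it. A minimal sketch of what such a readiness loop can look like, assuming the server listens on port 8000 inside the container (this is the general shape, not the exact elided code):

```python
def wait_for_tabby(host: str = "127.0.0.1", port: int = 8000, timeout: float = 60.0) -> None:
    """Block until the Tabby server accepts TCP connections, or raise."""
    import socket
    import time

    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            # A successful connection means `tabby serve` is listening on
            # the port that the ASGI proxy will forward to.
            socket.create_connection((host, port), timeout=1).close()
            return
        except OSError:
            time.sleep(0.5)
    raise TimeoutError("Tabby server did not become ready in time")
```

Once such a loop returns, the function can hand requests over with `return asgi_proxy("http://localhost:8000")`, which is what the `socket`, `time`, and `asgi_proxy` imports in the diff are for.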
39 changes: 27 additions & 12 deletions website/docs/quick-start/installation/modal/index.md
@@ -7,22 +7,27 @@
 First we import the components we need from `modal`.
 
 ```python
-from modal import Image, Stub, asgi_app, gpu
+from modal import Image, App, asgi_app, gpu
 ```
 
 Next, we set the base docker image version, which model to serve, taking care to specify the GPU configuration required to fit the model into VRAM.
 
 ```python
 IMAGE_NAME = "tabbyml/tabby"
 MODEL_ID = "TabbyML/StarCoder-1B"
+CHAT_MODEL_ID = "TabbyML/Qwen2-1.5B-Instruct"
+EMBEDDING_MODEL_ID = "TabbyML/Nomic-Embed-Text"
 GPU_CONFIG = gpu.T4()
+
+TABBY_BIN = "/opt/tabby/bin/tabby"
 ```
 
 Currently supported GPUs in Modal:
 
 - `T4`: Low-cost GPU option, providing 16GiB of GPU memory.
 - `L4`: Mid-tier GPU option, providing 24GiB of GPU memory.
 - `A100`: The most powerful GPU available in the cloud. Available in 40GiB and 80GiB GPU memory configurations.
 - `H100`: The flagship data center GPU of the Hopper architecture. Enhanced support for FP8 precision and a Transformer Engine that provides up to 4X faster training over the prior generation for GPT-3 (175B) models.
 - `A10G`: A10G GPUs deliver up to 3.3x better ML training performance, 3x better ML inference performance, and 3x better graphics performance, in comparison to NVIDIA T4 GPUs.
 - `Any`: Selects any one of the GPU classes available within Modal, according to availability.
 
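Choosing a GPU from this list is a one-line change to the constants above. A hedged example, with an illustrative model ID and GPU choice that are not part of this commit: a 7B-class model does not fit comfortably in a T4's 16GiB, so step up to a 24GiB card:

```python
# Illustrative swap, not from this commit: a larger completion model needs
# more VRAM. gpu.A10G() is one of the Modal GPU classes listed above; the
# model ID is assumed to exist in Tabby's model registry.
MODEL_ID = "TabbyML/CodeLlama-7B"
GPU_CONFIG = gpu.A10G()
```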
@@ -35,15 +40,15 @@ We want to create a Modal image which has the Tabby model cache pre-populated. T
 ### Download the weights
 
 ```python
-def download_model():
+def download_model(model_id: str):
     import subprocess
 
     subprocess.run(
         [
-            "/opt/tabby/bin/tabby-cpu",
+            TABBY_BIN,
             "download",
             "--model",
-            MODEL_ID,
+            model_id,
         ]
     )
 ```
@@ -64,45 +69,53 @@ image = (
         add_python="3.11",
     )
     .dockerfile_commands("ENTRYPOINT []")
-    .run_function(download_model)
+    .run_function(download_model, kwargs={"model_id": EMBEDDING_MODEL_ID})
+    .run_function(download_model, kwargs={"model_id": CHAT_MODEL_ID})
+    .run_function(download_model, kwargs={"model_id": MODEL_ID})
     .pip_install("asgi-proxy-lib")
 )
 ```
 
 ### The app function
 
-The endpoint function is represented with Modal's `@stub.function`. Here, we:
+The endpoint function is represented with Modal's `@app.function`. Here, we:
 
 1. Launch the Tabby process and wait for it to be ready to accept requests.
 2. Create an ASGI proxy to tunnel requests from the Modal web endpoint to the local Tabby server.
 3. Specify that each container is allowed to handle up to 10 requests simultaneously.
 4. Keep idle containers for 2 minutes before spinning them down.
 
 ```python
-stub = Stub("tabby-server-" + MODEL_ID.split("/")[-1], image=image)
-@stub.function(
+app = App("tabby-server", image=image)
+
+
+@app.function(
     gpu=GPU_CONFIG,
     allow_concurrent_inputs=10,
     container_idle_timeout=120,
     timeout=360,
 )
 @asgi_app()
-def app():
+def app_serve():
     import socket
     import subprocess
     import time
     from asgi_proxy import asgi_proxy
 
     launcher = subprocess.Popen(
         [
-            "/opt/tabby/bin/tabby",
+            TABBY_BIN,
             "serve",
             "--model",
             MODEL_ID,
+            "--chat-model",
+            CHAT_MODEL_ID,
             "--port",
             "8000",
             "--device",
             "cuda",
             "--parallelism",
-            "4",
+            "1",
         ]
     )
 
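The decorator arguments map directly onto items 3 and 4 of the list above. If cold starts are a concern, a hedged variant; `keep_warm` is a Modal function parameter that this commit does not use:

```python
# Illustrative variant, not from this commit: keep one container warm so the
# first request after a quiet period does not pay the model-loading cost.
# Note that a warm GPU container is billed even while idle.
@app.function(
    gpu=GPU_CONFIG,
    allow_concurrent_inputs=10,   # item 3: up to 10 in-flight requests per container
    container_idle_timeout=120,   # item 4: spin down after 2 idle minutes
    timeout=360,
    keep_warm=1,                  # assumption: keep one replica always running
)
@asgi_app()
def app_serve():
    ...
```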
@@ -128,9 +141,11 @@ def app():
 ### Serve the app
 
-Once we deploy this model with `modal serve app.py`, it will output the URL of the web endpoint, in the form of `https://<USERNAME>--tabby-server-starcoder-1b-app-dev.modal.run`.
+Once we deploy this model with `modal serve app.py`, it will output the URL of the web endpoint, in the form of `https://<USERNAME>--tabby-server-app-serve-dev.modal.run`.
+
+If you encounter any issues, particularly related to caching, you can force a rebuild by running `MODAL_FORCE_BUILD=1 modal serve app.py`. This ensures that the latest image tag is used by ignoring cached layers.
 
 ![App Running](./app-running.png)
 
 Now it can be used as the Tabby server URL in Tabby editor extensions!
-See [app.py](https://github.com/TabbyML/tabby/blob/main/website/docs/installation/modal/app.py) for the full code used in this tutorial.
+See [app.py](https://github.com/TabbyML/tabby/blob/main/website/docs/quick-start/installation/modal/app.py) for the full code used in this tutorial.
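Before pointing an editor extension at the endpoint, the deployment can be smoke-tested from Python. A minimal sketch using `requests`; the base URL is the placeholder printed by `modal serve`, and the request shape follows Tabby's `/v1/health` and `/v1/completions` HTTP API:

```python
import requests

# Substitute the endpoint URL printed by `modal serve app.py`.
BASE_URL = "https://<USERNAME>--tabby-server-app-serve-dev.modal.run"

# /v1/health reports the models the server is running.
print(requests.get(f"{BASE_URL}/v1/health", timeout=30).json())

# Request a completion for a small Python prefix.
resp = requests.post(
    f"{BASE_URL}/v1/completions",
    json={"language": "python", "segments": {"prefix": "def fib(n):\n    "}},
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```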
