From 912f8aab3d8d3ae27ce9274ee046eb74b209b3ff Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Sat, 21 Oct 2023 22:47:48 -0700 Subject: [PATCH] fix(docs): add installation guide on modal (#608) * fix(docs): add installation guide on modal * remove modal-deploy --- .../docs/installation/modal}/app.py | 15 +-- website/docs/installation/modal/index.md | 124 ++++++++++++++++++ website/package.json | 1 + website/yarn.lock | 8 ++ 4 files changed, 137 insertions(+), 11 deletions(-) rename {experimental/modal-deploy => website/docs/installation/modal}/app.py (88%) create mode 100644 website/docs/installation/modal/index.md diff --git a/experimental/modal-deploy/app.py b/website/docs/installation/modal/app.py similarity index 88% rename from experimental/modal-deploy/app.py rename to website/docs/installation/modal/app.py index 794a80b201a2..3654382986b2 100644 --- a/experimental/modal-deploy/app.py +++ b/website/docs/installation/modal/app.py @@ -1,11 +1,4 @@ -""" -modal serve app.py -""" - -from pathlib import Path - -import modal -from modal import Image, Mount, Secret, Stub, asgi_app, gpu, method +from modal import Image, Stub, asgi_app, gpu IMAGE_NAME = "tabbyml/tabby:0.3.1" MODEL_ID = "TabbyML/StarCoder-1B" @@ -27,7 +20,7 @@ def download_model(): image = ( Image.from_registry( - "tabbyml/tabby:0.3.1", + IMAGE_NAME, add_python="3.11", ) .dockerfile_commands("ENTRYPOINT []") @@ -65,7 +58,7 @@ def app(): ) # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs. 
- def webserver_ready():
+ def tabby_ready():
try: socket.create_connection(("127.0.0.1", 8000), timeout=1).close() return True except (socket.timeout, ConnectionRefusedError): @@ -77,7 +70,7 @@ def webserver_ready(): raise RuntimeError(f"launcher exited unexpectedly with code {retcode}") return False - while not webserver_ready(): + while not tabby_ready(): time.sleep(1.0) print("Tabby server ready!") diff --git a/website/docs/installation/modal/index.md b/website/docs/installation/modal/index.md new file mode 100644 index 000000000000..accab3b0af44 --- /dev/null +++ b/website/docs/installation/modal/index.md @@ -0,0 +1,124 @@ +# Modal + +Modal is a serverless GPU provider. By leveraging Modal, your Tabby instance will run on demand. When there are no requests to the Tabby server for a certain amount of time, Modal will schedule the container to sleep, thereby saving GPU costs. + +## Setup + +First, we import the components we need from `modal`. + +```python +from modal import Image, Mount, Secret, Stub, asgi_app, gpu, method +``` + +Next, we set the base Docker image version and the model to serve, taking care to specify the GPU configuration required to fit the model into VRAM. + +```python +MODEL_ID = "TabbyML/StarCoder-1B" +GPU_CONFIG = gpu.T4() +``` + +## Define the container image + +We want to create a Modal image which has the Tabby model cache pre-populated. The benefit of this is that the container no longer has to re-download the model - instead, it will take advantage of Modal’s internal filesystem for faster cold starts. + +### Download the weights + +```python +def download_model(): + import subprocess + + subprocess.run( + [ + "/opt/tabby/bin/tabby", + "download", + "--model", + MODEL_ID, + ] + ) +``` + + +### Image definition + +We’ll start from an image provided by Tabby, and override the default ENTRYPOINT for Modal to run its own, which enables seamless serverless deployments. + +Next, we run the download step to pre-populate the image with our model weights. 
+ +Finally, we install the `asgi-proxy-lib` package to interface with Modal's ASGI webserver over localhost. + +```python +image = ( + Image.from_registry( + "tabbyml/tabby:0.3.1", + add_python="3.11", + ) + .dockerfile_commands("ENTRYPOINT []") + .run_function(download_model) + .pip_install("asgi-proxy-lib") +) +``` + +### The app function + +The endpoint function is represented with Modal's `@stub.function`. Here, we: + +1. Launch the Tabby process and wait for it to be ready to accept requests. +2. Create an ASGI proxy to tunnel requests from the Modal web endpoint to the local Tabby server. +3. Specify that each container is allowed to handle up to 10 requests simultaneously. +4. Keep idle containers for 2 minutes before spinning them down. + +```python +@stub.function( + gpu=GPU_CONFIG, + allow_concurrent_inputs=10, + container_idle_timeout=120, + timeout=360, +) +@asgi_app() +def app(): + import socket + import subprocess + import time + from asgi_proxy import asgi_proxy + + launcher = subprocess.Popen( + [ + "/opt/tabby/bin/tabby", + "serve", + "--model", + MODEL_ID, + "--port", + "8000", + "--device", + "cuda", + ] + ) + + # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs. + def tabby_ready(): + try: + socket.create_connection(("127.0.0.1", 8000), timeout=1).close() + return True + except (socket.timeout, ConnectionRefusedError): + # Check if launcher webserving process has exited. + # If so, a connection can never be made. 
+ retcode = launcher.poll() + if retcode is not None: + raise RuntimeError(f"launcher exited unexpectedly with code {retcode}") + return False + + while not tabby_ready(): + time.sleep(1.0) + + print("Tabby server ready!") + return asgi_proxy("http://localhost:8000") +``` + +### Serve the app + +Once we deploy this app with `modal serve app.py`, it will output the URL of the web endpoint, in the form of `https://<username>--tabby-server-starcoder-1b-app-dev.modal.run`. This URL can be used as the Tabby server endpoint in the Tabby editor extensions! + +See [app.py](https://github.com/TabbyML/tabby/tree/main/website/docs/installation/modal/app.py) for a complete example. + +## Feedback and support +If you have improvement suggestions or need specific support, please join [Tabby Slack community](https://join.slack.com/t/tabbycommunity/shared_invite/zt-1xeiddizp-bciR2RtFTaJ37RBxr8VxpA) or reach out on [Tabby’s GitHub repository](https://github.com/TabbyML/tabby). diff --git a/website/package.json b/website/package.json index e8592fafb174..98a32d9dfb63 100644 --- a/website/package.json +++ b/website/package.json @@ -25,6 +25,7 @@ "postcss": "^8.4.24", "posthog-docusaurus": "^2.0.0", "prism-react-renderer": "^1.3.5", + "raw-loader": "^4.0.2", "react": "^17.0.2", "react-dom": "^17.0.2", "tailwindcss": "^3.3.2", diff --git a/website/yarn.lock b/website/yarn.lock index 674c12521eab..afb00735bd71 100644 --- a/website/yarn.lock +++ b/website/yarn.lock @@ -6893,6 +6893,14 @@ raw-body@2.5.1: iconv-lite "0.4.24" unpipe "1.0.0" +raw-loader@^4.0.2: + version "4.0.2" + resolved "https://registry.yarnpkg.com/raw-loader/-/raw-loader-4.0.2.tgz#1aac6b7d1ad1501e66efdac1522c73e59a584eb6" + integrity sha512-ZnScIV3ag9A4wPX/ZayxL/jZH+euYb6FcUinPcgiQW0+UBtEv0O6Q3lGd3cqJ+GHH+rksEv3Pj99oxJ3u3VIKA== + dependencies: + loader-utils "^2.0.0" + schema-utils "^3.0.0" + rc@1.2.8, rc@^1.2.8: version "1.2.8" resolved "https://registry.yarnpkg.com/rc/-/rc-1.2.8.tgz#cd924bf5200a075b83c188cd6b9e211b7fc0d3ed"