diff --git a/experimental/modal-deploy/app.py b/website/docs/installation/modal/app.py similarity index 88% rename from experimental/modal-deploy/app.py rename to website/docs/installation/modal/app.py index 794a80b201a2..3654382986b2 100644 --- a/experimental/modal-deploy/app.py +++ b/website/docs/installation/modal/app.py @@ -1,11 +1,4 @@ -""" -modal serve app.py -""" - -from pathlib import Path - -import modal -from modal import Image, Mount, Secret, Stub, asgi_app, gpu, method +from modal import Image, Stub, asgi_app, gpu IMAGE_NAME = "tabbyml/tabby:0.3.1" MODEL_ID = "TabbyML/StarCoder-1B" @@ -27,7 +20,7 @@ def download_model(): image = ( Image.from_registry( - "tabbyml/tabby:0.3.1", + IMAGE_NAME, add_python="3.11", ) .dockerfile_commands("ENTRYPOINT []") @@ -65,7 +58,7 @@ def app(): ) # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs. - def webserver_ready(): + def tabby_ready(): try: socket.create_connection(("127.0.0.1", 8000), timeout=1).close() return True @@ -77,7 +70,7 @@ def webserver_ready(): raise RuntimeError(f"launcher exited unexpectedly with code {retcode}") return False - while not webserver_ready(): + while not tabby_ready(): time.sleep(1.0) print("Tabby server ready!") diff --git a/website/docs/installation/modal/index.md b/website/docs/installation/modal/index.md new file mode 100644 index 000000000000..accab3b0af44 --- /dev/null +++ b/website/docs/installation/modal/index.md @@ -0,0 +1,124 @@ +# Modal + +Modal is a serverless GPU provider. By leveraging Modal, your Tabby instance will run on demand. When there are no requests to the Tabby server for a certain amount of time, Modal will schedule the container to sleep, thereby saving GPU costs. + +## Setup + +First we import the components we need from `modal`. 
+ +```python +from modal import Image, Mount, Secret, Stub, asgi_app, gpu, method +``` + +Next, we set the base Docker image version and the model to serve, taking care to specify the GPU configuration required to fit the model into VRAM. + +```python +MODEL_ID = "TabbyML/StarCoder-1B" +GPU_CONFIG = gpu.T4() +``` + +## Define the container image + +We want to create a Modal image which has the Tabby model cache pre-populated. The benefit of this is that the container no longer has to re-download the model - instead, it will take advantage of Modal’s internal filesystem for faster cold starts. + +### Download the weights + +```python +def download_model(): + import subprocess + + subprocess.run( + [ + "/opt/tabby/bin/tabby", + "download", + "--model", + MODEL_ID, + ] + ) +``` + + +### Image definition + +We’ll start from an image published by Tabby and override the default ENTRYPOINT so that Modal can run its own, which enables seamless serverless deployments. + +Next, we run the download step to pre-populate the image with our model weights. + +Finally, we install `asgi-proxy-lib` to interface with Modal's ASGI webserver over localhost. + +```python +image = ( + Image.from_registry( + "tabbyml/tabby:0.3.1", + add_python="3.11", + ) + .dockerfile_commands("ENTRYPOINT []") + .run_function(download_model) + .pip_install("asgi-proxy-lib") +) +``` + +### The app function + +The endpoint function is represented with Modal's `@stub.function`. Here, we: + +1. Launch the Tabby process and wait for it to be ready to accept requests. +2. Create an ASGI proxy to tunnel requests from the Modal web endpoint to the local Tabby server. +3. Specify that each container is allowed to handle up to 10 requests simultaneously. +4. Keep idle containers for 2 minutes before spinning them down. 
+ +```python +@stub.function( + gpu=GPU_CONFIG, + allow_concurrent_inputs=10, + container_idle_timeout=120, + timeout=360, +) +@asgi_app() +def app(): + import socket + import subprocess + import time + from asgi_proxy import asgi_proxy + + launcher = subprocess.Popen( + [ + "/opt/tabby/bin/tabby", + "serve", + "--model", + MODEL_ID, + "--port", + "8000", + "--device", + "cuda", + ] + ) + + # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs. + def tabby_ready(): + try: + socket.create_connection(("127.0.0.1", 8000), timeout=1).close() + return True + except (socket.timeout, ConnectionRefusedError): + # Check if launcher webserving process has exited. + # If so, a connection can never be made. + retcode = launcher.poll() + if retcode is not None: + raise RuntimeError(f"launcher exited unexpectedly with code {retcode}") + return False + + while not tabby_ready(): + time.sleep(1.0) + + print("Tabby server ready!") + return asgi_proxy("http://localhost:8000") +``` + +### Serve the app + +Once we deploy this model with `modal serve app.py`, it will output the URL of the web endpoint, in the form of `https://<USERNAME>--tabby-server-starcoder-1b-app-dev.modal.run`. This URL can be used as the Tabby server URL in Tabby editor extensions! + +See [app.py](https://github.com/TabbyML/tabby/tree/main/website/docs/installation/modal/app.py) for a complete example. + +## Feedback and support +If you have improvement suggestions or need specific support, please join the [Tabby Slack community](https://join.slack.com/t/tabbycommunity/shared_invite/zt-1xeiddizp-bciR2RtFTaJ37RBxr8VxpA) or reach out on [Tabby’s GitHub repository](https://github.com/TabbyML/tabby). 
diff --git a/website/package.json b/website/package.json index e8592fafb174..98a32d9dfb63 100644 --- a/website/package.json +++ b/website/package.json @@ -25,6 +25,7 @@ "postcss": "^8.4.24", "posthog-docusaurus": "^2.0.0", "prism-react-renderer": "^1.3.5", + "raw-loader": "^4.0.2", "react": "^17.0.2", "react-dom": "^17.0.2", "tailwindcss": "^3.3.2", diff --git a/website/yarn.lock b/website/yarn.lock index 674c12521eab..afb00735bd71 100644 --- a/website/yarn.lock +++ b/website/yarn.lock @@ -6893,6 +6893,14 @@ raw-body@2.5.1: iconv-lite "0.4.24" unpipe "1.0.0" +raw-loader@^4.0.2: + version "4.0.2" + resolved "https://registry.yarnpkg.com/raw-loader/-/raw-loader-4.0.2.tgz#1aac6b7d1ad1501e66efdac1522c73e59a584eb6" + integrity sha512-ZnScIV3ag9A4wPX/ZayxL/jZH+euYb6FcUinPcgiQW0+UBtEv0O6Q3lGd3cqJ+GHH+rksEv3Pj99oxJ3u3VIKA== + dependencies: + loader-utils "^2.0.0" + schema-utils "^3.0.0" + rc@1.2.8, rc@^1.2.8: version "1.2.8" resolved "https://registry.yarnpkg.com/rc/-/rc-1.2.8.tgz#cd924bf5200a075b83c188cd6b9e211b7fc0d3ed"