From 9c46709c3250e2cff5302ecd64ab78912add3884 Mon Sep 17 00:00:00 2001
From: Meng Zhang <meng@tabbyml.com>
Date: Sat, 23 Dec 2023 01:13:29 +0800
Subject: [PATCH] docs: add installation tutorials for skypilot serving (#1099)

* docs: add installation tutorials for skypilot serving

* Apply suggestions from code review

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update index.md

* Update index.md

---------

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>
---
 website/docs/installation/skypilot/index.md   | 86 +++++++++++++++++++
 .../installation/skypilot/service-ready.png   |  3 +
 .../installation/skypilot/start-service.png   |  3 +
 .../installation/skypilot/tabby-ready.png     |  3 +
 website/docs/installation/skypilot/tabby.yaml | 12 +++
 5 files changed, 107 insertions(+)
 create mode 100644 website/docs/installation/skypilot/index.md
 create mode 100644 website/docs/installation/skypilot/service-ready.png
 create mode 100644 website/docs/installation/skypilot/start-service.png
 create mode 100644 website/docs/installation/skypilot/tabby-ready.png
 create mode 100644 website/docs/installation/skypilot/tabby.yaml

diff --git a/website/docs/installation/skypilot/index.md b/website/docs/installation/skypilot/index.md
new file mode 100644
index 000000000000..89c1053139f4
--- /dev/null
+++ b/website/docs/installation/skypilot/index.md
@@ -0,0 +1,86 @@
+# SkyPilot Serving
+
+[SkyPilot](https://skypilot.readthedocs.io/en/latest/) is a versatile framework designed for the execution of LLMs, AI, and batch jobs on any cloud vendor. It stands out by offering significant cost savings, optimal GPU availability, and managed execution capabilities.
+
+[SkyServe](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html) is SkyPilot’s model serving library. SkyServe (short for SkyPilot Serving) takes an existing serving framework and deploys it across one or more regions or clouds.
+
+When leveraging SkyServe, all replica Tabby instances are seamlessly deployed within your own cloud accounts and VPCs.
+
+## Configuration
+
+First, let's specify the resource requirements for the Tabby service in the YAML configuration for SkyServe.
+
+```yaml
+resources:
+  ports: 8080
+  accelerators: T4:1
+```
+
+SkyPilot supports GPUs from various cloud vendors. Please refer to the official [SkyPilot documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html) for detailed installation instructions.
+
+As Tabby exposes its health check at `/v1/health`, we can define the following service configuration:
+
+```yaml
+service:
+  readiness_probe: /v1/health
+  replicas: 1
+```
+
+Finally, we define the command line that actually initiates the container job:
+
+```yaml
+run: |
+  docker run --gpus all -p 8080:8080 -v ~/.tabby:/data \
+    tabbyml/tabby \
+    serve --model TabbyML/StarCoder-1B --device cuda
+```
+
+## Launch the service
+
+We first execute `sky serve up tabby.yaml -n tabby`.
+
+![start tabby service](./start-service.png)
+
+If everything goes well, you'll see the messages below:
+![service ready](./service-ready.png)
+
+This finishes launching SkyServe's control VM, which runs a load balancer for this service; the actual replica running the Tabby service is still being provisioned.
+
+When you execute the following command, you'll encounter a message indicating that the replica is not ready:
+
+```bash
+$ curl -L 'http://44.203.34.65:30001/v1/health'
+
+{"detail":"No available replicas. Use \"sky serve status [SERVICE_NAME]\" to check the replica status."}%
+```
+
+You can monitor the progress of starting the actual Tabby job by checking the replica log:
+
+```bash
+# Tailing the logs of replica 1 for the tabby service
+sky serve logs tabby 1
+```
+
+Once the service is ready, you will see something like the following:
+
+![tabby ready](./tabby-ready.png)
+
+SkyServe uses a redirect load balancer at its front, so the `-L` flag is necessary if you would like to test the completion API with `curl`.
+
+```bash
+$ curl -L -X 'POST' \
+  'http://44.203.34.65:30001/v1/completions' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "language": "python",
+  "segments": {
+    "prefix": "def fib(n):\n    ",
+    "suffix": "\n        return fib(n - 1) + fib(n - 2)"
+  }
+}'
+
+{"id":"cmpl-ba9aae81-ed9c-419b-9616-fceb92cdbe79","choices":[{"index":0,"text":"    if n <= 1:\n            return n"}]}
+```
+
+Now, you can utilize the load balancer URL (`http://44.203.34.65:30001` in this case) within Tabby editor extensions. Please refer to `tabby.yaml` for the comprehensive configuration used in this tutorial.
diff --git a/website/docs/installation/skypilot/service-ready.png b/website/docs/installation/skypilot/service-ready.png
new file mode 100644
index 000000000000..1491db4b8a08
--- /dev/null
+++ b/website/docs/installation/skypilot/service-ready.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a413df338623f81c14e470f93cabb11be6368a738e70309a4192560a663c7d51
+size 39367
diff --git a/website/docs/installation/skypilot/start-service.png b/website/docs/installation/skypilot/start-service.png
new file mode 100644
index 000000000000..2a8ce8e600cf
--- /dev/null
+++ b/website/docs/installation/skypilot/start-service.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:544447575a60fc264db698a747326a03c9b47a310dc2092af6e32b699cb9b425
+size 31796
diff --git a/website/docs/installation/skypilot/tabby-ready.png b/website/docs/installation/skypilot/tabby-ready.png
new file mode 100644
index 000000000000..1d8561bb2073
--- /dev/null
+++ b/website/docs/installation/skypilot/tabby-ready.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ec8c99fd93fa82466e2ab21003d6f8f12c2ee5cc34e7305abae52110e0be9bb
+size 31349
diff --git a/website/docs/installation/skypilot/tabby.yaml b/website/docs/installation/skypilot/tabby.yaml
new file mode 100644
index 000000000000..15939a0c07e2
--- /dev/null
+++ b/website/docs/installation/skypilot/tabby.yaml
@@ -0,0 +1,12 @@
+resources:
+  ports: 8080
+  accelerators: T4:1
+
+service:
+  readiness_probe: /v1/health
+  replicas: 1
+
+run: |
+  docker run --gpus all -p 8080:8080 -v ~/.tabby:/data \
+    tabbyml/tabby \
+    serve --model TabbyML/StarCoder-1B --device cuda