test-pytorch-xla-tpu-tgi-jetstream.yml
name: Optimum TPU / Test TGI on TPU / Jetstream Pytorch

on:
  push:
    branches: [ main, ci-ephemeral-tpu ]
  pull_request:
    branches: [ main ]
    paths:
      - "text-generation-inference/**"
  # This can be used to trigger the workflow from the web interface
  workflow_dispatch:

# Cancel any in-flight run for the same branch when a new one starts.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  do-the-job:
    name: Run TGI tests - Jetstream Pytorch
    runs-on:
      group: gcp-ct5lp-hightpu-8t
    container:
      image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.4.0_3.10_tpuvm
      options: --shm-size "16gb" --ipc host --privileged ${{ vars.V5_LITEPOD_8_ENV }}
    env:
      PJRT_DEVICE: TPU
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Build and test TGI server
        run: |
          make test_installs jetstream_requirements tgi_server
          # Install the freshly built text-generation-server wheel.
          find text-generation-inference/ -name "text_generation_server-*whl" -exec python -m pip install {} \;
          # Run the slow Jetstream PyTorch greedy-decoding tests, one model family at a time.
          JETSTREAM_PT=1 HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_TPU_CI }} \
            python -m pytest -sv text-generation-inference/tests -k "jetstream and greedy and Meta-Llama" --runslow
          JETSTREAM_PT=1 HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_TPU_CI }} \
            python -m pytest -sv text-generation-inference/tests -k "jetstream and greedy and gemma" --runslow
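
# For local debugging, the same sequence can be run by hand on a TPU host.
# A minimal sketch, assuming this workflow lives in the huggingface/optimum-tpu
# repository and that you have a Hugging Face token with access to the gated
# Meta-Llama and gemma checkpoints; <your-token> is a placeholder, and the
# combined -k filter (dropping the per-model clause) is a simplification:
#
#   git clone https://github.com/huggingface/optimum-tpu.git
#   cd optimum-tpu
#   make test_installs jetstream_requirements tgi_server
#   find text-generation-inference/ -name "text_generation_server-*whl" -exec python -m pip install {} \;
#   PJRT_DEVICE=TPU JETSTREAM_PT=1 HF_TOKEN=<your-token> \
#     python -m pytest -sv text-generation-inference/tests -k "jetstream and greedy" --runslow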