debug dind #14
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Minimal reproduction workflow for a Docker-in-Docker (DIND) connectivity
# problem: a container started from inside the job container should be
# reachable over HTTP from the job itself.
name: Debug DIND

on:
  push:

jobs:
  debug-dind:
    runs-on:
      group: gcp-ct5lp-hightpu-8t

    # The job itself runs inside this container; the steps below start further
    # containers from within it.
    container:
      image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.4.0_3.10_tpuvm
      options: >-
        --shm-size "16gb" --ipc host --privileged
        ${{ vars.V5_LITEPOD_8_ENV}}
        -v /mnt/hf_cache:/mnt/hf_cache -e PJRT_DEVICE=TPU

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # NOTE(review): this installs the Docker packages only; nothing in this
      # workflow starts a Docker daemon. Presumably one is reachable through
      # options supplied by V5_LITEPOD_8_ENV - confirm.
      - name: Install docker
        run: |
          apt-get update -y
          apt-get install -y docker.io

      # The "Run test" step is similar to "Create test server Dockerfile",
      # "Create minimal test server" and "Build and run test container", but it
      # is a single step driven by a Python script so it can also be run locally.
      # That script works on a local machine, but not in the GitHub Actions
      # environment. It is left commented out for now, as the three separate
      # steps may be easier to debug in a CI environment.
      # - name: Run test
      #   run: |
      #     python debug-dind-locally/test.py

      # - name: Create test server Dockerfile
      #   run: |
      #     cat << EOF > Dockerfile
      #     FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.4.0_3.10_tpuvm
      #     WORKDIR /app
      #     RUN pip install fastapi uvicorn
      #     COPY server.py .
      #     EXPOSE 8080
      #     CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8080"]
      #     EOF

      # - name: Create minimal test server
      #   run: |
      #     cat << EOF > server.py
      #     from fastapi import FastAPI
      #     from pydantic import BaseModel
      #     app = FastAPI()
      #     class GenerateRequest(BaseModel):
      #         inputs: str
      #     @app.post("/generate")
      #     async def generate(request: GenerateRequest):
      #         return {
      #             "generated_text": "Hello World!",
      #             "request_received": request.dict()
      #         }
      #     EOF

      # - name: Build and run test container
      #   run: |
      #     docker build -t test-tgi-server .
      #     docker run -d -p 8080:8080 --name test-server test-tgi-server
      #     sleep 5
      #     docker logs -f test-server &
      #     sleep 10
      #     # Test the endpoint
      #     curl --max-time 30 localhost:8080/generate \
      #       -X POST \
      #       -d '{"inputs":"test message"}' \
      #       -H 'Content-Type: application/json'
      #     # Clean up
      #     docker stop test-server

      # @pauline: everything above is a set of alternative tests that show the
      # same behavior (bug). In short, when doing DIND, I cannot get the
      # container to be reachable from the runner.
      # test-pytorch-xla-tpu-tgi-integration.yml is my full workflow, where I
      # first noticed this behavior. It starts a TGI web server and then sends
      # requests to it to check that it is working; that is my end goal.
      # The step below is a minimal test to see whether the container can be
      # reached from the runner, and it does not work. I am trying to find out
      # why the container is not reachable from the runner - do you have any
      # idea why?
      - name: Test HTTP request
        run: |
          # Run the whoami container with environment variables.
          # Adding --ipc host --privileged did not help.
          #
          # In host network mode Docker discards -p/--publish mappings, so the
          # previous "-p 5001:80" was ignored: whoami stayed on its default
          # port 80 and nothing listened on 5001. Make whoami itself listen on
          # 5001 instead, which is the port the curl below targets.
          #
          # NOTE(review): "localhost" only reaches this container if the Docker
          # daemon shares a network namespace with this job container. If the
          # daemon is the outer host's (e.g. a mounted docker socket), the
          # service is on the host's localhost, not this one - confirm.
          docker run ${{ vars.V5_LITEPOD_8_ENV}} --network host -d --name network-test traefik/whoami --port 5001

          # Wait for container to be ready
          sleep 5

          # Show container status and logs
          docker ps
          docker logs network-test

          # Test with multiple endpoints to verify connectivity
          echo "Testing basic endpoint..."
          curl --max-time 30 -v localhost:5001