debug dind #16
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Minimal reproduction workflow: start a container from inside the job
# container and check that it answers HTTP requests issued by the job.
name: Debug DIND

on:
  push:

jobs:
  debug-dind:
    runs-on:
      group: gcp-ct5lp-hightpu-8t

    container:
      image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.4.0_3.10_tpuvm
      # Folded scalar: the lines below are joined with single spaces into one
      # option string that is passed to `docker create` for the job container.
      options: >-
        --shm-size "16gb" --ipc host --privileged
        ${{ vars.V5_LITEPOD_8_ENV }}
        -v /mnt/hf_cache:/mnt/hf_cache
        -e PJRT_DEVICE=TPU

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install docker
        run: |
          apt-get update -y
          apt-get install -y docker.io

      # @pauline: this is the most concise test that shows the behavior, and I
      # think it is the most straightforward one to understand. When doing
      # DIND, I could not get the started container to be reachable from the
      # job. test-pytorch-xla-tpu-tgi-integration.yml is the full workflow
      # where I first noticed this: it starts a TGI webserver and then sends
      # requests to it to check that it is working, which is the end goal.
      #
      # NOTE(review): likely cause - the job itself runs in a container, and
      # the docker CLI here presumably talks to the *host* daemon (the runner
      # mounts /var/run/docker.sock into job containers; confirm on this
      # runner). A container started that way is a sibling of the job
      # container, so `-p 5001:80` publishes the port on the host, while
      # `localhost` inside the job is the job container's own loopback.
      # Attaching the test container to the job's network and addressing it by
      # name avoids going through the host at all.
      - name: Test HTTP request
        run: |
          # Remove a container left behind by an earlier run; on a runner that
          # persists between jobs the fixed name would otherwise conflict.
          docker rm -f network-test >/dev/null 2>&1 || true

          # Run the whoami container with environment variables.
          # @pauline: adding --ipc host, --privileged and/or --network host did
          # not help. Different ports (80, 8080, 5001) were also tried.
          # --network joins the network the runner created for this job, so
          # the container is resolvable by its name from the job container.
          # -p is kept for parity with the original test; it only publishes
          # the port on the Docker host.
          docker run ${{ vars.V5_LITEPOD_8_ENV }} -d --name network-test \
            --network "${{ job.container.network }}" \
            -p 5001:80 traefik/whoami

          # Wait for container to be ready
          sleep 5

          # Show container status and logs
          docker ps
          docker logs network-test

          # Test connectivity over the shared job network (container port 80).
          # --fail turns an HTTP error status into a failing step.
          echo "Testing basic endpoint..."
          curl --fail --max-time 30 -v http://network-test:80

      # Always remove the test container so reruns start from a clean state.
      - name: Clean up test container
        if: always()
        run: docker rm -f network-test || true
      # OPTION 1: alternative test 1 that shows the same behavior.
      # The "Run test" step is similar to "Create test server Dockerfile", "Create minimal test server" and "Build and run test container",
      # but it is a single step inside a Python script so it can be run locally.
      # This script works on the local machine but not in the GitHub Actions environment, which is unexpected: the behavior should be the same.
      # - name: Run test
      #   run: |
      #     python debug-dind-locally/test.py
      # OPTION 2: alternative test 2 that shows the same behavior (until the end of the file)
      # - name: Create test server Dockerfile
      #   run: |
      #     cat << EOF > Dockerfile
      #     FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.4.0_3.10_tpuvm
      #     WORKDIR /app
      #     RUN pip install fastapi uvicorn
      #     COPY server.py .
      #     EXPOSE 8080
      #     CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8080"]
      #     EOF
      # - name: Create minimal test server
      #   run: |
      #     cat << EOF > server.py
      #     from fastapi import FastAPI
      #     from pydantic import BaseModel
      #     app = FastAPI()
      #     class GenerateRequest(BaseModel):
      #         inputs: str
      #     @app.post("/generate")
      #     async def generate(request: GenerateRequest):
      #         return {
      #             "generated_text": "Hello World!",
      #             "request_received": request.dict()
      #         }
      #     EOF
      # - name: Build and run test container
      #   run: |
      #     docker build -t test-tgi-server .
      #     docker run -d -p 8080:8080 --name test-server test-tgi-server
      #     sleep 5
      #     docker logs -f test-server &
      #     sleep 10
      #     # Test the endpoint
      #     curl --max-time 30 localhost:8080/generate \
      #       -X POST \
      #       -d '{"inputs":"test message"}' \
      #       -H 'Content-Type: application/json'
      #     # Clean up
      #     docker stop test-server