Skip to content

Commit

Permalink
👷 Add integration test to the CI (#140)
Browse files Browse the repository at this point in the history
* feat(ci): adding integration test to ci

* fix(ci): fix flaky docker build behavior in ci

* refactor: rename V5_LITEPOD_8_ENV to TPU_ENV
  • Loading branch information
baptistecolle authored Jan 8, 2025
1 parent 20772b8 commit 9c791e0
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 5 deletions.
42 changes: 42 additions & 0 deletions .github/workflows/test-pytorch-xla-tpu-tgi-integration.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Nightly (and manually triggerable) workflow that builds the TGI docker image
# on a TPU runner and runs the integration tests against it.
name: Optimum TPU / Test TGI on TPU / Integration Tests

on:
  schedule:
    - cron: '0 4 * * *' # run at 4 AM UTC
  # This can be used to allow manually triggering nightlies from the web interface
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  integration-tests:
    name: Run TGI Integration Tests
    runs-on:
      group: gcp-ct5lp-hightpu-8t

    env:
      PJRT_DEVICE: TPU
      HF_HUB_CACHE: /mnt/hf_cache/cache_huggingface
      HF_TOKEN: ${{ secrets.HF_TOKEN_OPTIMUM_TPU_CI }}
      # Forwarded to the integration tests, which read it as TPU_ENV
      TPU_ENV: ${{ vars.V5_LITEPOD_8_ENV }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install Python
        run: |
          sudo apt-get update -y
          sudo apt-get install -y python3 python3-pip
          # -f replaces an existing /usr/bin/python so the step does not fail
          # with "File exists" when the link is already present on the runner
          sudo ln -sf /usr/bin/python3 /usr/bin/python

      # To build the docker image in the ci, we need to use the network host option
      - name: Build TGI Docker Image
        run: |
          make tpu-tgi NETWORK=host

      - name: Run integration tests
        run: |
          make tgi_docker_test
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,14 @@ clean:
rm -rf dist deps
make -C text-generation-inference/server/ clean

# Build the TGI docker image from text-generation-inference/docker/Dockerfile,
# tag it huggingface/optimum-tpu:$(VERSION)-tgi, then also tag it as
# huggingface/optimum-tpu:latest.
# normal usage: make tpu-tgi
# ci usage: make tpu-tgi NETWORK=host, to build the docker image with the network host option
# When NETWORK is empty or unset, the $(if ...) in the recipe expands to
# nothing, so no --network flag is passed to docker build.
tpu-tgi:
docker build --rm -f text-generation-inference/docker/Dockerfile \
--build-arg VERSION=$(VERSION) \
--build-arg TGI_VERSION=$(TGI_VERSION) \
--ulimit nofile=100000:100000 \
$(if $(NETWORK),--network $(NETWORK),) \
-t huggingface/optimum-tpu:$(VERSION)-tgi .
docker tag huggingface/optimum-tpu:$(VERSION)-tgi huggingface/optimum-tpu:latest

Expand Down Expand Up @@ -111,6 +114,6 @@ tgi_test: test_installs tgi_server
-exec python -m pip install --force-reinstall {} \;
python -m pytest -sv text-generation-inference/tests -m torch_xla

tgi_docker_test: tpu-tgi
tgi_docker_test:
python -m pip install -r text-generation-inference/integration-tests/requirements.txt
python -m pytest -sv text-generation-inference/integration-tests
50 changes: 47 additions & 3 deletions text-generation-inference/integration-tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,43 @@
# Docker image used by the integration tests; override with DOCKER_IMAGE.
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "huggingface/optimum-tpu:latest")
# Hugging Face token taken from the environment; None when HF_TOKEN is unset.
HF_TOKEN = os.getenv("HF_TOKEN", None)
# Defaults to "/data"; override with DOCKER_VOLUME.
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
# Optional string of "--env NAME" flags (see validate_ci_tpu_env_format);
# None when TPU_ENV is unset.
TPU_ENV = os.getenv("TPU_ENV")

# Register a stderr sink at INFO level whose prefix shows the timestamp,
# level, and the name/function/line that emitted the message.
logger.add(
sys.stderr,
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
level="INFO"
)

def validate_ci_tpu_env_format(env_string: str) -> bool:
    """Check that a CI TPU environment string is a sequence of ``--env NAME`` pairs.

    The string is tokenised on whitespace. It is valid when the tokens pair
    up exactly, every first token of a pair is the literal ``--env``, and no
    second token itself starts with ``--env``.

    Args:
        env_string: Raw value such as ``"--env FOO --env BAR"``.

    Returns:
        True if the string matches the pattern, False otherwise. A string
        with no tokens (empty or whitespace only) contains no offending
        pair and is therefore reported as valid.
    """
    tokens = env_string.split()
    if len(tokens) % 2 != 0:
        # A dangling flag or name can never form complete pairs.
        return False

    flags = tokens[0::2]
    names = tokens[1::2]
    return all(
        flag == "--env" and not name.startswith("--env")
        for flag, name in zip(flags, names)
    )

def process_ci_tpu_env_vars(env_string: str) -> dict:
    """Turn a CI TPU ``--env`` string into a mapping of environment variables.

    The string is tokenised on whitespace, the same way
    ``validate_ci_tpu_env_format`` tokenises it, so both functions agree on
    what a variable name is. Each non-flag token is handled like a
    ``docker run --env`` argument:

    * ``NAME``: the value is read from the current process environment,
      falling back to an empty string when it is not set.
    * ``NAME=VALUE``: the inline value is used as-is.

    Args:
        env_string: Raw value such as ``"--env FOO --env BAR=1"``.

    Returns:
        Dict mapping each variable name to its resolved value. Variables
        whose value resolves to an empty string are still included, and a
        warning is logged for each of them.
    """
    env_vars = {}

    for token in env_string.split():
        if token == "--env":
            # Flag marker only; the variable is carried by the next token.
            continue

        name, has_inline_value, value = token.partition("=")
        if not name:
            # "=VALUE" has no variable name and cannot be exported.
            logger.warning(f"Ignoring TPU environment entry without a name: {token}")
            continue

        if not has_inline_value:
            # Pass-through form: take the value from the host environment.
            value = os.environ.get(name, "")

        env_vars[name] = value
        # Log if environment variable is not set
        if not value:
            logger.warning(f"TPU environment variable {name} is not set")

    return env_vars


def cleanup_handler(signum, frame):
logger.info("\nCleaning up containers due to shutdown, please wait...")
Expand Down Expand Up @@ -85,7 +115,8 @@ async def health(self, timeout: int = 60):
if attempt == timeout - 1:
logger.error(f"Health check failed after {timeout}s: {str(e)}")
raise RuntimeError(f"Health check failed: {str(e)}")
logger.debug(f"Connection attempt {attempt+1}/{timeout} failed: {str(e)}")
if attempt % 10 == 0: # Only log every 10th attempt
logger.debug(f"Connection attempt {attempt+1}/{timeout} failed: {str(e)}")
time.sleep(1)
except Exception as e:
logger.error(f"Unexpected error during health check: {str(e)}")
Expand Down Expand Up @@ -168,17 +199,30 @@ def docker_launcher(

env = {
"LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug",
"HF_HUB_ENABLE_HF_TRANSFER": "0"
"HF_HUB_ENABLE_HF_TRANSFER": "0",
"PJRT_DEVICE": "TPU"
}
env.update(MODEL_CONFIGS[model_name]["env_config"].copy())


# Add model_id to env
env["MODEL_ID"] = model_id

if HF_TOKEN is not None:
env["HF_TOKEN"] = HF_TOKEN

# Add TPU environment variables when running in CI
if TPU_ENV:
logger.info(f"TPU_ENV is set, adding specific TPU environment variables for the CI")
logger.debug(f"TPU_ENV: {TPU_ENV}")
# Validate TPU environment format
if not validate_ci_tpu_env_format(TPU_ENV):
raise ValueError("Invalid TPU environment format", TPU_ENV)

# Process TPU environment variables
tpu_env_vars = process_ci_tpu_env_vars(TPU_ENV)
env.update(tpu_env_vars)


for var in ["MAX_BATCH_SIZE", "HF_SEQUENCE_LENGTH"]:
if var in os.environ:
env[var] = os.environ[var]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ pytest >= 7.4.0
pytest-asyncio >= 0.21.1
docker >= 6.1.3
Levenshtein
loguru
2 changes: 1 addition & 1 deletion text-generation-inference/integration-tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"model_id": "google/gemma-2b-it",
"sequence_length": 1024,
"expected_greedy_output": "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data",
"expected_sampling_output": "Deep learning is a subfield of machine learning that focuses on mimicking the structure and function of the human brain",
"expected_sampling_output": "\n\n**Deep learning** is a subfield of machine learning that enables computers to learn from data without explicit programming",
"expected_batch_output": "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data",
"args": [
"--max-input-length", "512",
Expand Down

0 comments on commit 9c791e0

Please sign in to comment.