diff --git a/.github/workflows/test-pytorch-xla-tpu-tgi-integration.yml b/.github/workflows/test-pytorch-xla-tpu-tgi-integration.yml
new file mode 100644
index 00000000..b462018f
--- /dev/null
+++ b/.github/workflows/test-pytorch-xla-tpu-tgi-integration.yml
@@ -0,0 +1,42 @@
+name: Optimum TPU / Test TGI on TPU / Integration Tests
+
+on:
+  schedule:
+    - cron: '0 4 * * *' # run at 4 AM UTC
+  # Allows manually triggering nightly runs from the web interface
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  integration-tests:
+    name: Run TGI Integration Tests
+    runs-on:
+      group: gcp-ct5lp-hightpu-8t
+
+    env:
+      PJRT_DEVICE: TPU
+      HF_HUB_CACHE: /mnt/hf_cache/cache_huggingface
+      HF_TOKEN: ${{ secrets.HF_TOKEN_OPTIMUM_TPU_CI }}
+      TPU_ENV: ${{ vars.V5_LITEPOD_8_ENV }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install Python
+        run: |
+          sudo apt-get update -y
+          sudo apt-get install -y python3 python3-pip
+          sudo ln -s /usr/bin/python3 /usr/bin/python
+
+      # Building the Docker image in CI requires the host network option
+      - name: Build TGI Docker Image
+        run: |
+          make tpu-tgi NETWORK=host
+
+      - name: Run integration tests
+        run: |
+          make tgi_docker_test
diff --git a/Makefile b/Makefile
index 84627b78..5d2f4085 100644
--- a/Makefile
+++ b/Makefile
@@ -42,11 +42,14 @@ clean:
 	rm -rf dist deps
 	make -C text-generation-inference/server/ clean
 
+# Normal usage: make tpu-tgi
+# CI usage: make tpu-tgi NETWORK=host (builds the Docker image with --network host)
 tpu-tgi:
 	docker build --rm -f text-generation-inference/docker/Dockerfile \
 	             --build-arg VERSION=$(VERSION) \
 	             --build-arg TGI_VERSION=$(TGI_VERSION) \
 	             --ulimit nofile=100000:100000 \
+	             $(if $(NETWORK),--network $(NETWORK),) \
 	             -t huggingface/optimum-tpu:$(VERSION)-tgi .
 	docker tag huggingface/optimum-tpu:$(VERSION)-tgi huggingface/optimum-tpu:latest
 
@@ -111,6 +114,6 @@ tgi_test: test_installs tgi_server
 		-exec python -m pip install --force-reinstall {} \;
 	python -m pytest -sv text-generation-inference/tests -m torch_xla
 
-tgi_docker_test: tpu-tgi
+tgi_docker_test:
 	python -m pip install -r text-generation-inference/integration-tests/requirements.txt
 	python -m pytest -sv text-generation-inference/integration-tests
diff --git a/text-generation-inference/integration-tests/conftest.py b/text-generation-inference/integration-tests/conftest.py
index 3115e3dd..3365cfd1 100644
--- a/text-generation-inference/integration-tests/conftest.py
+++ b/text-generation-inference/integration-tests/conftest.py
@@ -23,6 +23,7 @@
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "huggingface/optimum-tpu:latest")
 HF_TOKEN = os.getenv("HF_TOKEN", None)
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
+TPU_ENV = os.getenv("TPU_ENV")
 
 logger.add(
     sys.stderr,
@@ -30,6 +31,35 @@
     level="INFO"
 )
 
+def validate_ci_tpu_env_format(env_string: str) -> bool:
+    """
+    Validate that the CI TPU environment string follows the '--env NAME' pattern.
+    Returns True if valid, False otherwise.
+    """
+    parts = env_string.split()
+    return len(parts) % 2 == 0 and all(
+        parts[i] == "--env" and not parts[i + 1].startswith("--env")
+        for i in range(0, len(parts), 2)
+    )
+
+def process_ci_tpu_env_vars(env_string: str) -> dict:
+    """
+    Process the CI TPU environment string and return a dictionary of environment variables.
+    """
+    env_vars = {}
+    # Extract variable names from the string
+    tpu_vars = [x.strip() for x in env_string.split('--env') if x.strip()]
+
+    # Process each variable
+    for var in tpu_vars:
+        env_value = os.environ.get(var, "")
+        env_vars[var] = env_value
+        # Log if the environment variable is not set
+        if not env_value:
+            logger.warning(f"TPU environment variable {var} is not set")
+
+    return env_vars
+
 
 def cleanup_handler(signum, frame):
     logger.info("\nCleaning up containers due to shutdown, please wait...")
@@ -85,7 +115,8 @@ async def health(self, timeout: int = 60):
                     if attempt == timeout - 1:
                         logger.error(f"Health check failed after {timeout}s: {str(e)}")
                         raise RuntimeError(f"Health check failed: {str(e)}")
-                    logger.debug(f"Connection attempt {attempt+1}/{timeout} failed: {str(e)}")
+                    if attempt % 10 == 0:  # Log only every 10th attempt
+                        logger.debug(f"Connection attempt {attempt+1}/{timeout} failed: {str(e)}")
                     time.sleep(1)
                 except Exception as e:
                     logger.error(f"Unexpected error during health check: {str(e)}")
@@ -168,17 +199,30 @@ def docker_launcher(
     env = {
         "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug",
-        "HF_HUB_ENABLE_HF_TRANSFER": "0"
+        "HF_HUB_ENABLE_HF_TRANSFER": "0",
+        "PJRT_DEVICE": "TPU"
     }
     env.update(MODEL_CONFIGS[model_name]["env_config"].copy())
 
-    # Add model_id to env
     env["MODEL_ID"] = model_id
 
     if HF_TOKEN is not None:
         env["HF_TOKEN"] = HF_TOKEN
 
+    # Add TPU environment variables when running in CI
+    if TPU_ENV:
+        logger.info("TPU_ENV is set, adding CI-specific TPU environment variables")
+        logger.debug(f"TPU_ENV: {TPU_ENV}")
+        # Validate the TPU environment format
+        if not validate_ci_tpu_env_format(TPU_ENV):
+            raise ValueError(f"Invalid TPU environment format: {TPU_ENV}")
+
+        # Process the TPU environment variables
+        tpu_env_vars = process_ci_tpu_env_vars(TPU_ENV)
+        env.update(tpu_env_vars)
+
+
     for var in ["MAX_BATCH_SIZE", "HF_SEQUENCE_LENGTH"]:
         if var in os.environ:
             env[var] = os.environ[var]
diff --git a/text-generation-inference/integration-tests/requirements.txt b/text-generation-inference/integration-tests/requirements.txt
index 58765d39..3206b458 100644
--- a/text-generation-inference/integration-tests/requirements.txt
+++ b/text-generation-inference/integration-tests/requirements.txt
@@ -16,3 +16,4 @@ pytest >= 7.4.0
 pytest-asyncio >= 0.21.1
 docker >= 6.1.3
 Levenshtein
+loguru
diff --git a/text-generation-inference/integration-tests/test_model.py b/text-generation-inference/integration-tests/test_model.py
index a9b44c75..efa5f474 100644
--- a/text-generation-inference/integration-tests/test_model.py
+++ b/text-generation-inference/integration-tests/test_model.py
@@ -28,7 +28,7 @@
     "model_id": "google/gemma-2b-it",
     "sequence_length": 1024,
     "expected_greedy_output": "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data",
-    "expected_sampling_output": "Deep learning is a subfield of machine learning that focuses on mimicking the structure and function of the human brain",
+    "expected_sampling_output": "\n\n**Deep learning** is a subfield of machine learning that enables computers to learn from data without explicit programming",
     "expected_batch_output": "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data",
     "args": [
         "--max-input-length", "512",
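
Note on the conftest.py helpers above: in CI, TPU_ENV carries a repository variable (vars.V5_LITEPOD_8_ENV) shaped like "--env NAME --env NAME ...", and docker_launcher forwards each named variable from the host into the TGI container. The following is a minimal standalone sketch of that parsing, runnable outside the test suite; the function bodies mirror the ones added in conftest.py (with the loguru logging omitted), and the example TPU_ENV value uses placeholder variable names, since the real contents of the CI variable are not part of this diff.

import os

def validate_ci_tpu_env_format(env_string: str) -> bool:
    # The string must be an alternating sequence of "--env" flags,
    # each followed by a variable name (never another "--env").
    parts = env_string.split()
    return len(parts) % 2 == 0 and all(
        parts[i] == "--env" and not parts[i + 1].startswith("--env")
        for i in range(0, len(parts), 2)
    )

def process_ci_tpu_env_vars(env_string: str) -> dict:
    # Split on "--env" to recover the variable names, then read each
    # one from the host environment (empty string if unset).
    names = [x.strip() for x in env_string.split("--env") if x.strip()]
    return {name: os.environ.get(name, "") for name in names}

# Hypothetical TPU_ENV value for illustration only.
example = "--env TPU_WORKER_ID --env TPU_WORKER_HOSTNAMES"
assert validate_ci_tpu_env_format(example)
assert not validate_ci_tpu_env_format("--env")          # dangling flag
assert not validate_ci_tpu_env_format("FOO --env BAR")  # missing flag
print(process_ci_tpu_env_vars(example))
# e.g. {'TPU_WORKER_ID': '', 'TPU_WORKER_HOSTNAMES': ''} on a host
# where these variables are unset.

Keeping validation separate from processing means a malformed CI variable fails fast with a clear ValueError in docker_launcher, instead of silently exporting wrong names into the container.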