Bump TGI version to v3.0.0 (#135)
* chore(ci): remove TGI_VERSION argument from workflow

The Dockerfile has a default value; it is easier to maintain only that.

* feat(TGI): update to v3.0.0

Update to TGI 3.0.0, using a simplified Cargo.toml.
This is based on the work done on optimum-neuron:
huggingface/optimum-neuron#748

* fix(tgi): add merge_lora kwarg to download_weights

* fix(tgi): return the correct FinishReason on stop string

* fix(tgi): set max_batch_prefill_tokens

Starting with TGI 2.4.1, the TGI launcher evaluates the default value of
max_batch_prefill_tokens differently: on TPU it now defaults to 4096, whereas
it was previously set to max_batch_size * max_input_tokens.

This is now fixed in the entrypoint (see the sketch after this list), pending
a proper fix in the launcher.

* review(docker): unset MAX_INPUT_LENGTH when set
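
As context for the max_batch_prefill_tokens fix above, a minimal bash sketch of the fallback the entrypoint applies (variable names match the entrypoint.sh diff further down; the example values are only illustrative):

#!/bin/bash
# Sketch: when MAX_BATCH_PREFILL_TOKENS is not provided, derive it from the
# batch size and input length instead of relying on the launcher's 4096 default.
MAX_BATCH_SIZE="${MAX_BATCH_SIZE:-8}"         # illustrative value
MAX_INPUT_TOKENS="${MAX_INPUT_TOKENS:-1024}"  # illustrative value
if [[ -z "${MAX_BATCH_PREFILL_TOKENS}" ]]; then
    MAX_BATCH_PREFILL_TOKENS=$(( MAX_BATCH_SIZE * MAX_INPUT_TOKENS ))
fi
export MAX_BATCH_PREFILL_TOKENS
echo "MAX_BATCH_PREFILL_TOKENS=${MAX_BATCH_PREFILL_TOKENS}"  # 8192 with the values above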
tengomucho authored Jan 6, 2025
1 parent 0b9cfd2 commit 20772b8
Showing 10 changed files with 125 additions and 32 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/tpu-tgi-release.yml
@@ -76,7 +76,6 @@ jobs:
labels: ${{ steps.meta.outputs.labels }}
build-args: |
VERSION=${{ steps.version.outputs.version }}
TGI_VERSION=v2.4.1
- name: Generate artifact attestation for TGI
@@ -97,7 +96,6 @@ jobs:
labels: ${{ steps.meta-ie.outputs.labels }}
build-args: |
VERSION=${{ steps.version.outputs.version }}
TGI_VERSION=v2.4.1
target: inference-endpoint


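With TGI_VERSION gone from the workflow, release images use the Dockerfile default. A hedged sketch of a manual build from the repository root (the build context and image tag are assumptions, not part of this change):

# Build with the Dockerfile default TGI version (v3.0.0 after this commit)
docker build -f text-generation-inference/docker/Dockerfile -t tpu-tgi:local .

# The build argument still exists, so a different TGI release can be pinned explicitly
docker build -f text-generation-inference/docker/Dockerfile \
    --build-arg TGI_VERSION=v3.0.0 \
    -t tpu-tgi:local .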
2 changes: 1 addition & 1 deletion Makefile
@@ -19,7 +19,7 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL))

.PHONY: build_dist style style_check clean

TGI_VERSION ?= v2.4.1
TGI_VERSION ?= v3.0.0

rwildcard=$(wildcard $1) $(foreach d,$1,$(call rwildcard,$(addsuffix /$(notdir $d),$(wildcard $(dir $d)*))))

47 changes: 47 additions & 0 deletions text-generation-inference/Cargo.toml
@@ -0,0 +1,47 @@
[workspace]
members = [
"backends/v2",
"backends/grpc-metadata",
"launcher",
"router"
]
default-members = [
"backends/v2",
"backends/grpc-metadata",
"launcher",
"router"
]
resolver = "2"

[workspace.package]
version = "3.0.0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"

[workspace.dependencies]
base64 = "0.22.0"
tokenizers = { version = "0.20.0", features = ["http"] }
hf-hub = { version = "0.3.1", features = ["tokio"] }
metrics = { version = "0.23.0" }
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
minijinja = { version = "2.2.0", features = ["json"] }
minijinja-contrib = { version = "2.0.2", features = ["pycompat"] }
pyo3 = { version = "0.22.2", features = ["auto-initialize"] }

[profile.release]
incremental = true

[profile.release-binary]
inherits = "release"
debug = 1
incremental = true
panic = "abort"

[profile.release-opt]
inherits = "release"
debug = 0
incremental = false
lto = "fat"
opt-level = 3
codegen-units = 1
43 changes: 24 additions & 19 deletions text-generation-inference/docker/Dockerfile
@@ -1,25 +1,35 @@
# Fetch and extract the TGI sources
FROM alpine AS tgi
# TGI version 2.4.1 by default
ARG TGI_VERSION=v2.4.1
# TGI version 3.0.0 by default
ARG TGI_VERSION=v3.0.0
RUN test -n ${TGI_VERSION:?}
RUN mkdir -p /tgi
ADD https://github.com/huggingface/text-generation-inference/archive/${TGI_VERSION}.tar.gz /tgi/sources.tar.gz
RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1

# Build cargo components (adapted from TGI original Dockerfile)
# Note that the build image is aligned on the same Linux version as the base image (Debian bookworm/ Ubuntu 22.04)
FROM lukemathwalker/cargo-chef:latest-rust-1.80.1-bookworm AS chef
# Note: we cannot use the cargo-chef base image as it uses python 3.11
FROM ubuntu:22.04 AS chef

RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
curl ca-certificates build-essential \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.80.1 --profile minimal -y
ENV PATH="/root/.cargo/bin:${PATH}"
RUN cargo install cargo-chef --locked

WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY --from=tgi /tgi/Cargo.toml Cargo.toml
COPY text-generation-inference/Cargo.toml Cargo.toml
COPY --from=tgi /tgi/Cargo.lock Cargo.lock
COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
COPY --from=tgi /tgi/proto proto
COPY --from=tgi /tgi/benchmark benchmark
COPY --from=tgi /tgi/router router
COPY --from=tgi /tgi/backends backends
COPY --from=tgi /tgi/launcher launcher
@@ -29,22 +39,25 @@ FROM chef AS builder
ARG ENABLE_GOOGLE_FEATURE
RUN echo "Google Feature Status: ${ENABLE_GOOGLE_FEATURE}"

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
python3.11-dev
RUN apt-get update -y \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
unzip python3-dev libssl-dev pkg-config \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
rm -f $PROTOC_ZIP

COPY text-generation-inference/Cargo.toml Cargo.toml
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

COPY --from=tgi /tgi/Cargo.toml Cargo.toml
COPY --from=tgi /tgi/Cargo.lock Cargo.lock
COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
COPY --from=tgi /tgi/proto proto
COPY --from=tgi /tgi/benchmark benchmark
COPY --from=tgi /tgi/router router
COPY --from=tgi /tgi/backends backends
COPY --from=tgi /tgi/launcher launcher
@@ -114,8 +127,6 @@ ARG VERSION=${VERSION}
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
libpython3.10 \
libpython3.11 \
python3.11 \
git \
gnupg2 \
wget \
@@ -142,18 +153,14 @@ ENV PORT=${PORT:-80}
ENV HF_HOME=${ENABLE_GOOGLE_FEATURE:+/tmp}
ENV HF_HOME=${HF_HOME:-/data}

# Install requirements for TGI, that uses python3.11
RUN python3.11 -m pip install transformers==${TRANSFORMERS_VERSION}

# Install requirements for the TGI server, then install optimum-tpu
RUN python3 -m pip install hf_transfer safetensors==${SAFETENSORS_VERSION} typer
COPY --from=optimum-tpu-installer /opt/optimum-tpu /opt/optimum-tpu
RUN python3 /opt/optimum-tpu/optimum/tpu/cli.py install-jetstream-pytorch --yes
RUN python3 -m pip install -e /opt/optimum-tpu \
-f https://storage.googleapis.com/libtpu-releases/index.html

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark

# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router-v2 /usr/local/bin/text-generation-router
# Install launcher
@@ -196,5 +203,3 @@ ENTRYPOINT ["./entrypoint.sh"]
FROM tpu_base

ENTRYPOINT ["text-generation-launcher"]
# This is commented out in the original TGI Dockerfile
# CMD ["--json-output"]
16 changes: 16 additions & 0 deletions text-generation-inference/docker/entrypoint.sh
@@ -10,6 +10,22 @@ if [[ -z "${MAX_BATCH_SIZE}" ]]; then
fi
export MAX_BATCH_SIZE="${MAX_BATCH_SIZE}"

# At some point we used to have MAX_INPUT_LENGTH; now we should use MAX_INPUT_TOKENS.
# (The launcher would handle this automatically, but we need the value here to
# calculate MAX_BATCH_PREFILL_TOKENS when it is not set.)
if [[ -z "${MAX_INPUT_TOKENS}" && -n ${MAX_INPUT_LENGTH} ]]; then
MAX_INPUT_TOKENS=${MAX_INPUT_LENGTH}
fi
if [[ -n "${MAX_INPUT_LENGTH}" ]]; then
echo "MAX_INPUT_LENGTH is deprecated, please use MAX_INPUT_TOKENS instead. Variable will be unset."
unset MAX_INPUT_LENGTH
fi

if [[ -z "${MAX_BATCH_PREFILL_TOKENS}" ]]; then
MAX_BATCH_PREFILL_TOKENS=$(( ${MAX_BATCH_SIZE} * ${MAX_INPUT_TOKENS} ))
fi
export MAX_BATCH_PREFILL_TOKENS="${MAX_BATCH_PREFILL_TOKENS}"

if [[ -z "${JSON_OUTPUT_DISABLE}" ]]; then
JSON_OUTPUT_DISABLE=--json-output
else
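As a usage sketch of the entrypoint change above (image name, model id and sizes are placeholders; TPU device flags are omitted):

# MAX_BATCH_PREFILL_TOKENS is not passed, so the entrypoint sets it to 8 * 1024 = 8192
docker run -p 8080:80 \
    -e MODEL_ID=google/gemma-2b \
    -e MAX_BATCH_SIZE=8 \
    -e MAX_INPUT_TOKENS=1024 \
    -e MAX_TOTAL_TOKENS=2048 \
    your-tpu-tgi-image:latest

# Passing the deprecated MAX_INPUT_LENGTH instead still works: the entrypoint copies
# it into MAX_INPUT_TOKENS, prints a deprecation warning and unsets the old variable.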
2 changes: 1 addition & 1 deletion text-generation-inference/server/Makefile
@@ -2,7 +2,7 @@
pkg_name := text_generation_server
BUILDDIR ?= $(CURDIR)/build
VERSION ?= 0.0.1
TGI_VERSION ?= "v2.4.1"
TGI_VERSION ?= "v3.0.0"
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
pkg_dir := $(BUILDDIR)/$(pkg_name)
4 changes: 2 additions & 2 deletions text-generation-inference/server/build-requirements.txt
@@ -1,3 +1,3 @@
build
grpcio-tools==1.62.1
mypy-protobuf==3.2.0
grpcio-tools==1.53.0
mypy-protobuf
@@ -99,6 +99,7 @@ def download_weights(
auto_convert: Optional[bool] = None,
extension: Optional[str] = None,
trust_remote_code: Optional[bool] = None,
merge_lora: Optional[bool] = None,
):
"""Download the model weights.
@@ -122,6 +123,8 @@
logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
if auto_convert is not None:
logger.warning("'auto_convert' argument is not supported and will be ignored.")
if merge_lora is not None:
logger.warning("'merge_lora' argument is not supported and will be ignored.")

# Import here after the logger is added to log potential import exceptions
from optimum.tpu.model import fetch_model
@@ -298,6 +298,12 @@ def attention_mask(self) -> torch.LongTensor:
def max_token(self) -> int:
return self._generation_config.max_length

@property
def max_new_tokens(self) -> int:
# The current value of max_new_tokens: it might be different from the target max_new_tokens
# if the slot has been paused and resumed.
return self._generation_config.max_new_tokens


class TpuGeneratorSingleThread(Generator):
"""A Generator for models running on TPU, single threaded."""
@@ -474,6 +480,9 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
slot.assign(self.batch_id, request, self.model.generation_config)
self.slots.append(slot)
logger.debug(f"Request {slot.request_id} assigned to slot {slot.id}")
logger.debug(
f"Request {slot.request_id} assigned to slot {slot.id} with and max_new_tokens {slot.max_new_tokens}"
)
# Reconstruct the full inputs (without padding) as seen by the model.
# This comprises:
# - the inputs for new requests,
@@ -576,6 +585,8 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
slot.append(next_token)
slot.resume()
logger.debug("Model ready for decoding")
if next_batch is not None:
logger.debug(f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}")
return generation, next_batch

@torch.no_grad
@@ -704,14 +715,16 @@ def _post_generate(
if next_token == self.tokenizer.eos_token_id:
finish_reason = FinishReason.FINISH_REASON_EOS_TOKEN
elif slot.stopped:
# For now we only support the length stopping criteria
finish_reason = FinishReason.FINISH_REASON_LENGTH
if slot.generated_tokens == slot.max_new_tokens:
finish_reason = FinishReason.FINISH_REASON_LENGTH
else:
finish_reason = FinishReason.FINISH_REASON_STOP_SEQUENCE
if finish_reason is not None:
# We must include the generated text for each finished sequence in the response
generated_text = GeneratedText(
text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
)
logger.debug(f"Finished generating tokens for request {request_id}")
logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
# This slot is now empty, it will be removed from the list of
# active slots once a new prefill is requested
slot.clear()
@@ -228,6 +228,11 @@ def empty(self) -> bool:
def seed(self) -> int:
return self._seed

@property
def max_new_tokens(self) -> int:
# The current value of max_new_tokens: it might be different from the target max_new_tokens
# if the slot has been paused and resumed.
return self._generation_config.max_new_tokens

class PrefillSlot:
def __init__(self):
@@ -443,7 +448,9 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
self.prefill_slot.set(slot)
self.slot_index += 1
slot.assign(self.batch_id, request, self.model.generation_config)
logger.debug(f"Request {slot.request_id} assigned to slot {slot.id}")
logger.debug(
f"Request {slot.request_id} assigned to slot {slot.id} with and max_new_tokens {slot.max_new_tokens}"
)

# Tokenize the inputs
input_ids, true_lengths = self._token_encode(request.inputs, slot.truncate)
@@ -475,6 +482,8 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
cached_batch = self._cached_batch(self.batch_id, prefilled_active_slots)
self.batch_id += 1
logger.debug("Model ready for decoding")
if cached_batch is not None:
logger.debug(f"Next batch is {cached_batch.id} with requests: {cached_batch.request_ids}")
return generations, cached_batch

def _select_from_slots(self, logits: jnp.ndarray, batch_size: int=0) -> jnp.ndarray:
@@ -566,15 +575,17 @@ def _post_generate(self, slot: Slot, next_token: int, generations: List[Generati
if next_token == self.tokenizer.eos_token_id:
finish_reason = FinishReason.FINISH_REASON_EOS_TOKEN
elif slot.stopped:
# For now we only support the length stopping criteria
finish_reason = FinishReason.FINISH_REASON_LENGTH
if slot.generated_tokens == slot.max_new_tokens:
finish_reason = FinishReason.FINISH_REASON_LENGTH
else:
finish_reason = FinishReason.FINISH_REASON_STOP_SEQUENCE
request_id = slot.request_id
if finish_reason is not None:
# We must include the generated text for each finished sequence in the response
generated_text = GeneratedText(
text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
)
logger.debug(f"Finished generating tokens for request {request_id}")
logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
# This slot is now empty, it will be removed from the list of
# active slots.
slot.clear()
