diff --git a/.github/workflows/tpu-tgi-release.yml b/.github/workflows/tpu-tgi-release.yml
index 62eda842..f9979133 100644
--- a/.github/workflows/tpu-tgi-release.yml
+++ b/.github/workflows/tpu-tgi-release.yml
@@ -76,7 +76,6 @@ jobs:
           labels: ${{ steps.meta.outputs.labels }}
           build-args: |
             VERSION=${{ steps.version.outputs.version }}
-            TGI_VERSION=v2.4.1
 
       - name: Generate artifact attestation for TGI
@@ -97,7 +96,6 @@ jobs:
           labels: ${{ steps.meta-ie.outputs.labels }}
          build-args: |
             VERSION=${{ steps.version.outputs.version }}
-            TGI_VERSION=v2.4.1
           target: inference-endpoint
diff --git a/Makefile b/Makefile
index a71d1f66..84627b78 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL))
 
 .PHONY: build_dist style style_check clean
 
-TGI_VERSION ?= v2.4.1
+TGI_VERSION ?= v3.0.0
 
 rwildcard=$(wildcard $1) $(foreach d,$1,$(call rwildcard,$(addsuffix /$(notdir $d),$(wildcard $(dir $d)*))))
diff --git a/text-generation-inference/Cargo.toml b/text-generation-inference/Cargo.toml
new file mode 100644
index 00000000..5889d4fb
--- /dev/null
+++ b/text-generation-inference/Cargo.toml
@@ -0,0 +1,47 @@
+[workspace]
+members = [
+    "backends/v2",
+    "backends/grpc-metadata",
+    "launcher",
+    "router"
+]
+default-members = [
+    "backends/v2",
+    "backends/grpc-metadata",
+    "launcher",
+    "router"
+]
+resolver = "2"
+
+[workspace.package]
+version = "3.0.0"
+edition = "2021"
+authors = ["Olivier Dehaene"]
+homepage = "https://github.com/huggingface/text-generation-inference"
+
+[workspace.dependencies]
+base64 = "0.22.0"
+tokenizers = { version = "0.20.0", features = ["http"] }
+hf-hub = { version = "0.3.1", features = ["tokio"] }
+metrics = { version = "0.23.0" }
+metrics-exporter-prometheus = { version = "0.15.1", features = [] }
+minijinja = { version = "2.2.0", features = ["json"] }
+minijinja-contrib = { version = "2.0.2", features = ["pycompat"] }
+pyo3 = { version = "0.22.2", features = ["auto-initialize"] }
+
+[profile.release]
+incremental = true
+
+[profile.release-binary]
+inherits = "release"
+debug = 1
+incremental = true
+panic = "abort"
+
+[profile.release-opt]
+inherits = "release"
+debug = 0
+incremental = false
+lto = "fat"
+opt-level = 3
+codegen-units = 1
\ No newline at end of file
diff --git a/text-generation-inference/docker/Dockerfile b/text-generation-inference/docker/Dockerfile
index f0e09d97..7d3faf21 100644
--- a/text-generation-inference/docker/Dockerfile
+++ b/text-generation-inference/docker/Dockerfile
@@ -1,25 +1,35 @@
 # Fetch and extract the TGI sources
 FROM alpine AS tgi
-# TGI version 2.4.1 by default
-ARG TGI_VERSION=v2.4.1
+# TGI version 3.0.0 by default
+ARG TGI_VERSION=v3.0.0
 RUN test -n ${TGI_VERSION:?}
 RUN mkdir -p /tgi
 ADD https://github.com/huggingface/text-generation-inference/archive/${TGI_VERSION}.tar.gz /tgi/sources.tar.gz
 RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1
 
 # Build cargo components (adapted from TGI original Dockerfile)
-# Note that the build image is aligned on the same Linux version as the base image (Debian bookworm/ Ubuntu 22.04)
-FROM lukemathwalker/cargo-chef:latest-rust-1.80.1-bookworm AS chef
+# Note: we cannot use the cargo-chef base image as it uses python 3.11
+FROM ubuntu:22.04 AS chef
+
+RUN apt-get update -y \
+  && apt-get install -y --no-install-recommends \
+    curl ca-certificates build-essential \
+  && rm -rf /var/lib/apt/lists/* \
+  && apt-get clean
+
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.80.1 --profile minimal -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+RUN cargo install cargo-chef --locked
+
 WORKDIR /usr/src
 
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
 
 FROM chef AS planner
-COPY --from=tgi /tgi/Cargo.toml Cargo.toml
+COPY text-generation-inference/Cargo.toml Cargo.toml
 COPY --from=tgi /tgi/Cargo.lock Cargo.lock
 COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
 COPY --from=tgi /tgi/proto proto
-COPY --from=tgi /tgi/benchmark benchmark
 COPY --from=tgi /tgi/router router
 COPY --from=tgi /tgi/backends backends
 COPY --from=tgi /tgi/launcher launcher
@@ -29,22 +39,25 @@ FROM chef AS builder
 ARG ENABLE_GOOGLE_FEATURE
 RUN echo "Google Feature Status: ${ENABLE_GOOGLE_FEATURE}"
 
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    python3.11-dev
+RUN apt-get update -y \
+  && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    unzip python3-dev libssl-dev pkg-config \
+  && rm -rf /var/lib/apt/lists/* \
+  && apt-get clean
+
 RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
     curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
     unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
     unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
     rm -f $PROTOC_ZIP
 
+COPY text-generation-inference/Cargo.toml Cargo.toml
 COPY --from=planner /usr/src/recipe.json recipe.json
 RUN cargo chef cook --profile release-opt --recipe-path recipe.json
 
-COPY --from=tgi /tgi/Cargo.toml Cargo.toml
 COPY --from=tgi /tgi/Cargo.lock Cargo.lock
 COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
 COPY --from=tgi /tgi/proto proto
-COPY --from=tgi /tgi/benchmark benchmark
 COPY --from=tgi /tgi/router router
 COPY --from=tgi /tgi/backends backends
 COPY --from=tgi /tgi/launcher launcher
@@ -114,8 +127,6 @@ ARG VERSION=${VERSION}
 RUN apt-get update -y \
   && apt-get install -y --no-install-recommends \
     libpython3.10 \
-    libpython3.11 \
-    python3.11 \
     git \
     gnupg2 \
     wget \
@@ -142,9 +153,6 @@ ENV PORT=${PORT:-80}
 ENV HF_HOME=${ENABLE_GOOGLE_FEATURE:+/tmp}
 ENV HF_HOME=${HF_HOME:-/data}
 
-# Install requirements for TGI, that uses python3.11
-RUN python3.11 -m pip install transformers==${TRANSFORMERS_VERSION}
-
 # Install requirements for optimum-tpu, then for TGI then optimum-tpu
 RUN python3 -m pip install hf_transfer safetensors==${SAFETENSORS_VERSION} typer
 COPY --from=optimum-tpu-installer /opt/optimum-tpu /opt/optimum-tpu
@@ -152,8 +160,7 @@ RUN python3 /opt/optimum-tpu/optimum/tpu/cli.py install-jetstream-pytorch --yes
 RUN python3 -m pip install -e /opt/optimum-tpu \
     -f https://storage.googleapis.com/libtpu-releases/index.html
-# Install benchmarker
-COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+
 # Install router
 COPY --from=builder /usr/src/target/release-opt/text-generation-router-v2 /usr/local/bin/text-generation-router
 # Install launcher
 COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
@@ -196,5 +203,3 @@ ENTRYPOINT ["./entrypoint.sh"]
 FROM tpu_base
 
 ENTRYPOINT ["text-generation-launcher"]
-# This is commented out in the original TGI Dockerfile
-# CMD ["--json-output"]
diff --git a/text-generation-inference/docker/entrypoint.sh b/text-generation-inference/docker/entrypoint.sh
index e09a15a5..7621d352 100755
--- a/text-generation-inference/docker/entrypoint.sh
+++ b/text-generation-inference/docker/entrypoint.sh
@@ -10,6 +10,22 @@ if [[ -z "${MAX_BATCH_SIZE}" ]]; then
 fi
 export MAX_BATCH_SIZE="${MAX_BATCH_SIZE}"
 
+# At some point we used to have MAX_INPUT_LENGTH, now we should use MAX_INPUT_TOKENS
+# (This would be done automatically by the launcher, but we need to calculate the
+# MAX_BATCH_PREFILL_TOKENS if not set)
+if [[ -z "${MAX_INPUT_TOKENS}" && -n ${MAX_INPUT_LENGTH} ]]; then
+    MAX_INPUT_TOKENS=${MAX_INPUT_LENGTH}
+fi
+if [[ -n "${MAX_INPUT_LENGTH}" ]]; then
+    echo "MAX_INPUT_LENGTH is deprecated, please use MAX_INPUT_TOKENS instead. Variable will be unset."
+    unset MAX_INPUT_LENGTH
+fi
+
+if [[ -z "${MAX_BATCH_PREFILL_TOKENS}" ]]; then
+    MAX_BATCH_PREFILL_TOKENS=$(( ${MAX_BATCH_SIZE} * ${MAX_INPUT_TOKENS} ))
+fi
+export MAX_BATCH_PREFILL_TOKENS="${MAX_BATCH_PREFILL_TOKENS}"
+
 if [[ -z "${JSON_OUTPUT_DISABLE}" ]]; then
     JSON_OUTPUT_DISABLE=--json-output
 else
diff --git a/text-generation-inference/server/Makefile b/text-generation-inference/server/Makefile
index 56e481b0..1a8fc6ec 100644
--- a/text-generation-inference/server/Makefile
+++ b/text-generation-inference/server/Makefile
@@ -2,7 +2,7 @@ pkg_name := text_generation_server
 BUILDDIR ?= $(CURDIR)/build
 VERSION ?= 0.0.1
-TGI_VERSION ?= "v2.4.1"
+TGI_VERSION ?= "v3.0.0"
 mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
 mkfile_dir := $(dir $(mkfile_path))
 pkg_dir := $(BUILDDIR)/$(pkg_name)
diff --git a/text-generation-inference/server/build-requirements.txt b/text-generation-inference/server/build-requirements.txt
index 64b2b6e9..562cb819 100644
--- a/text-generation-inference/server/build-requirements.txt
+++ b/text-generation-inference/server/build-requirements.txt
@@ -1,3 +1,3 @@
 build
-grpcio-tools==1.62.1
-mypy-protobuf==3.2.0
\ No newline at end of file
+grpcio-tools==1.53.0
+mypy-protobuf
\ No newline at end of file
diff --git a/text-generation-inference/server/text_generation_server/cli.py b/text-generation-inference/server/text_generation_server/cli.py
index ab5fc10f..363da708 100644
--- a/text-generation-inference/server/text_generation_server/cli.py
+++ b/text-generation-inference/server/text_generation_server/cli.py
@@ -99,6 +99,7 @@ def download_weights(
     auto_convert: Optional[bool] = None,
     extension: Optional[str] = None,
     trust_remote_code: Optional[bool] = None,
+    merge_lora: Optional[bool] = None,
 ):
     """Download the model weights.
 
@@ -122,6 +123,8 @@ def download_weights(
         logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
     if auto_convert is not None:
         logger.warning("'auto_convert' argument is not supported and will be ignored.")
+    if merge_lora is not None:
+        logger.warning("'merge_lora' argument is not supported and will be ignored.")
 
     # Import here after the logger is added to log potential import exceptions
     from optimum.tpu.model import fetch_model
diff --git a/text-generation-inference/server/text_generation_server/generator.py b/text-generation-inference/server/text_generation_server/generator.py
index cf7e1f3b..277a7b3f 100644
--- a/text-generation-inference/server/text_generation_server/generator.py
+++ b/text-generation-inference/server/text_generation_server/generator.py
@@ -298,6 +298,12 @@ def attention_mask(self) -> torch.LongTensor:
     def max_token(self) -> int:
         return self._generation_config.max_length
 
+    @property
+    def max_new_tokens(self) -> int:
+        # The current value of max_new_tokens: might be different from the target max_new_tokens
+        # if the slot has been paused and resumed.
+        return self._generation_config.max_new_tokens
+
 
 class TpuGeneratorSingleThread(Generator):
     """A Generator for models running on TPU, single threaded."""
@@ -474,6 +480,9 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
             slot.assign(self.batch_id, request, self.model.generation_config)
             self.slots.append(slot)
             logger.debug(f"Request {slot.request_id} assigned to slot {slot.id}")
+            logger.debug(
+                f"Request {slot.request_id} assigned to slot {slot.id} with max_new_tokens {slot.max_new_tokens}"
+            )
         # Reconstruct the full inputs (without padding) as seen by the model.
         # This comprises:
         # - the inputs for new requests,
@@ -576,6 +585,8 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
             slot.append(next_token)
             slot.resume()
         logger.debug("Model ready for decoding")
+        if next_batch is not None:
+            logger.debug(f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}")
         return generation, next_batch
 
     @torch.no_grad
@@ -704,14 +715,16 @@ def _post_generate(
         if next_token == self.tokenizer.eos_token_id:
             finish_reason = FinishReason.FINISH_REASON_EOS_TOKEN
         elif slot.stopped:
-            # For now we only support the length stopping criteria
-            finish_reason = FinishReason.FINISH_REASON_LENGTH
+            if slot.generated_tokens == slot.max_new_tokens:
+                finish_reason = FinishReason.FINISH_REASON_LENGTH
+            else:
+                finish_reason = FinishReason.FINISH_REASON_STOP_SEQUENCE
         if finish_reason is not None:
             # We must include the generated text for each finished sequence in the response
             generated_text = GeneratedText(
                 text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
             )
-            logger.debug(f"Finished generating tokens for request {request_id}")
+            logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
             # This slot is now empty, it will be removed from the list of
             # active slots once a new prefill is requested
             slot.clear()
diff --git a/text-generation-inference/server/text_generation_server/jetstream_pt_support/generator.py b/text-generation-inference/server/text_generation_server/jetstream_pt_support/generator.py
index f4a2e653..1baed358 100644
--- a/text-generation-inference/server/text_generation_server/jetstream_pt_support/generator.py
+++ b/text-generation-inference/server/text_generation_server/jetstream_pt_support/generator.py
@@ -228,6 +228,11 @@ def empty(self) -> bool:
     def seed(self) -> int:
         return self._seed
 
+    @property
+    def max_new_tokens(self) -> int:
+        # The current value of max_new_tokens: might be different from the target max_new_tokens
+        # if the slot has been paused and resumed.
+        return self._generation_config.max_new_tokens
 
 class PrefillSlot:
     def __init__(self):
@@ -443,7 +448,9 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
             self.prefill_slot.set(slot)
             self.slot_index += 1
             slot.assign(self.batch_id, request, self.model.generation_config)
-            logger.debug(f"Request {slot.request_id} assigned to slot {slot.id}")
+            logger.debug(
+                f"Request {slot.request_id} assigned to slot {slot.id} with max_new_tokens {slot.max_new_tokens}"
+            )
 
             # Tokenize the inputs
             input_ids, true_lengths = self._token_encode(request.inputs, slot.truncate)
@@ -475,6 +482,8 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
         cached_batch = self._cached_batch(self.batch_id, prefilled_active_slots)
         self.batch_id += 1
         logger.debug("Model ready for decoding")
+        if cached_batch is not None:
+            logger.debug(f"Next batch is {cached_batch.id} with requests: {cached_batch.request_ids}")
         return generations, cached_batch
 
     def _select_from_slots(self, logits: jnp.ndarray, batch_size: int=0) -> jnp.ndarray:
@@ -566,15 +575,17 @@ def _post_generate(self, slot: Slot, next_token: int, generations: List[Generati
         if next_token == self.tokenizer.eos_token_id:
             finish_reason = FinishReason.FINISH_REASON_EOS_TOKEN
         elif slot.stopped:
-            # For now we only support the length stopping criteria
-            finish_reason = FinishReason.FINISH_REASON_LENGTH
+            if slot.generated_tokens == slot.max_new_tokens:
+                finish_reason = FinishReason.FINISH_REASON_LENGTH
+            else:
+                finish_reason = FinishReason.FINISH_REASON_STOP_SEQUENCE
         request_id = slot.request_id
         if finish_reason is not None:
             # We must include the generated text for each finished sequence in the response
             generated_text = GeneratedText(
                 text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
             )
-            logger.debug(f"Finished generating tokens for request {request_id}")
+            logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
             # This slot is now empty, it will be removed from the list of
             # active slots.
             slot.clear()