[Pipelines] Remove end_pos output from Llama graph
This value is not used in either the naive or continuous batching path, and it adds unnecessary handling to the code, so it is being removed.

MODULAR_ORIG_COMMIT_REV_ID: 43904ccf482234e116ef45fc251e24b7734113ae
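For orientation before reading the diff: a minimal sketch of the output handling that remains once end_pos is gone. `ModelOutputs` here is a simplified stand-in dataclass and `wrap_outputs` is a hypothetical helper mirroring the new execute() body shown below; neither is code from the repository.

```python
from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class ModelOutputs:
    # Simplified stand-in for the pipeline's ModelOutputs type.
    next_token_logits: Any
    logits: Optional[Any] = None


def wrap_outputs(model_outputs: tuple, enable_echo: bool) -> ModelOutputs:
    # With end_pos removed, the graph's outputs no longer depend on the KV
    # cache strategy: slot 0 is always the next-token logits, and slot 1 is
    # the full logits tensor only when echo is enabled.
    if enable_echo:
        return ModelOutputs(
            next_token_logits=model_outputs[0], logits=model_outputs[1]
        )
    return ModelOutputs(next_token_logits=model_outputs[0])
```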
KCaverly authored and modularbot committed Dec 17, 2024
1 parent ddc384e commit 7995339
Showing 2 changed files with 12 additions and 23 deletions.
33 changes: 12 additions & 21 deletions pipelines/python/llama3/model.py
@@ -48,24 +48,13 @@ def execute(self, *model_inputs: Tensor) -> ModelOutputs:
             *model_inputs, copy_inputs_to_device=False
         )
 
-        if self.pipeline_config.cache_strategy == KVCacheStrategy.CONTINUOUS:
-            if self.pipeline_config.enable_echo:
-                assert len(model_outputs) == 2
-                return ModelOutputs(
-                    next_token_logits=model_outputs[0], logits=model_outputs[1]
-                )
-            else:
-                assert len(model_outputs) == 1
-                return ModelOutputs(next_token_logits=model_outputs[0])
+        if self.pipeline_config.enable_echo:
+            return ModelOutputs(
+                next_token_logits=model_outputs[0],
+                logits=model_outputs[1],
+            )
         else:
-            if self.pipeline_config.enable_echo:
-                assert len(model_outputs) == 3
-                return ModelOutputs(
-                    next_token_logits=model_outputs[0], logits=model_outputs[2]
-                )
-            else:
-                assert len(model_outputs) == 2
-                return ModelOutputs(next_token_logits=model_outputs[0])
+            return ModelOutputs(next_token_logits=model_outputs[0])
 
     def _prepare_continuous_initial_token_inputs(
         self, context_batch: list[TextContext]
@@ -283,17 +272,19 @@ def _build_graph(self, weights: GGUFWeights) -> Graph:
                 ]
                 else DType.float32
             )
-            logits, end_pos = model(
+            logits = model(
                 tokens,
                 attention_mask.cast(mask_dtype),
                 k_cache,
                 v_cache,
                 start_pos,
-            )
+            )[0]
 
             if self.pipeline_config.enable_echo:
-                graph.output(logits[:, -1], end_pos, logits)
+                graph.output(logits[:, -1], logits)
             else:
-                graph.output(logits[:, -1], end_pos)
+                graph.output(logits[:, -1])
 
             return graph
 
     def compute_log_probabilities(
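As a note on what the removed asserts encoded: the compiled graph's output arity previously depended on both the KV cache strategy and enable_echo, and after this change it depends on enable_echo alone. The helpers below are purely illustrative (not repository code) and simply restate the counts from the asserts deleted in the execute() hunk above.

```python
def old_num_graph_outputs(enable_echo: bool, continuous_batching: bool) -> int:
    # Counts taken from the asserts removed in execute().
    if continuous_batching:
        return 2 if enable_echo else 1  # continuous graph never emitted end_pos
    return 3 if enable_echo else 2      # naive graph emitted end_pos as well


def new_num_graph_outputs(enable_echo: bool) -> int:
    return 2 if enable_echo else 1      # end_pos gone on every path


# Both paths now share the continuous-batching arity.
assert new_num_graph_outputs(True) == old_num_graph_outputs(True, True)
assert new_num_graph_outputs(False) == old_num_graph_outputs(False, True)
```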
2 changes: 0 additions & 2 deletions pipelines/python/nn/transformer/naive_transformer.py
@@ -88,8 +88,6 @@ def __call__(
                 i,
             )
 
-        seq_len = TensorValue(tokens.shape[1])  # type: ignore
         return (
             ops.cast(self.output(self.norm(h)), k_cache.dtype),  # type: ignore
-            start_pos + seq_len,
         )
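To make the naive transformer's new return contract concrete, here is a hedged NumPy sketch; `output`, `norm`, and the cache dtype are stand-ins for the names in the diff above, and the helper itself is illustrative rather than the real NaiveTransformer.__call__.

```python
import numpy as np


def naive_transformer_call(h: np.ndarray, output_w: np.ndarray,
                           cache_dtype=np.float32) -> tuple:
    """Illustrative stand-in for the naive transformer's __call__ after this change."""
    # The old (logits, start_pos + seq_len) pair becomes a 1-tuple of logits
    # cast to the KV cache dtype; position bookkeeping stays in the pipeline,
    # which is why model.py above now indexes the call result with [0].
    logits = h @ output_w  # stands in for ops.cast(self.output(self.norm(h)), ...)
    return (logits.astype(cache_dtype),)


# Caller side, mirroring `logits = model(...)[0]` in the Llama graph builder:
hidden = np.random.rand(2, 8, 16).astype(np.float32)
weights = np.random.rand(16, 32).astype(np.float32)
logits = naive_transformer_call(hidden, weights)[0]
assert logits.shape == (2, 8, 32)
```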