From 5c07717eac52d76eaf4a5281d9ba4dacb7014b5e Mon Sep 17 00:00:00 2001
From: Sahil
Date: Tue, 1 Oct 2024 15:54:02 -0700
Subject: [PATCH 1/2] Modify get_cost to account for prompt caching

---
 src/autolabel/models/anthropic.py          |  2 +-
 src/autolabel/models/base.py               | 10 ++++--
 src/autolabel/models/cohere.py             |  4 ++-
 src/autolabel/models/google.py             |  4 ++-
 src/autolabel/models/hf_pipeline.py        |  4 ++-
 src/autolabel/models/hf_pipeline_vision.py |  4 ++-
 src/autolabel/models/mistral.py            |  4 ++-
 src/autolabel/models/openai.py             | 36 ++++++++++++++++------
 src/autolabel/models/openai_vision.py      |  4 ++-
 src/autolabel/models/refuelV2.py           |  4 ++-
 src/autolabel/models/vllm.py               |  6 ++--
 src/autolabel/schema.py                    |  3 ++
 12 files changed, 64 insertions(+), 21 deletions(-)

diff --git a/src/autolabel/models/anthropic.py b/src/autolabel/models/anthropic.py
index 3968fdd6..186a2025 100644
--- a/src/autolabel/models/anthropic.py
+++ b/src/autolabel/models/anthropic.py
@@ -105,7 +105,7 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None) -> float:
         num_prompt_toks = len(self.tokenizer.encode(prompt).ids)
         if label:
             num_label_toks = len(self.tokenizer.encode(label).ids)
diff --git a/src/autolabel/models/base.py b/src/autolabel/models/base.py
index e4b1365f..24e0483b 100644
--- a/src/autolabel/models/base.py
+++ b/src/autolabel/models/base.py
@@ -50,7 +50,11 @@ async def label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResul
             new_results = self._label(missing_prompts, output_schema)
             for ind, prompt in enumerate(missing_prompts):
                 costs.append(
-                    self.get_cost(prompt, label=new_results.generations[ind][0].text)
+                    self.get_cost(
+                        prompt,
+                        label=new_results.generations[ind][0].text,
+                        llm_output=new_results.llm_output,
+                    )
                 )
 
             # Set the existing prompts to the new results
@@ -77,7 +81,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
         pass
 
     @abstractmethod
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         pass
 
     def get_cached_prompts(self, prompts: List[str]) -> Optional[str]:
diff --git a/src/autolabel/models/cohere.py b/src/autolabel/models/cohere.py
index 55d2be53..bbcdad7c 100644
--- a/src/autolabel/models/cohere.py
+++ b/src/autolabel/models/cohere.py
@@ -66,7 +66,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         num_prompt_toks = len(self.co.tokenize(prompt).tokens)
         if label:
             num_label_toks = len(self.co.tokenize(label).tokens)
diff --git a/src/autolabel/models/google.py b/src/autolabel/models/google.py
index 4731f302..72d69d14 100644
--- a/src/autolabel/models/google.py
+++ b/src/autolabel/models/google.py
@@ -151,7 +151,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         if self.model_name is None:
             return 0.0
         cost_per_prompt_token = self.COST_PER_PROMPT_TOKEN[self.model_name]
diff --git a/src/autolabel/models/hf_pipeline.py b/src/autolabel/models/hf_pipeline.py
index 775b3a20..ea9b3202 100644
--- a/src/autolabel/models/hf_pipeline.py
+++ b/src/autolabel/models/hf_pipeline.py
@@ -116,7 +116,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         # Model inference for this model is being run locally
         # Revisit this in the future when we support HF inference endpoints
         return 0.0
diff --git a/src/autolabel/models/hf_pipeline_vision.py b/src/autolabel/models/hf_pipeline_vision.py
index 49ef94ae..96f89cde 100644
--- a/src/autolabel/models/hf_pipeline_vision.py
+++ b/src/autolabel/models/hf_pipeline_vision.py
@@ -107,7 +107,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             generations=generations, errors=[None] * len(generations)
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         # Model inference for this model is being run locally
         # Revisit this in the future when we support HF inference endpoints
         return 0.0
diff --git a/src/autolabel/models/mistral.py b/src/autolabel/models/mistral.py
index 260e23ad..d8deed0e 100644
--- a/src/autolabel/models/mistral.py
+++ b/src/autolabel/models/mistral.py
@@ -197,7 +197,9 @@ async def _alabel(self, prompts: List[str], output_schema: Dict) -> RefuelLLMRes
             generations=generations, errors=errors, latencies=latencies
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         cost_per_prompt_char = self.COST_PER_PROMPT_TOKEN[self.model_name]
         cost_per_completion_char = self.COST_PER_COMPLETION_TOKEN[self.model_name]
         return cost_per_prompt_char * len(prompt) + cost_per_completion_char * (
diff --git a/src/autolabel/models/openai.py b/src/autolabel/models/openai.py
index 1d0da475..2f5b4747 100644
--- a/src/autolabel/models/openai.py
+++ b/src/autolabel/models/openai.py
@@ -235,6 +235,7 @@ async def _alabel(self, prompts: List[str], output_schema: Dict) -> RefuelLLMRes
         end_time = time()
         return RefuelLLMResult(
             generations=generations,
+            llm_output=result.llm_output,
             errors=[None] * len(generations),
             latencies=[end_time - start_time] * len(generations),
         )
@@ -307,6 +308,7 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
         end_time = time()
         return RefuelLLMResult(
             generations=generations,
+            llm_output=result.llm_output,
             errors=[None] * len(generations),
             latencies=[end_time - start_time] * len(generations),
         )
@@ -339,19 +341,35 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
-        encoding = self.tiktoken.encoding_for_model(self.model_name)
-        num_prompt_toks = len(encoding.encode(prompt))
-        if label:
-            num_label_toks = len(encoding.encode(label))
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
+        num_cached_toks = 0
+        if llm_output and "token_usage" in llm_output:
+            num_prompt_toks = llm_output["token_usage"]["prompt_tokens"]
+            num_label_toks = llm_output["token_usage"]["completion_tokens"]
+            num_cached_toks = (
+                llm_output["token_usage"]
+                .get("prompt_tokens_details", {})
+                .get("cached_tokens", 0)
+            )
+            num_prompt_toks -= num_cached_toks
         else:
-            # get an upper bound
-            num_label_toks = self.model_params["max_tokens"]
+            encoding = self.tiktoken.encoding_for_model(self.model_name)
+            num_prompt_toks = len(encoding.encode(prompt))
+            if label:
+                num_label_toks = len(encoding.encode(label))
+            else:
+                # get an upper bound
+                num_label_toks = self.model_params["max_tokens"]
 
         cost_per_prompt_token = self.COST_PER_PROMPT_TOKEN[self.model_name]
+        cost_per_cached_prompt_token = cost_per_prompt_token / 2.0
         cost_per_completion_token = self.COST_PER_COMPLETION_TOKEN[self.model_name]
-        return (num_prompt_toks * cost_per_prompt_token) + (
-            num_label_toks * cost_per_completion_token
+        return (
+            (num_prompt_toks * cost_per_prompt_token)
+            + (num_cached_toks * cost_per_cached_prompt_token)
+            + (num_label_toks * cost_per_completion_token)
         )
 
     def returns_token_probs(self) -> bool:
diff --git a/src/autolabel/models/openai_vision.py b/src/autolabel/models/openai_vision.py
index b37a2336..6f7f0e24 100644
--- a/src/autolabel/models/openai_vision.py
+++ b/src/autolabel/models/openai_vision.py
@@ -136,7 +136,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[time() - start_time] * len(generations),
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         encoding = self.tiktoken.encoding_for_model(self.model_name)
         num_prompt_toks = len(encoding.encode(prompt))
         if label:
diff --git a/src/autolabel/models/refuelV2.py b/src/autolabel/models/refuelV2.py
index 40eb16e9..11dc10f7 100644
--- a/src/autolabel/models/refuelV2.py
+++ b/src/autolabel/models/refuelV2.py
@@ -279,7 +279,9 @@ def _prepare_output_schema(self, schema: Dict) -> Dict:
                 curr_schema[key] = self._prepare_output_schema(curr_schema[key])
         return curr_schema
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         return 0
 
     def returns_token_probs(self) -> bool:
diff --git a/src/autolabel/models/vllm.py b/src/autolabel/models/vllm.py
index a4ea51a3..9a2b72e4 100644
--- a/src/autolabel/models/vllm.py
+++ b/src/autolabel/models/vllm.py
@@ -1,5 +1,5 @@
 import logging
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 from autolabel.models import BaseModel
 from autolabel.configs import AutolabelConfig
@@ -115,7 +115,9 @@ def _process_confidence_request(self, logprobs):
             resp.append({curr_logprob_obj.decoded_token: curr_logprob_obj.logprob})
         return resp
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         return 0
 
     def returns_token_probs(self) -> bool:
diff --git a/src/autolabel/schema.py b/src/autolabel/schema.py
index 4fac3c7a..08d434ca 100644
--- a/src/autolabel/schema.py
+++ b/src/autolabel/schema.py
@@ -199,6 +199,9 @@ class RefuelLLMResult(BaseModel):
 
     generations: List[List[Union[Generation, ChatGeneration]]]
 
+    """Arbitrary LLM provider-specific output."""
+    llm_output: Optional[dict] = None
+
     """Errors encountered while running the labeling job"""
     errors: List[Optional[LabelingError]]
 

From 96de3209326875fbd657c3ce263878e31036005f Mon Sep 17 00:00:00 2001
From: Sahil
Date: Tue, 1 Oct 2024 17:40:49 -0700
Subject: [PATCH 2/2] black fmt

---
 src/autolabel/models/anthropic.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/autolabel/models/anthropic.py b/src/autolabel/models/anthropic.py
index 186a2025..581f236f 100644
--- a/src/autolabel/models/anthropic.py
+++ b/src/autolabel/models/anthropic.py
@@ -105,7 +105,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None) -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         num_prompt_toks = len(self.tokenizer.encode(prompt).ids)
         if label:
             num_label_toks = len(self.tokenizer.encode(label).ids)
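
For reference, below is a minimal sketch (not part of the patch) of the cost
arithmetic the openai.py hunk implements when the provider reports cached
prompt tokens in llm_output. The token counts and per-token rates are made-up
illustration values; the only detail taken from the patch itself is that
cached prompt tokens are billed at half the regular prompt-token rate
(cost_per_prompt_token / 2.0).

    # Illustrative llm_output, mirroring the token_usage shape read by get_cost.
    llm_output = {
        "token_usage": {
            "prompt_tokens": 1200,  # total prompt tokens, including cached ones
            "completion_tokens": 50,
            "prompt_tokens_details": {"cached_tokens": 1000},
        }
    }

    usage = llm_output["token_usage"]
    cached_toks = usage.get("prompt_tokens_details", {}).get("cached_tokens", 0)
    prompt_toks = usage["prompt_tokens"] - cached_toks  # uncached portion only
    label_toks = usage["completion_tokens"]

    # Placeholder per-token rates (not real pricing).
    cost_per_prompt_token = 2.5e-06
    cost_per_cached_prompt_token = cost_per_prompt_token / 2.0
    cost_per_completion_token = 1e-05

    cost = (
        (prompt_toks * cost_per_prompt_token)
        + (cached_toks * cost_per_cached_prompt_token)
        + (label_toks * cost_per_completion_token)
    )
    # 200 * 2.5e-06 + 1000 * 1.25e-06 + 50 * 1e-05 = 0.00225

When llm_output is unavailable (for example, cached results replayed without
provider usage data), get_cost falls back to the existing tiktoken estimate and
no cached-token discount is applied.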