From 5c07717eac52d76eaf4a5281d9ba4dacb7014b5e Mon Sep 17 00:00:00 2001
From: Sahil
Date: Tue, 1 Oct 2024 15:54:02 -0700
Subject: [PATCH 1/2] Modify get_cost to account for prompt caching

---
 src/autolabel/models/anthropic.py          |  2 +-
 src/autolabel/models/base.py               | 10 ++++--
 src/autolabel/models/cohere.py             |  4 ++-
 src/autolabel/models/google.py             |  4 ++-
 src/autolabel/models/hf_pipeline.py        |  4 ++-
 src/autolabel/models/hf_pipeline_vision.py |  4 ++-
 src/autolabel/models/mistral.py            |  4 ++-
 src/autolabel/models/openai.py             | 36 ++++++++++++++++------
 src/autolabel/models/openai_vision.py      |  4 ++-
 src/autolabel/models/refuelV2.py           |  4 ++-
 src/autolabel/models/vllm.py               |  6 ++--
 src/autolabel/schema.py                    |  3 ++
 12 files changed, 64 insertions(+), 21 deletions(-)

diff --git a/src/autolabel/models/anthropic.py b/src/autolabel/models/anthropic.py
index 3968fdd6..186a2025 100644
--- a/src/autolabel/models/anthropic.py
+++ b/src/autolabel/models/anthropic.py
@@ -105,7 +105,7 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None) -> float:
         num_prompt_toks = len(self.tokenizer.encode(prompt).ids)
         if label:
             num_label_toks = len(self.tokenizer.encode(label).ids)
diff --git a/src/autolabel/models/base.py b/src/autolabel/models/base.py
index e4b1365f..24e0483b 100644
--- a/src/autolabel/models/base.py
+++ b/src/autolabel/models/base.py
@@ -50,7 +50,11 @@ async def label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResul
             new_results = self._label(missing_prompts, output_schema)
             for ind, prompt in enumerate(missing_prompts):
                 costs.append(
-                    self.get_cost(prompt, label=new_results.generations[ind][0].text)
+                    self.get_cost(
+                        prompt,
+                        label=new_results.generations[ind][0].text,
+                        llm_output=new_results.llm_output,
+                    )
                 )
 
             # Set the existing prompts to the new results
@@ -77,7 +81,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
         pass
 
     @abstractmethod
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         pass
 
     def get_cached_prompts(self, prompts: List[str]) -> Optional[str]:
diff --git a/src/autolabel/models/cohere.py b/src/autolabel/models/cohere.py
index 55d2be53..bbcdad7c 100644
--- a/src/autolabel/models/cohere.py
+++ b/src/autolabel/models/cohere.py
@@ -66,7 +66,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         num_prompt_toks = len(self.co.tokenize(prompt).tokens)
         if label:
             num_label_toks = len(self.co.tokenize(label).tokens)
diff --git a/src/autolabel/models/google.py b/src/autolabel/models/google.py
index 4731f302..72d69d14 100644
--- a/src/autolabel/models/google.py
+++ b/src/autolabel/models/google.py
@@ -151,7 +151,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         if self.model_name is None:
             return 0.0
         cost_per_prompt_token = self.COST_PER_PROMPT_TOKEN[self.model_name]
diff --git a/src/autolabel/models/hf_pipeline.py b/src/autolabel/models/hf_pipeline.py
index 775b3a20..ea9b3202 100644
--- a/src/autolabel/models/hf_pipeline.py
+++ b/src/autolabel/models/hf_pipeline.py
@@ -116,7 +116,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         # Model inference for this model is being run locally
         # Revisit this in the future when we support HF inference endpoints
         return 0.0
diff --git a/src/autolabel/models/hf_pipeline_vision.py b/src/autolabel/models/hf_pipeline_vision.py
index 49ef94ae..96f89cde 100644
--- a/src/autolabel/models/hf_pipeline_vision.py
+++ b/src/autolabel/models/hf_pipeline_vision.py
@@ -107,7 +107,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             generations=generations, errors=[None] * len(generations)
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         # Model inference for this model is being run locally
         # Revisit this in the future when we support HF inference endpoints
         return 0.0
diff --git a/src/autolabel/models/mistral.py b/src/autolabel/models/mistral.py
index 260e23ad..d8deed0e 100644
--- a/src/autolabel/models/mistral.py
+++ b/src/autolabel/models/mistral.py
@@ -197,7 +197,9 @@ async def _alabel(self, prompts: List[str], output_schema: Dict) -> RefuelLLMRes
             generations=generations, errors=errors, latencies=latencies
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         cost_per_prompt_char = self.COST_PER_PROMPT_TOKEN[self.model_name]
         cost_per_completion_char = self.COST_PER_COMPLETION_TOKEN[self.model_name]
         return cost_per_prompt_char * len(prompt) + cost_per_completion_char * (
diff --git a/src/autolabel/models/openai.py b/src/autolabel/models/openai.py
index 1d0da475..2f5b4747 100644
--- a/src/autolabel/models/openai.py
+++ b/src/autolabel/models/openai.py
@@ -235,6 +235,7 @@ async def _alabel(self, prompts: List[str], output_schema: Dict) -> RefuelLLMRes
         end_time = time()
         return RefuelLLMResult(
             generations=generations,
+            llm_output=result.llm_output,
             errors=[None] * len(generations),
             latencies=[end_time - start_time] * len(generations),
         )
@@ -307,6 +308,7 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
         end_time = time()
         return RefuelLLMResult(
             generations=generations,
+            llm_output=result.llm_output,
             errors=[None] * len(generations),
             latencies=[end_time - start_time] * len(generations),
         )
@@ -339,19 +341,35 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
-        encoding = self.tiktoken.encoding_for_model(self.model_name)
-        num_prompt_toks = len(encoding.encode(prompt))
-        if label:
-            num_label_toks = len(encoding.encode(label))
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
+        num_cached_toks = 0
+        if llm_output and "token_usage" in llm_output:
+            num_prompt_toks = llm_output["token_usage"]["prompt_tokens"]
+            num_label_toks = llm_output["token_usage"]["completion_tokens"]
+            num_cached_toks = (
+                llm_output["token_usage"]
+                .get("prompt_tokens_details", {})
+                .get("cached_tokens", 0)
+            )
+            num_prompt_toks -= num_cached_toks
         else:
-            # get an upper bound
-            num_label_toks = self.model_params["max_tokens"]
+            encoding = self.tiktoken.encoding_for_model(self.model_name)
+            num_prompt_toks = len(encoding.encode(prompt))
+            if label:
+                num_label_toks = len(encoding.encode(label))
+            else:
+                # get an upper bound
+                num_label_toks = self.model_params["max_tokens"]
 
         cost_per_prompt_token = self.COST_PER_PROMPT_TOKEN[self.model_name]
+        cost_per_cached_prompt_token = cost_per_prompt_token / 2.0
         cost_per_completion_token = self.COST_PER_COMPLETION_TOKEN[self.model_name]
-        return (num_prompt_toks * cost_per_prompt_token) + (
-            num_label_toks * cost_per_completion_token
+        return (
+            (num_prompt_toks * cost_per_prompt_token)
+            + (num_cached_toks * cost_per_cached_prompt_token)
+            + (num_label_toks * cost_per_completion_token)
         )
 
     def returns_token_probs(self) -> bool:
diff --git a/src/autolabel/models/openai_vision.py b/src/autolabel/models/openai_vision.py
index b37a2336..6f7f0e24 100644
--- a/src/autolabel/models/openai_vision.py
+++ b/src/autolabel/models/openai_vision.py
@@ -136,7 +136,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[time() - start_time] * len(generations),
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         encoding = self.tiktoken.encoding_for_model(self.model_name)
         num_prompt_toks = len(encoding.encode(prompt))
         if label:
diff --git a/src/autolabel/models/refuelV2.py b/src/autolabel/models/refuelV2.py
index 40eb16e9..11dc10f7 100644
--- a/src/autolabel/models/refuelV2.py
+++ b/src/autolabel/models/refuelV2.py
@@ -279,7 +279,9 @@ def _prepare_output_schema(self, schema: Dict) -> Dict:
                 curr_schema[key] = self._prepare_output_schema(curr_schema[key])
         return curr_schema
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         return 0
 
     def returns_token_probs(self) -> bool:
diff --git a/src/autolabel/models/vllm.py b/src/autolabel/models/vllm.py
index a4ea51a3..9a2b72e4 100644
--- a/src/autolabel/models/vllm.py
+++ b/src/autolabel/models/vllm.py
@@ -1,5 +1,5 @@
 import logging
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 from autolabel.models import BaseModel
 from autolabel.configs import AutolabelConfig
@@ -115,7 +115,9 @@ def _process_confidence_request(self, logprobs):
             resp.append({curr_logprob_obj.decoded_token: curr_logprob_obj.logprob})
         return resp
 
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         return 0
 
     def returns_token_probs(self) -> bool:
diff --git a/src/autolabel/schema.py b/src/autolabel/schema.py
index 4fac3c7a..08d434ca 100644
--- a/src/autolabel/schema.py
+++ b/src/autolabel/schema.py
@@ -199,6 +199,9 @@ class RefuelLLMResult(BaseModel):
 
     generations: List[List[Union[Generation, ChatGeneration]]]
 
+    """Arbitrary LLM provider-specific output."""
+    llm_output: Optional[dict] = None
+
     """Errors encountered while running the labeling job"""
     errors: List[Optional[LabelingError]]
 

From 96de3209326875fbd657c3ce263878e31036005f Mon Sep 17 00:00:00 2001
From: Sahil
Date: Tue, 1 Oct 2024 17:40:49 -0700
Subject: [PATCH 2/2] black fmt

---
 src/autolabel/models/anthropic.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/autolabel/models/anthropic.py b/src/autolabel/models/anthropic.py
index 186a2025..581f236f 100644
--- a/src/autolabel/models/anthropic.py
+++ b/src/autolabel/models/anthropic.py
@@ -105,7 +105,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )
 
-    def get_cost(self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None) -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         num_prompt_toks = len(self.tokenizer.encode(prompt).ids)
         if label:
             num_label_toks = len(self.tokenizer.encode(label).ids)
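
For reference, below is a minimal sketch (not part of the patch) of the cost
arithmetic the openai.py hunk implements when the provider reports cached
prompt tokens in llm_output. The token counts and per-token rates are made-up
illustration values; the only detail taken from the patch itself is that
cached prompt tokens are billed at half the regular prompt-token rate
(cost_per_prompt_token / 2.0).

    # Illustrative llm_output, mirroring the token_usage shape read by get_cost.
    llm_output = {
        "token_usage": {
            "prompt_tokens": 1200,  # total prompt tokens, including cached ones
            "completion_tokens": 50,
            "prompt_tokens_details": {"cached_tokens": 1000},
        }
    }

    usage = llm_output["token_usage"]
    cached_toks = usage.get("prompt_tokens_details", {}).get("cached_tokens", 0)
    prompt_toks = usage["prompt_tokens"] - cached_toks  # uncached portion only
    label_toks = usage["completion_tokens"]

    # Placeholder per-token rates (not real pricing).
    cost_per_prompt_token = 2.5e-06
    cost_per_cached_prompt_token = cost_per_prompt_token / 2.0
    cost_per_completion_token = 1e-05

    cost = (
        (prompt_toks * cost_per_prompt_token)
        + (cached_toks * cost_per_cached_prompt_token)
        + (label_toks * cost_per_completion_token)
    )
    # 200 * 2.5e-06 + 1000 * 1.25e-06 + 50 * 1e-05 = 0.00225

When llm_output is unavailable (for example, cached results replayed without
provider usage data), get_cost falls back to the existing tiktoken estimate and
no cached-token discount is applied.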