Modify get_cost to account for prompt caching #911

Open · wants to merge 2 commits into base: main
4 changes: 3 additions & 1 deletion src/autolabel/models/anthropic.py
@@ -105,7 +105,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         num_prompt_toks = len(self.tokenizer.encode(prompt).ids)
         if label:
             num_label_toks = len(self.tokenizer.encode(label).ids)
10 changes: 8 additions & 2 deletions src/autolabel/models/base.py
@@ -50,7 +50,11 @@ async def label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             new_results = self._label(missing_prompts, output_schema)
             for ind, prompt in enumerate(missing_prompts):
                 costs.append(
-                    self.get_cost(prompt, label=new_results.generations[ind][0].text)
+                    self.get_cost(
+                        prompt,
+                        label=new_results.generations[ind][0].text,
+                        llm_output=new_results.llm_output,
+                    )
                 )

             # Set the existing prompts to the new results
@@ -77,7 +81,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
         pass

     @abstractmethod
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         pass

     def get_cached_prompts(self, prompts: List[str]) -> Optional[str]:
4 changes: 3 additions & 1 deletion src/autolabel/models/cohere.py
@@ -66,7 +66,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         num_prompt_toks = len(self.co.tokenize(prompt).tokens)
         if label:
             num_label_toks = len(self.co.tokenize(label).tokens)
4 changes: 3 additions & 1 deletion src/autolabel/models/google.py
@@ -151,7 +151,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         if self.model_name is None:
             return 0.0
         cost_per_prompt_token = self.COST_PER_PROMPT_TOKEN[self.model_name]
4 changes: 3 additions & 1 deletion src/autolabel/models/hf_pipeline.py
@@ -116,7 +116,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         # Model inference for this model is being run locally
         # Revisit this in the future when we support HF inference endpoints
         return 0.0
4 changes: 3 additions & 1 deletion src/autolabel/models/hf_pipeline_vision.py
@@ -107,7 +107,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             generations=generations, errors=[None] * len(generations)
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         # Model inference for this model is being run locally
         # Revisit this in the future when we support HF inference endpoints
         return 0.0
4 changes: 3 additions & 1 deletion src/autolabel/models/mistral.py
@@ -197,7 +197,9 @@ async def _alabel(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             generations=generations, errors=errors, latencies=latencies
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         cost_per_prompt_char = self.COST_PER_PROMPT_TOKEN[self.model_name]
         cost_per_completion_char = self.COST_PER_COMPLETION_TOKEN[self.model_name]
         return cost_per_prompt_char * len(prompt) + cost_per_completion_char * (
36 changes: 27 additions & 9 deletions src/autolabel/models/openai.py
@@ -235,6 +235,7 @@ async def _alabel(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
         end_time = time()
         return RefuelLLMResult(
             generations=generations,
+            llm_output=result.llm_output,
             errors=[None] * len(generations),
             latencies=[end_time - start_time] * len(generations),
         )
@@ -307,6 +308,7 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
         end_time = time()
         return RefuelLLMResult(
             generations=generations,
+            llm_output=result.llm_output,
             errors=[None] * len(generations),
             latencies=[end_time - start_time] * len(generations),
         )
@@ -339,19 +341,35 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
-        encoding = self.tiktoken.encoding_for_model(self.model_name)
-        num_prompt_toks = len(encoding.encode(prompt))
-        if label:
-            num_label_toks = len(encoding.encode(label))
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
+        num_cached_toks = 0
+        if llm_output and "token_usage" in llm_output:
+            num_prompt_toks = llm_output["token_usage"]["prompt_tokens"]
+            num_label_toks = llm_output["token_usage"]["completion_tokens"]
+            num_cached_toks = (
+                llm_output["token_usage"]
+                .get("prompt_tokens_details", {})
+                .get("cached_tokens", 0)
+            )
+            num_prompt_toks -= num_cached_toks
         else:
-            # get an upper bound
-            num_label_toks = self.model_params["max_tokens"]
+            encoding = self.tiktoken.encoding_for_model(self.model_name)
+            num_prompt_toks = len(encoding.encode(prompt))
+            if label:
+                num_label_toks = len(encoding.encode(label))
+            else:
+                # get an upper bound
+                num_label_toks = self.model_params["max_tokens"]

         cost_per_prompt_token = self.COST_PER_PROMPT_TOKEN[self.model_name]
+        cost_per_cached_prompt_token = cost_per_prompt_token / 2.0
         cost_per_completion_token = self.COST_PER_COMPLETION_TOKEN[self.model_name]
-        return (num_prompt_toks * cost_per_prompt_token) + (
-            num_label_toks * cost_per_completion_token
+        return (
+            (num_prompt_toks * cost_per_prompt_token)
+            + (num_cached_toks * cost_per_cached_prompt_token)
+            + (num_label_toks * cost_per_completion_token)
         )

     def returns_token_probs(self) -> bool:
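A quick worked example of the new cost arithmetic. This is only a sketch: the dollar prices below are made-up placeholders rather than the real COST_PER_PROMPT_TOKEN / COST_PER_COMPLETION_TOKEN table values, and the 50% cached-token discount simply mirrors the cost_per_prompt_token / 2.0 hard-coded above.

# Illustrative cost calculation; prices are assumptions, not autolabel's real tables.
COST_PER_PROMPT_TOKEN = 2.5 / 1_000_000       # assume $2.50 per 1M prompt tokens
COST_PER_COMPLETION_TOKEN = 10.0 / 1_000_000  # assume $10.00 per 1M completion tokens

token_usage = {
    "prompt_tokens": 1200,
    "completion_tokens": 50,
    "prompt_tokens_details": {"cached_tokens": 1024},
}

num_cached_toks = token_usage.get("prompt_tokens_details", {}).get("cached_tokens", 0)
num_prompt_toks = token_usage["prompt_tokens"] - num_cached_toks  # uncached prompt tokens only
num_label_toks = token_usage["completion_tokens"]

cost = (
    (num_prompt_toks * COST_PER_PROMPT_TOKEN)
    + (num_cached_toks * (COST_PER_PROMPT_TOKEN / 2.0))  # cached tokens billed at half the prompt rate
    + (num_label_toks * COST_PER_COMPLETION_TOKEN)
)
print(f"{cost:.6f}")  # 0.002220 with these example numbers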
4 changes: 3 additions & 1 deletion src/autolabel/models/openai_vision.py
@@ -136,7 +136,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[time() - start_time] * len(generations),
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         encoding = self.tiktoken.encoding_for_model(self.model_name)
         num_prompt_toks = len(encoding.encode(prompt))
         if label:
4 changes: 3 additions & 1 deletion src/autolabel/models/refuelV2.py
@@ -279,7 +279,9 @@ def _prepare_output_schema(self, schema: Dict) -> Dict:
             curr_schema[key] = self._prepare_output_schema(curr_schema[key])
         return curr_schema

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         return 0

     def returns_token_probs(self) -> bool:
6 changes: 4 additions & 2 deletions src/autolabel/models/vllm.py
@@ -1,5 +1,5 @@
 import logging
-from typing import List, Optional
+from typing import Dict, List, Optional

 from autolabel.models import BaseModel
 from autolabel.configs import AutolabelConfig
@@ -115,7 +115,9 @@ def _process_confidence_request(self, logprobs):
             resp.append({curr_logprob_obj.decoded_token: curr_logprob_obj.logprob})
         return resp

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         return 0

     def returns_token_probs(self) -> bool:
3 changes: 3 additions & 0 deletions src/autolabel/schema.py
@@ -199,6 +199,9 @@ class RefuelLLMResult(BaseModel):

     generations: List[List[Union[Generation, ChatGeneration]]]

+    """Arbitrary LLM provider-specific output."""
+    llm_output: Optional[dict] = None
+
     """Errors encountered while running the labeling job"""
     errors: List[Optional[LabelingError]]

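For context on how the new field is consumed end to end, a minimal illustrative sketch (not part of this diff): a provider populates llm_output with an OpenAI-style token_usage dict, and BaseModel.label forwards it to get_cost. The Generation import path and the errors/latencies values are assumptions for the example only.

# Illustrative only; mirrors the shape openai.py produces in this PR.
from langchain.schema import Generation  # import path assumed; may vary by langchain version

from autolabel.schema import RefuelLLMResult

result = RefuelLLMResult(
    generations=[[Generation(text="positive")]],
    llm_output={
        "token_usage": {
            "prompt_tokens": 900,
            "completion_tokens": 3,
            "prompt_tokens_details": {"cached_tokens": 768},
        }
    },
    errors=[None],
    latencies=[0.42],
)

# BaseModel.label now computes per-prompt cost roughly as:
#   model.get_cost(prompt, label=result.generations[0][0].text, llm_output=result.llm_output)
# so providers that report cached_tokens automatically get the discounted price.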