Modify get_cost to account for prompt caching #911

Open · wants to merge 2 commits into base: main
4 changes: 3 additions & 1 deletion src/autolabel/models/anthropic.py
@@ -105,7 +105,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         num_prompt_toks = len(self.tokenizer.encode(prompt).ids)
         if label:
             num_label_toks = len(self.tokenizer.encode(label).ids)
10 changes: 8 additions & 2 deletions src/autolabel/models/base.py
@@ -50,7 +50,11 @@ async def label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             new_results = self._label(missing_prompts, output_schema)
             for ind, prompt in enumerate(missing_prompts):
                 costs.append(
-                    self.get_cost(prompt, label=new_results.generations[ind][0].text)
+                    self.get_cost(
+                        prompt,
+                        label=new_results.generations[ind][0].text,
+                        llm_output=new_results.llm_output,
+                    )
                 )

             # Set the existing prompts to the new results
@@ -77,7 +81,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
         pass

     @abstractmethod
-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         pass

     def get_cached_prompts(self, prompts: List[str]) -> Optional[str]:
4 changes: 3 additions & 1 deletion src/autolabel/models/cohere.py
@@ -66,7 +66,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         num_prompt_toks = len(self.co.tokenize(prompt).tokens)
         if label:
             num_label_toks = len(self.co.tokenize(label).tokens)
4 changes: 3 additions & 1 deletion src/autolabel/models/google.py
@@ -151,7 +151,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         if self.model_name is None:
             return 0.0
         cost_per_prompt_token = self.COST_PER_PROMPT_TOKEN[self.model_name]
4 changes: 3 additions & 1 deletion src/autolabel/models/hf_pipeline.py
@@ -116,7 +116,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         # Model inference for this model is being run locally
         # Revisit this in the future when we support HF inference endpoints
         return 0.0
4 changes: 3 additions & 1 deletion src/autolabel/models/hf_pipeline_vision.py
@@ -107,7 +107,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             generations=generations, errors=[None] * len(generations)
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         # Model inference for this model is being run locally
         # Revisit this in the future when we support HF inference endpoints
         return 0.0
4 changes: 3 additions & 1 deletion src/autolabel/models/mistral.py
@@ -197,7 +197,9 @@ async def _alabel(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             generations=generations, errors=errors, latencies=latencies
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         cost_per_prompt_char = self.COST_PER_PROMPT_TOKEN[self.model_name]
         cost_per_completion_char = self.COST_PER_COMPLETION_TOKEN[self.model_name]
         return cost_per_prompt_char * len(prompt) + cost_per_completion_char * (
36 changes: 27 additions & 9 deletions src/autolabel/models/openai.py
@@ -235,6 +235,7 @@ async def _alabel(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
         end_time = time()
         return RefuelLLMResult(
             generations=generations,
+            llm_output=result.llm_output,
             errors=[None] * len(generations),
             latencies=[end_time - start_time] * len(generations),
         )
@@ -307,6 +308,7 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
         end_time = time()
         return RefuelLLMResult(
             generations=generations,
+            llm_output=result.llm_output,
             errors=[None] * len(generations),
             latencies=[end_time - start_time] * len(generations),
         )
@@ -339,19 +341,35 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[0 for _ in prompts],
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
-        encoding = self.tiktoken.encoding_for_model(self.model_name)
-        num_prompt_toks = len(encoding.encode(prompt))
-        if label:
-            num_label_toks = len(encoding.encode(label))
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
+        num_cached_toks = 0
+        if llm_output and "token_usage" in llm_output:
+            num_prompt_toks = llm_output["token_usage"]["prompt_tokens"]
+            num_label_toks = llm_output["token_usage"]["completion_tokens"]
+            num_cached_toks = (
+                llm_output["token_usage"]
+                .get("prompt_tokens_details", {})
+                .get("cached_tokens", 0)
+            )
+            num_prompt_toks -= num_cached_toks
         else:
-            # get an upper bound
-            num_label_toks = self.model_params["max_tokens"]
+            encoding = self.tiktoken.encoding_for_model(self.model_name)
+            num_prompt_toks = len(encoding.encode(prompt))
+            if label:
+                num_label_toks = len(encoding.encode(label))
+            else:
+                # get an upper bound
+                num_label_toks = self.model_params["max_tokens"]

         cost_per_prompt_token = self.COST_PER_PROMPT_TOKEN[self.model_name]
+        cost_per_cached_prompt_token = cost_per_prompt_token / 2.0
         cost_per_completion_token = self.COST_PER_COMPLETION_TOKEN[self.model_name]
-        return (num_prompt_toks * cost_per_prompt_token) + (
-            num_label_toks * cost_per_completion_token
+        return (
+            (num_prompt_toks * cost_per_prompt_token)
+            + (num_cached_toks * cost_per_cached_prompt_token)
+            + (num_label_toks * cost_per_completion_token)
         )

     def returns_token_probs(self) -> bool:
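A quick worked example of the new cost arithmetic. This is only a sketch: the dollar prices below are made-up placeholders rather than the real COST_PER_PROMPT_TOKEN / COST_PER_COMPLETION_TOKEN table values, and the 50% cached-token discount simply mirrors the cost_per_prompt_token / 2.0 hard-coded above.

# Illustrative cost calculation; prices are assumptions, not autolabel's real tables.
COST_PER_PROMPT_TOKEN = 2.5 / 1_000_000       # assume $2.50 per 1M prompt tokens
COST_PER_COMPLETION_TOKEN = 10.0 / 1_000_000  # assume $10.00 per 1M completion tokens

token_usage = {
    "prompt_tokens": 1200,
    "completion_tokens": 50,
    "prompt_tokens_details": {"cached_tokens": 1024},
}

num_cached_toks = token_usage.get("prompt_tokens_details", {}).get("cached_tokens", 0)
num_prompt_toks = token_usage["prompt_tokens"] - num_cached_toks  # uncached prompt tokens only
num_label_toks = token_usage["completion_tokens"]

cost = (
    (num_prompt_toks * COST_PER_PROMPT_TOKEN)
    + (num_cached_toks * (COST_PER_PROMPT_TOKEN / 2.0))  # cached tokens billed at half the prompt rate
    + (num_label_toks * COST_PER_COMPLETION_TOKEN)
)
print(f"{cost:.6f}")  # 0.002220 with these example numbers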
4 changes: 3 additions & 1 deletion src/autolabel/models/openai_vision.py
@@ -136,7 +136,9 @@ def _label(self, prompts: List[str], output_schema: Dict) -> RefuelLLMResult:
             latencies=[time() - start_time] * len(generations),
         )

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         encoding = self.tiktoken.encoding_for_model(self.model_name)
         num_prompt_toks = len(encoding.encode(prompt))
         if label:
4 changes: 3 additions & 1 deletion src/autolabel/models/refuelV2.py
@@ -279,7 +279,9 @@ def _prepare_output_schema(self, schema: Dict) -> Dict:
             curr_schema[key] = self._prepare_output_schema(curr_schema[key])
         return curr_schema

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         return 0

     def returns_token_probs(self) -> bool:
6 changes: 4 additions & 2 deletions src/autolabel/models/vllm.py
@@ -1,5 +1,5 @@
 import logging
-from typing import List, Optional
+from typing import Dict, List, Optional

 from autolabel.models import BaseModel
 from autolabel.configs import AutolabelConfig
@@ -115,7 +115,9 @@ def _process_confidence_request(self, logprobs):
             resp.append({curr_logprob_obj.decoded_token: curr_logprob_obj.logprob})
         return resp

-    def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
+    def get_cost(
+        self, prompt: str, label: Optional[str] = "", llm_output: Optional[Dict] = None
+    ) -> float:
         return 0

     def returns_token_probs(self) -> bool:
3 changes: 3 additions & 0 deletions src/autolabel/schema.py
@@ -199,6 +199,9 @@ class RefuelLLMResult(BaseModel):

     generations: List[List[Union[Generation, ChatGeneration]]]

+    """Arbitrary LLM provider-specific output."""
+    llm_output: Optional[dict] = None
+
     """Errors encountered while running the labeling job"""
     errors: List[Optional[LabelingError]]

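For context on how the new field is consumed end to end, a minimal illustrative sketch (not part of this diff): a provider populates llm_output with an OpenAI-style token_usage dict, and BaseModel.label forwards it to get_cost. The Generation import path and the errors/latencies values are assumptions for the example only.

# Illustrative only; mirrors the shape openai.py produces in this PR.
from langchain.schema import Generation  # import path assumed; may vary by langchain version

from autolabel.schema import RefuelLLMResult

result = RefuelLLMResult(
    generations=[[Generation(text="positive")]],
    llm_output={
        "token_usage": {
            "prompt_tokens": 900,
            "completion_tokens": 3,
            "prompt_tokens_details": {"cached_tokens": 768},
        }
    },
    errors=[None],
    latencies=[0.42],
)

# BaseModel.label now computes per-prompt cost roughly as:
#   model.get_cost(prompt, label=result.generations[0][0].text, llm_output=result.llm_output)
# so providers that report cached_tokens automatically get the discounted price.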