Commit
* Add truncation based on token count rather than word count
* Decouple system prompt formatting from document formatting
* Add a token-length check to the prompt formatter
* Update tests
Showing 10 changed files with 290 additions and 34 deletions.
New file (documents formatter):
@@ -0,0 +1,58 @@
import logging
from dataclasses import dataclass

import pandas as pd

from buster.tokenizers import Tokenizer

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


@dataclass
class DocumentsFormatter:
    tokenizer: Tokenizer
    max_tokens: int

    def format(
        self,
        matched_documents: pd.DataFrame,
    ) -> tuple[str, pd.DataFrame]:
        """Format our matched documents to plaintext.

        We also make sure they fit in the allotted max_tokens space.
        """
        documents_str = ""
        total_tokens = 0
        max_tokens = self.max_tokens

        num_total_docs = len(matched_documents)
        num_preserved_docs = 0
        for doc in matched_documents.content.to_list():
            num_preserved_docs += 1
            token_count, encoded = self.tokenizer.num_tokens(doc, return_encoded=True)
            if total_tokens + token_count <= max_tokens:
                # The whole document fits within the remaining budget.
                documents_str += f"<DOCUMENT> {doc} </DOCUMENT>"
                total_tokens += token_count
            else:
                # Budget exceeded: truncate this document at the token level and stop.
                logger.warning("truncating document to fit...")
                remaining_tokens = max_tokens - total_tokens
                truncated_doc = self.tokenizer.decode(encoded[:remaining_tokens])
                documents_str += f"<DOCUMENT> {truncated_doc} </DOCUMENT>"
                logger.warning(f"Documents after truncation: {documents_str}")
                break

        if num_preserved_docs < num_total_docs:
            logger.warning(
                f"{num_preserved_docs}/{num_total_docs} documents were preserved from the matched documents due to truncation."
            )
            # Drop the documents that were cut off entirely.
            matched_documents = matched_documents.iloc[:num_preserved_docs]

        return documents_str, matched_documents


def document_formatter_factory(tokenizer: Tokenizer, max_tokens: int) -> DocumentsFormatter:
    return DocumentsFormatter(
        tokenizer=tokenizer,
        max_tokens=max_tokens,
    )
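For context, here is a minimal usage sketch of the formatter above. The sample DataFrame is invented, and DocumentsFormatter is assumed importable (its module path is not shown in this diff); GPTTokenizer is the tokenizer added below in this same commit.

import pandas as pd

from buster.tokenizers import GPTTokenizer

# Hypothetical sample; DocumentsFormatter.format() expects a `content` column.
matched_documents = pd.DataFrame(
    {"content": ["First retrieved passage ...", "Second retrieved passage ..."]}
)

formatter = DocumentsFormatter(tokenizer=GPTTokenizer("gpt-3.5-turbo"), max_tokens=100)
documents_str, kept_documents = formatter.format(matched_documents)

print(documents_str)        # <DOCUMENT> ... </DOCUMENT> blocks, capped at 100 tokens total
print(len(kept_documents))  # rows that survived the token budget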
New file: buster/tokenizers/__init__.py
@@ -0,0 +1,13 @@
from .base import Tokenizer
from .gpt import GPTTokenizer


def tokenizer_factory(tokenizer_cfg: dict) -> Tokenizer:
    model_name = tokenizer_cfg["model_name"]
    if model_name in ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"]:
        return GPTTokenizer(model_name)

    raise ValueError(f"Tokenizer not implemented for {model_name=}")


# __all__ entries must be strings, not the objects themselves.
__all__ = ["Tokenizer", "GPTTokenizer", "tokenizer_factory"]
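A quick sketch of driving the factory from config (the dict shape follows tokenizer_factory above; nothing else is assumed):

from buster.tokenizers import tokenizer_factory

tokenizer = tokenizer_factory({"model_name": "gpt-3.5-turbo"})  # returns a GPTTokenizer

# Unsupported model names raise instead of silently falling back:
try:
    tokenizer_factory({"model_name": "not-a-real-model"})
except ValueError as err:
    print(err)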
New file: buster/tokenizers/base.py
@@ -0,0 +1,23 @@
from abc import ABC, abstractmethod
from typing import Union


class Tokenizer(ABC):
    """Abstract base class for a tokenizer."""

    def __init__(self, model_name: str):
        self.model_name = model_name

    @abstractmethod
    def encode(self, string: str) -> list[int]:
        ...

    @abstractmethod
    def decode(self, encoded: list[int]) -> str:
        ...

    def num_tokens(self, string: str, return_encoded: bool = False) -> Union[int, tuple[int, list[int]]]:
        """Count tokens in `string`; optionally also return the encoding."""
        encoded = self.encode(string)
        if return_encoded:
            return len(encoded), encoded
        return len(encoded)
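To illustrate the contract, here is a toy subclass (purely hypothetical, not part of the commit): implementations only supply encode/decode, and num_tokens comes for free from the base class.

from buster.tokenizers import Tokenizer


class WhitespaceTokenizer(Tokenizer):
    """Toy example: one token per whitespace-separated word."""

    def __init__(self):
        super().__init__(model_name="whitespace")
        self._vocab: dict[str, int] = {}  # word -> id
        self._words: list[str] = []       # id -> word

    def encode(self, string: str) -> list[int]:
        ids = []
        for word in string.split():
            if word not in self._vocab:
                self._vocab[word] = len(self._words)
                self._words.append(word)
            ids.append(self._vocab[word])
        return ids

    def decode(self, encoded: list[int]) -> str:
        return " ".join(self._words[i] for i in encoded)


tok = WhitespaceTokenizer()
assert tok.num_tokens("to be or not to be") == 6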
New file: buster/tokenizers/gpt.py
@@ -0,0 +1,17 @@
import tiktoken

from buster.tokenizers import Tokenizer


class GPTTokenizer(Tokenizer):
    """Tokenizer backed by OpenAI's tiktoken; supports most GPT models."""

    def __init__(self, model_name: str):
        super().__init__(model_name)
        self.encoder = tiktoken.encoding_for_model(model_name=model_name)

    def encode(self, string: str) -> list[int]:
        return self.encoder.encode(string)

    def decode(self, encoded: list[int]) -> str:
        return self.encoder.decode(encoded)
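Finally, a quick check of the count/truncate round trip that DocumentsFormatter relies on (requires tiktoken to be installed; the strings are arbitrary):

from buster.tokenizers import GPTTokenizer

tok = GPTTokenizer("gpt-3.5-turbo")
n, encoded = tok.num_tokens("How many tokens is this?", return_encoded=True)

assert n == len(encoded)
assert tok.decode(encoded) == "How many tokens is this?"

# Token-level truncation, as DocumentsFormatter does it:
print(tok.decode(encoded[:3]))  # text of the first 3 tokens only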