Skip to content

Commit

Permalink
prepend NaN values to logprobs and clean up
Browse files — browse the repository at this point in the history
  • Loading branch information
transcendingvictor committed Feb 17, 2024
1 parent 77033f8 commit 67bf0f2
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 128 deletions.
27 changes: 0 additions & 27 deletions scripts/generate_logprobs_delete.sh

This file was deleted.

25 changes: 22 additions & 3 deletions scripts/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from jaxtyping import Int
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM

Expand All @@ -12,15 +13,33 @@
torch.set_grad_enabled(False)


def main(model_name: str, batch_size: int, dataset_name: str, token: str) -> None:
    """
    Output the log probability of the next token for each position in every
    validation sequence, then upload the resulting dataset to Hugging Face.

    Args:
    - model_name: The name of the model to use for inference
    - batch_size: The batch size for processing. 80 worked well in CPU.
    - dataset_name: The name of the dataset from which validation set will be loaded
    - token: Hugging Face API token
    """
    val_ds = load_validation_dataset(dataset_name)

    # model accepts 2D tensors (batch_size, seq_len)
    val_sequences = torch.tensor([s["tokens"] for s in val_ds])
    model = AutoModelForCausalLM.from_pretrained(model_name)

    accumulated_logprobs = torch.tensor([], dtype=torch.float32)

    for i in tqdm(range(0, len(val_sequences), batch_size)):
        batch_sequences = val_sequences[i : i + batch_size]
        # BUG FIX: run inference on the current batch, not the whole
        # validation set — the original passed `val_sequences` here, which
        # recomputed (and accumulated) full-dataset logprobs every iteration.
        _, next_logprobs = get_all_and_next_logprobs(model, batch_sequences)
        accumulated_logprobs = torch.cat((accumulated_logprobs, next_logprobs), dim=0)

    # Position 0 of each sequence has no preceding prediction, so prepend a
    # NaN column to realign logprobs with tokens (seq_len columns total).
    nan_tensor = torch.full((accumulated_logprobs.size(0), 1), float("nan"))
    # BUG FIX: extend the ACCUMULATED logprobs. The original concatenated
    # `next_logprobs` (last batch only) against a NaN column sized by the
    # full accumulation — a shape mismatch for anything beyond one batch.
    extended_next_logprobs = torch.cat([nan_tensor, accumulated_logprobs], dim=1)  # 513 tokens

    df_dataset = pd.DataFrame({"logprobs": extended_next_logprobs.tolist()})
    hf_dataset = Dataset.from_pandas(df_dataset)

    # change the repo_id to your hf username
    # NOTE(review): the remainder of this function is truncated in the diff
    # view ("Expand Down"); presumably it pushes `hf_dataset` to the Hub
    # using `token` — confirm against the full file.
98 changes: 0 additions & 98 deletions scripts/inference_delete.py

This file was deleted.

0 comments on commit 67bf0f2

Please sign in to comment.