diff --git a/scripts/generate_logprobs.sh b/scripts/generate_logprobs.sh index 770ca7ad..fc1f836a 100644 --- a/scripts/generate_logprobs.sh +++ b/scripts/generate_logprobs.sh @@ -4,7 +4,7 @@ BATCH_SIZE=80 # This worked well in my CPU, but 200 was too much DATASET_NAME="delphi-suite/tinystories-v2-clean-tokenized" USERNAME="transcendingvictor" # your Hugging Face username -TOKEN="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" # your Hugging Face API token +TOKEN="hf_aaaaaaaaaaaaaaaaaaaaaaaaaa" # your Hugging Face API token # List of models diff --git a/scripts/inference.py b/scripts/inference.py index bb0c346c..52076e3b 100644 --- a/scripts/inference.py +++ b/scripts/inference.py @@ -1,6 +1,7 @@ import argparse import os +import numpy as np import pandas as pd import torch from datasets import Dataset, load_dataset @@ -35,27 +36,21 @@ def main( model = AutoModelForCausalLM.from_pretrained(model_name) - logprobs_list = [] total_sequences = ( len(val_ds) if not funct_test else 320 ) # Use only 320 sequences if funct_test is True + logprobs = np.empty((total_sequences, 513)) + logprobs[:, 0] = float("nan") for i in tqdm(range(0, total_sequences, batch_size)): batch_end = min(i + batch_size, total_sequences) batch_sequences = [val_ds[j]["tokens"] for j in range(i, batch_end)] batch_sequences_tensor = torch.tensor(batch_sequences) - _, next_logprobs = get_all_and_next_logprobs(model, batch_sequences_tensor) - logprobs_list.append(next_logprobs) + logprobs_tensor = get_all_and_next_logprobs(model, batch_sequences_tensor)[1] + logprobs[i:batch_end, 1:] = logprobs_tensor.cpu().numpy() - accumulated_logprobs = torch.cat(logprobs_list, dim=0) - - nan_tensor = torch.full((accumulated_logprobs.size(0), 1), float("nan")) - extended_next_logprobs = torch.cat( - [nan_tensor, accumulated_logprobs], dim=1 - ) # 513 tokens - - df_dataset = pd.DataFrame({"logprobs": extended_next_logprobs.tolist()}) + df_dataset = pd.DataFrame({"logprobs": [row for row in logprobs]}) hf_dataset = Dataset.from_pandas(df_dataset) # change the repo_id to your hf username in generate_logprobs.sh diff --git a/tests/scripts/functional_test_generate_logprobs.sh b/tests/scripts/functional_test_generate_logprobs.sh index 9f95190c..95085645 100644 --- a/tests/scripts/functional_test_generate_logprobs.sh +++ b/tests/scripts/functional_test_generate_logprobs.sh @@ -5,7 +5,7 @@ BATCH_SIZE=80 DATASET_NAME="delphi-suite/tinystories-v2-clean-tokenized" USERNAME="transcendingvictor" # Your Hugging Face username -TOKEN="hf_aaaaaaaaaaaaaaaaaaaaaaaaaaaaa" # Your Hugging Face API token +TOKEN="hf_aaaaaaaaaaaaaaaaaaaaaaaaa" # Your Hugging Face API token # List of models declare -a MODEL_NAMES=("delphi-suite/delphi-llama2-100k"