diff --git a/scripts/generate_logprobs.sh b/scripts/generate_logprobs.sh
index 770ca7ad..fc1f836a 100644
--- a/scripts/generate_logprobs.sh
+++ b/scripts/generate_logprobs.sh
@@ -4,7 +4,7 @@
 BATCH_SIZE=80 # This worked well in my CPU, but 200 was too much
 DATASET_NAME="delphi-suite/tinystories-v2-clean-tokenized"
 USERNAME="transcendingvictor" # your Hugging Face username
-TOKEN="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" # your Hugging Face API token
+TOKEN="hf_aaaaaaaaaaaaaaaaaaaaaaaaaa" # your Hugging Face API token
 
 # List of models
 
diff --git a/scripts/inference.py b/scripts/inference.py
index bb0c346c..52076e3b 100644
--- a/scripts/inference.py
+++ b/scripts/inference.py
@@ -1,6 +1,7 @@
 import argparse
 import os
 
+import numpy as np
 import pandas as pd
 import torch
 from datasets import Dataset, load_dataset
@@ -35,27 +36,21 @@ def main(
 
     model = AutoModelForCausalLM.from_pretrained(model_name)
 
-    logprobs_list = []
     total_sequences = (
         len(val_ds) if not funct_test else 320
     )  # Use only 320 sequences if funct_test is True
+    logprobs = np.empty((total_sequences, 513))
+    logprobs[:, 0] = float("nan")
     for i in tqdm(range(0, total_sequences, batch_size)):
         batch_end = min(i + batch_size, total_sequences)
         batch_sequences = [val_ds[j]["tokens"] for j in range(i, batch_end)]
         batch_sequences_tensor = torch.tensor(batch_sequences)
 
-        _, next_logprobs = get_all_and_next_logprobs(model, batch_sequences_tensor)
-        logprobs_list.append(next_logprobs)
+        logprobs_tensor = get_all_and_next_logprobs(model, batch_sequences_tensor)[1]
+        logprobs[i:batch_end, 1:] = logprobs_tensor.cpu().numpy()
 
-    accumulated_logprobs = torch.cat(logprobs_list, dim=0)
-
-    nan_tensor = torch.full((accumulated_logprobs.size(0), 1), float("nan"))
-    extended_next_logprobs = torch.cat(
-        [nan_tensor, accumulated_logprobs], dim=1
-    )  # 513 tokens
-
-    df_dataset = pd.DataFrame({"logprobs": extended_next_logprobs.tolist()})
+    df_dataset = pd.DataFrame({"logprobs": [row for row in logprobs]})
 
     hf_dataset = Dataset.from_pandas(df_dataset)
 
     # change the repo_id to your hf username in generate_logprobs.sh
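
[note] The scripts/inference.py change replaces grow-and-concatenate with a preallocated buffer: one (total_sequences, 513) array, NaN written into column 0 (the first token has no next-token logprob), and each batch copied into its row slice, so memory stays bounded and each batch leaves the GPU immediately instead of accumulating for a final torch.cat. A minimal sketch of the same pattern, with illustrative sizes and a zero tensor standing in for this repo's get_all_and_next_logprobs(model, batch)[1]:

import numpy as np
import torch

total_sequences = 320   # e.g. the funct_test subset size
seq_len = 513           # tokens per sequence in the tokenized dataset
batch_size = 80

logprobs = np.empty((total_sequences, seq_len))
logprobs[:, 0] = float("nan")  # position 0 has no preceding context to predict it

for i in range(0, total_sequences, batch_size):
    batch_end = min(i + batch_size, total_sequences)
    # stand-in for get_all_and_next_logprobs(model, batch)[1], which yields
    # one logprob per next-token position, i.e. seq_len - 1 columns
    next_logprobs = torch.zeros(batch_end - i, seq_len - 1)
    logprobs[i:batch_end, 1:] = next_logprobs.cpu().numpy()
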
diff --git a/scripts/label_all_tokens.py b/scripts/label_all_tokens.py
index 01bf4cf1..6519eaca 100644
--- a/scripts/label_all_tokens.py
+++ b/scripts/label_all_tokens.py
@@ -1,10 +1,10 @@
 import argparse
 import pickle
-from pathlib import Path
 
 from tqdm.auto import tqdm
 from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
 
+from delphi.constants import STATIC_ASSETS_DIR
 from delphi.eval import token_labelling
 
 
@@ -35,14 +35,13 @@ def main():
 
     args = parser.parse_args()  # Access command-line arguments
 
-    # Directory to save the results
-    SAVE_DIR = Path("src/delphi/eval/")
+
     model_name = args.model_name
 
     print("\n", " LABEL ALL TOKENS ".center(50, "="), "\n")
     print(f"You chose the model: {model_name}\n")
     print(
-        f"The language model will be loaded from Huggingface and its tokenizer used to do two things:\n\t1) Create a list of all tokens in the tokenizer's vocabulary.\n\t2) Label each token with its part of speech, dependency, and named entity recognition tags.\nThe respective results will be saved to files located at: '{SAVE_DIR}'\n"
+        f"The language model will be loaded from Huggingface and its tokenizer used to do two things:\n\t1) Create a list of all tokens in the tokenizer's vocabulary.\n\t2) Label each token with its part of speech, dependency, and named entity recognition tags.\nThe respective results will be saved to files located at: '{STATIC_ASSETS_DIR}'\n"
     )
 
     # ================ (1) =================
@@ -60,8 +59,8 @@ def main():
 
     # Save the list of all tokens to a file
     filename = "all_tokens_list.txt"
-    filepath = SAVE_DIR / filename
-    with open(filepath, "w", encoding="utf-8") as f:
+    filepath = STATIC_ASSETS_DIR.joinpath(filename)
+    with open(f"{filepath}", "w", encoding="utf-8") as f:
         f.write(tokens_str)
     print(f"Saved the list of all tokens to:\n\t{filepath}\n")
 
@@ -88,8 +87,8 @@ def main():
 
     # Save the labelled tokens to a file
     filename = "labelled_token_ids_dict.pkl"
-    filepath = SAVE_DIR / filename
-    with open(filepath, "wb") as f:
+    filepath = STATIC_ASSETS_DIR.joinpath(filename)
+    with open(f"{filepath}", "wb") as f:
         pickle.dump(labelled_token_ids_dict, f)
     print(f"Saved the labelled tokens to:\n\t{filepath}\n")
 
@@ -97,7 +96,7 @@ def main():
     # sanity check that The pickled and the original dict are the same
     print("Sanity check ...", end="")
     # load pickle
-    with open(filepath, "rb") as f:
+    with open(f"{filepath}", "rb") as f:
         pickled = pickle.load(f)
     # compare
     assert labelled_token_ids_dict == pickled
diff --git a/scripts/map_tokens.py b/scripts/map_tokens.py
index 1be48f97..a82a0faf 100644
--- a/scripts/map_tokens.py
+++ b/scripts/map_tokens.py
@@ -5,6 +5,7 @@
 import pandas as pd
 from datasets import Dataset
 
+from delphi.constants import STATIC_ASSETS_DIR
 from delphi.eval.token_map import token_map
 from delphi.eval.utils import load_validation_dataset
 
@@ -38,6 +39,7 @@
 
 hf_dataset = Dataset.from_dict({"prompt_pos_idx": list(complete_mapping.values())})
 
+
 repo_id = f"{args.username}/v0-token-map"  # location in to hf
 
 hf_dataset.push_to_hub(
diff --git a/setup.py b/setup.py
index 80059fc2..a4156702 100644
--- a/setup.py
+++ b/setup.py
@@ -5,4 +5,5 @@
     version="0.1",
     packages=find_packages(where="src"),
     package_dir={"": "src"},
+    package_data={"delphi.static": ["*"]},
 )
diff --git a/src/delphi/constants.py b/src/delphi/constants.py
new file mode 100644
index 00000000..5216566c
--- /dev/null
+++ b/src/delphi/constants.py
@@ -0,0 +1,3 @@
+from importlib.resources import files
+
+STATIC_ASSETS_DIR = files("delphi.static")
diff --git a/src/delphi/eval/labelled_token_ids_dict.pkl b/src/delphi/eval/labelled_token_ids_dict.pkl
deleted file mode 100644
index 5fe96a39..00000000
Binary files a/src/delphi/eval/labelled_token_ids_dict.pkl and /dev/null differ
diff --git a/src/delphi/static/__init__.py b/src/delphi/static/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/delphi/eval/all_tokens_list.txt b/src/delphi/static/all_tokens_list.txt
similarity index 100%
rename from src/delphi/eval/all_tokens_list.txt
rename to src/delphi/static/all_tokens_list.txt
diff --git a/src/delphi/static/labelled_token_ids_dict.pkl b/src/delphi/static/labelled_token_ids_dict.pkl
new file mode 100644
index 00000000..e8442a01
Binary files /dev/null and b/src/delphi/static/labelled_token_ids_dict.pkl differ
diff --git a/src/delphi/static/token_map.pkl b/src/delphi/static/token_map.pkl
new file mode 100644
index 00000000..dec1a6a7
Binary files /dev/null and b/src/delphi/static/token_map.pkl differ
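
[note] The packaging pieces work together: src/delphi/constants.py resolves the new delphi.static directory through importlib.resources, and the package_data entry in setup.py ships that directory's contents with the package, so the same lookup works from a source checkout or an installed wheel. A minimal consumer sketch, assuming delphi is installed and reading the shipped all_tokens_list.txt; it mirrors the joinpath / open(f"{filepath}") idiom used in label_all_tokens.py above:

from delphi.constants import STATIC_ASSETS_DIR

# STATIC_ASSETS_DIR is an importlib.resources Traversable rooted at delphi/static
tokens_path = STATIC_ASSETS_DIR.joinpath("all_tokens_list.txt")
with open(f"{tokens_path}", encoding="utf-8") as f:
    tokens_str = f.read()
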
+TOKEN="hf_aaaaaaaaaaaaaaaaaaaaaaaaa" # Your Hugging Face API token # List of models declare -a MODEL_NAMES=("delphi-suite/delphi-llama2-100k"