diff --git a/scripts/label_all_tokens.py b/scripts/label_all_tokens.py index 01bf4cf1..6519eaca 100644 --- a/scripts/label_all_tokens.py +++ b/scripts/label_all_tokens.py @@ -1,10 +1,10 @@ import argparse import pickle -from pathlib import Path from tqdm.auto import tqdm from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast +from delphi.constants import STATIC_ASSETS_DIR from delphi.eval import token_labelling @@ -35,14 +35,13 @@ def main(): args = parser.parse_args() # Access command-line arguments - # Directory to save the results - SAVE_DIR = Path("src/delphi/eval/") + model_name = args.model_name print("\n", " LABEL ALL TOKENS ".center(50, "="), "\n") print(f"You chose the model: {model_name}\n") print( - f"The language model will be loaded from Huggingface and its tokenizer used to do two things:\n\t1) Create a list of all tokens in the tokenizer's vocabulary.\n\t2) Label each token with its part of speech, dependency, and named entity recognition tags.\nThe respective results will be saved to files located at: '{SAVE_DIR}'\n" + f"The language model will be loaded from Huggingface and its tokenizer used to do two things:\n\t1) Create a list of all tokens in the tokenizer's vocabulary.\n\t2) Label each token with its part of speech, dependency, and named entity recognition tags.\nThe respective results will be saved to files located at: '{STATIC_ASSETS_DIR}'\n" ) # ================ (1) ================= @@ -60,8 +59,8 @@ def main(): # Save the list of all tokens to a file filename = "all_tokens_list.txt" - filepath = SAVE_DIR / filename - with open(filepath, "w", encoding="utf-8") as f: + filepath = STATIC_ASSETS_DIR.joinpath(filename) + with open(f"{filepath}", "w", encoding="utf-8") as f: f.write(tokens_str) print(f"Saved the list of all tokens to:\n\t{filepath}\n") @@ -88,8 +87,8 @@ def main(): # Save the labelled tokens to a file filename = "labelled_token_ids_dict.pkl" - filepath = SAVE_DIR / filename - with open(filepath, "wb") as f: + filepath = STATIC_ASSETS_DIR.joinpath(filename) + with open(f"{filepath}", "wb") as f: pickle.dump(labelled_token_ids_dict, f) print(f"Saved the labelled tokens to:\n\t{filepath}\n") @@ -97,7 +96,7 @@ def main(): # sanity check that The pickled and the original dict are the same print("Sanity check ...", end="") # load pickle - with open(filepath, "rb") as f: + with open(f"{filepath}", "rb") as f: pickled = pickle.load(f) # compare assert labelled_token_ids_dict == pickled diff --git a/scripts/map_tokens.py b/scripts/map_tokens.py index 327f4651..2acea2da 100644 --- a/scripts/map_tokens.py +++ b/scripts/map_tokens.py @@ -3,6 +3,7 @@ import argparse import pickle +from delphi.constants import STATIC_ASSETS_DIR from delphi.eval.token_map import token_map from delphi.eval.utils import load_validation_dataset @@ -18,5 +19,5 @@ mapping = token_map(dataset) - with open(f"data/{args.output}", "wb") as f: + with open(f"{STATIC_ASSETS_DIR}/{args.output}", "wb") as f: pickle.dump(mapping, file=f) diff --git a/setup.py b/setup.py index 80059fc2..a4156702 100644 --- a/setup.py +++ b/setup.py @@ -5,4 +5,5 @@ version="0.1", packages=find_packages(where="src"), package_dir={"": "src"}, + package_data={"delphi.static": ["*"]}, ) diff --git a/src/delphi/constants.py b/src/delphi/constants.py new file mode 100644 index 00000000..5216566c --- /dev/null +++ b/src/delphi/constants.py @@ -0,0 +1,3 @@ +from importlib.resources import files + +STATIC_ASSETS_DIR = files("delphi.static") diff --git a/src/delphi/eval/labelled_token_ids_dict.pkl b/src/delphi/eval/labelled_token_ids_dict.pkl deleted file mode 100644 index 5fe96a39..00000000 Binary files a/src/delphi/eval/labelled_token_ids_dict.pkl and /dev/null differ diff --git a/src/delphi/static/__init__.py b/src/delphi/static/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/delphi/eval/all_tokens_list.txt b/src/delphi/static/all_tokens_list.txt similarity index 100% rename from src/delphi/eval/all_tokens_list.txt rename to src/delphi/static/all_tokens_list.txt diff --git a/src/delphi/static/labelled_token_ids_dict.pkl b/src/delphi/static/labelled_token_ids_dict.pkl new file mode 100644 index 00000000..e8442a01 Binary files /dev/null and b/src/delphi/static/labelled_token_ids_dict.pkl differ diff --git a/src/delphi/static/token_map.pkl b/src/delphi/static/token_map.pkl new file mode 100644 index 00000000..dec1a6a7 Binary files /dev/null and b/src/delphi/static/token_map.pkl differ