34 manual token labeling #40
Changes from 12 commits
@@ -1,10 +1,15 @@
import pickle
from pathlib import Path

import pytest
import spacy
from spacy.language import Language
from spacy.tokens import Doc
from transformers import AutoTokenizer

import delphi.eval.token_labelling as tl
Review comment: How did you rename the files? If you right-click and rename in VSCode, it should also rename all references.
labelled_token_ids_dict: dict[int, dict[str, bool]] = {}


@pytest.fixture
def dummy_doc() -> tuple[str, Doc, dict[str, bool]]:
@@ -112,3 +117,75 @@ def test_label_batch_sentences(dummy_doc):
    # iterate through tokens in doc
    for token, label in zip(doc, labels[0]):
        assert label == tl.label_single_token(token)

def is_valid_structure(obj: dict[int, dict[str, bool]]) -> bool:
    """
    Checks whether `obj` fits the structure `dict[int, dict[str, bool]]`.
    Returns True if it fits, False otherwise.
    """
    if not isinstance(obj, dict):
        print(f"Main structure is not dict! Instead is type {type(obj)}")
        return False
    for key, value in obj.items():
        if not isinstance(key, int) or not isinstance(value, dict):
            print(
                f"Main structure is dict, but its keys are either not int or its values are not dicts. "
                f"Instead key is type {type(key)} and value is type {type(value)}"
            )
            return False
        for sub_key, sub_value in value.items():
            if not isinstance(sub_key, str) or not isinstance(sub_value, bool):
                print(
                    f"The structure dict[int, dict[X, Y]] is True, but either X is not str or Y is not bool. "
                    f"Instead X is type {type(sub_key)} and Y is type {type(sub_value)}"
                )
                return False
    return True
Review comment (on lines +127 to +146): beartype is doing this automatically

Reply: Ha, nice. But weirdly enough I needed this to catch a bug, because beartype did not throw an error when the type was …
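As context for the thread above, here is a minimal sketch (not part of the PR) of how a single badly typed entry can slip past beartype's default call-time checks, which only spot-check a sample of container items rather than every entry; the function name and label key below are purely illustrative:

from beartype import beartype
from beartype.roar import BeartypeException


@beartype
def returns_labels() -> dict[int, dict[str, bool]]:
    # well-typed entries, except for one bad value buried in the middle
    labels = {i: {"Is Noun": True} for i in range(1000)}
    labels[500] = {"Is Noun": "yes"}  # str instead of bool
    return labels


try:
    returns_labels()
    print("beartype accepted the return value (only a sample of entries is checked)")
except BeartypeException:
    print("beartype rejected the return value")

An exhaustive walk over all keys and values, like `is_valid_structure` above, catches this case regardless of which entry beartype happens to sample.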


@pytest.mark.dependency()
def test_label_tokens_from_tokenizer():
    """
    Simple test checking that downloading the tokenizer and labelling all tokens
    in its vocabulary works.
    """
    global labelled_token_ids_dict
    # get a tokenizer
    model_name = "delphi-suite/delphi-llama2-100k"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    vocab_size = tokenizer.vocab_size

    tokens_str, labelled_token_ids_dict = tl.label_tokens_from_tokenizer(tokenizer)
    # count the number of lines in tokens_str
    assert tokens_str.count("\n") == (vocab_size + 1)  # + 1, because of token '\n'
    assert len(labelled_token_ids_dict.keys()) == vocab_size
    assert is_valid_structure(labelled_token_ids_dict) == True


@pytest.mark.dependency(depends=["test_label_tokens_from_tokenizer"])

Review comment: what does it do?

Reply: This decorator makes another test (the one this test depends on) run first, and its results can then be used in this test.

Reply: I see. I don't think this is good practice; I suggest you use pytest fixtures instead (built-in, no additional packages required): https://docs.pytest.org/en/6.2.x/fixture.html

Reply: In general I disagree about good practice. In bigger projects with large amounts of tests, you would not want to waste compute on tests that depend on other tests anyway. Here we do not have a large test set, so if you prefer I can go with fixtures and have the same code from …

@pytest.mark.parametrize(
    "path", [Path("temp/token_labels.csv"), Path("temp/token_labels.pkl")]
)
def test_import_token_labels(path: Path):
    global labelled_token_ids_dict
    assert (
        labelled_token_ids_dict is not None
    ), "It should be filled for the test to run. Check test-dependency."
    assert (
        labelled_token_ids_dict != {}
    ), "It should be filled for the test to run. Check test-dependency."
    # create the path
    path.parent.mkdir(parents=True, exist_ok=True)
    # save the file
    if path.suffix == ".pkl":
        with open(path, "wb") as file:
            pickle.dump(labelled_token_ids_dict, file)
    elif path.suffix == ".csv":
        df = tl.convert_label_dict_to_df(labelled_token_ids_dict)
        df.to_csv(path, index=False)
    else:
        raise ValueError("The file ending is incorrect.")

    # load the file with our function to be tested
    loaded_dict = tl.import_token_labels(path)

    # assure that the structure is correct
    assert loaded_dict == labelled_token_ids_dict
    assert is_valid_structure(loaded_dict) == True
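For reference, a minimal sketch (not part of this PR) of the fixture-based alternative discussed in the thread above: a module-scoped fixture replaces the shared global and the `pytest.mark.dependency` marker, and pytest's built-in `tmp_path` fixture replaces the hand-made temp/ directory. The fixture and test names below are illustrative only:

@pytest.fixture(scope="module")
def labelled_tokens() -> dict[int, dict[str, bool]]:
    # computed once per test module and reused by every test that requests it
    tokenizer = AutoTokenizer.from_pretrained("delphi-suite/delphi-llama2-100k")
    _, labelled = tl.label_tokens_from_tokenizer(tokenizer)
    return labelled


@pytest.mark.parametrize("suffix", [".csv", ".pkl"])
def test_import_token_labels_roundtrip(labelled_tokens, tmp_path, suffix):
    path = tmp_path / f"token_labels{suffix}"
    # save with the same helpers the original test uses
    if suffix == ".pkl":
        with open(path, "wb") as file:
            pickle.dump(labelled_tokens, file)
    else:
        tl.convert_label_dict_to_df(labelled_tokens).to_csv(path, index=False)
    # load with the function under test and compare
    assert tl.import_token_labels(path) == labelled_tokens

A side benefit is that `tmp_path` is cleaned up automatically, so no temp/ folder is left behind in the working directory.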
Review comment: this file should be renamed to spacy_* as well.