34 manual token labeling #40
Changes from 11 commits
this file should be renamed to spacy_* as well
How did you rename the files? If you right-click and rename in VSCode, it should also rename all references.
beartype is doing this automatically
Ha, nice. But weirdly enough I needed this to catch a bug, because beartype did not throw an error when the type was
`dict[int, dict[str, np.array[bool]]]`
What does it do?
This decorator makes another test (the one it depends on) run first, so that test's results can be reused later.
So here, we first build a dict with the token labels in one test, and then use those results to test the subsequent functions of our library.
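For reference, a minimal sketch of that pattern (assuming the `pytest-dependency` plugin; the labeling function and test names here are hypothetical stand-ins, not the actual code in this PR):

```python
import pytest

# Hypothetical stand-in for the real labeling logic in this PR.
def label_tokens_from_tokenizer(tokens):
    return {i: {"text": tok} for i, tok in enumerate(tokens)}

LABELS = {}  # shared between the two tests

@pytest.mark.dependency(name="labels")
def test_label_tokens_from_tokenizer():
    # First test: build the token-label dict and stash it.
    LABELS.update(label_tokens_from_tokenizer(["Hello", "world"]))
    assert LABELS

@pytest.mark.dependency(depends=["labels"])
def test_uses_labels():
    # Runs only after (and if) the labeling test passed.
    assert LABELS[0]["text"] == "Hello"
```

Note that without the `pytest-dependency` plugin installed, the marks are silently ignored and the tests just run in file order.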
I see. I don't think this is good practice; I suggest you use pytest fixtures instead (built in, no additional packages required): https://docs.pytest.org/en/6.2.x/fixture.html
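Something like this sketch (again with a hypothetical stand-in for the real labeling function): with `scope="module"`, the labels are computed once per test module and shared, so no compute is duplicated either.

```python
import pytest

# Hypothetical stand-in for the real labeling logic in this PR.
def label_tokens_from_tokenizer(tokens):
    return {i: {"text": tok} for i, tok in enumerate(tokens)}

@pytest.fixture(scope="module")
def token_labels():
    # Computed once per module, then reused by every test that requests it.
    return label_tokens_from_tokenizer(["Hello", "world"])

def test_first_token(token_labels):
    assert token_labels[0]["text"] == "Hello"

def test_label_count(token_labels):
    assert len(token_labels) == 2
```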
In general I disagree about good practice: in bigger projects with large test suites, you would not want to waste compute on tests that depend on other tests anyway.
Here we do not have a large test set, so if you prefer I can go with fixtures and run the same code from `test_label_tokens_from_tokenizer` a second time.