delphi-suite · jettjaniak · Mar 13, 2024 · Feb 18, 2024 · Feb 18, 2024 · Feb 18, 2024
diff --git a/notebooks/token_labelling.ipynb b/notebooks/token_labelling.ipynb
diff --git a/requirements.txt b/requirements.txt
@@ -15,4 +15,5 @@ chardet==5.2.0
 sentencepiece==0.1.99
 protobuf==4.25.2
 plotly==5.18.0
-spacy-transformers==1.3.4
+spacy-transformers==1.3.4
+pandas==1.3.4
diff --git a/scripts/label_all_tokens.py → scripts/spacy_label_all_tokens.py b/scripts/label_all_tokens.py → scripts/spacy_label_all_tokens.py
@@ -1,11 +1,13 @@
 import argparse
 import pickle
+from pathlib import Path
 
+import pandas as pd
 from tqdm.auto import tqdm
 from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
 
 from delphi.constants import STATIC_ASSETS_DIR
-from delphi.eval import token_labelling
+from delphi.eval import spacy_token_labelling
 
 
 def tokenize(
@@ -26,82 +28,78 @@ def main():
     # Setup argparse
     parser = argparse.ArgumentParser(description="Tokenization and labeling utility.")
     parser.add_argument(
-        "--model_name",
+        "--model-name",
         type=str,
         help="Name of the model to use for tokenization and labeling.",
         default="delphi-suite/delphi-llama2-100k",
         required=False,
     )
+    parser.add_argument(
+        "--save-dir", type=str, help="Directory to save the results.", required=True
+    )
     args = parser.parse_args()
 
     # Access command-line arguments
-
+    # Directory to save the results
+    save_dir = Path(args.save_dir)
+    save_dir.mkdir(parents=True, exist_ok=True)  # create directory if it does not exist
     model_name = args.model_name
 
     print("\n", " LABEL ALL TOKENS ".center(50, "="), "\n")
     print(f"You chose the model: {model_name}\n")
     print(
-        f"The language model will be loaded from Huggingface and its tokenizer used to do two things:\n\t1) Create a list of all tokens in the tokenizer's vocabulary.\n\t2) Label each token with its part of speech, dependency, and named entity recognition tags.\nThe respective results will be saved to files located at: '{STATIC_ASSETS_DIR}'\n"
+        f"The language model will be loaded from Huggingface and its tokenizer used to do two things:\n\t1) Create a list of all tokens in the tokenizer's vocabulary.\n\t2) Label each token with its part of speech, dependency, and named entity recognition tags.\nThe respective results will be saved to files located at: '{save_dir}'\n"
     )
 
     # ================ (1) =================
     print("(1) Create a list of all tokens in the tokenizer's vocabulary ...")
 
     # Load the tokenizer from Huggingface
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    vocab_size = tokenizer.vocab_size
-    print("Loaded the tokenizer.\nThe vocab size is:", vocab_size)
+    print("Loaded the tokenizer.\nThe vocab size is:", tokenizer.vocab_size)
 
-    # Create a list of all tokens in the tokenizer's vocabulary
-    tokens_str = ""  # will hold all tokens and their ids
-    for i in range(tokenizer.vocab_size):
-        tokens_str += f"{i},{decode(tokenizer, i)}\n"
+    (
+        tokens_str,
+        labelled_token_ids_dict,
+    ) = spacy_token_labelling.label_tokens_from_tokenizer(tokenizer)
 
     # Save the list of all tokens to a file
     filename = "all_tokens_list.txt"
-    filepath = STATIC_ASSETS_DIR.joinpath(filename)
-    with open(f"{filepath}", "w", encoding="utf-8") as f:
+    filepath = save_dir / filename  # TODO: use the static files of python module
+    with open(filepath, "w", encoding="utf-8") as f:
         f.write(tokens_str)
 
     print(f"Saved the list of all tokens to:\n\t{filepath}\n")
 
     # ================ (2) =================
     print("(2) Label each token ...")
 
-    # let's label each token
-    labelled_token_ids_dict: dict[int, dict[str, bool]] = {}  # token_id: labels
-    max_token_id = tokenizer.vocab_size  # stop at which token id, vocab size
-    # we iterate over all token_ids individually
-    for token_id in tqdm(range(0, max_token_id), desc="Labelling tokens"):
-        # decode the token_ids to get a list of tokens, a 'sentence'
-        tokens = decode(tokenizer, token_id)  # list of tokens == sentence
-        # put the sentence into a list, to make it a batch of sentences
-        sentences = [tokens]
-        # label the batch of sentences
-        labels = token_labelling.label_batch_sentences(
-            sentences, tokenized=True, verbose=False
-        )
-        # create a dict with the token_ids and their labels
-        # update the labelled_token_ids_dict with the new dict
-        labelled_token_ids_dict[token_id] = labels[0][0]
-
-    # Save the labelled tokens to a file
-    filename = "labelled_token_ids_dict.pkl"
-    filepath = STATIC_ASSETS_DIR.joinpath(filename)
-    with open(f"{filepath}", "wb") as f:
-        pickle.dump(labelled_token_ids_dict, f)
-
-    print(f"Saved the labelled tokens to:\n\t{filepath}\n")
-
-    # sanity check that The pickled and the original dict are the same
-    print("Sanity check ...", end="")
-    # load pickle
-    with open(f"{filepath}", "rb") as f:
-        pickled = pickle.load(f)
-    # compare
-    assert labelled_token_ids_dict == pickled
+    print("\nCreating the CSV ...")
+
+    df = spacy_token_labelling.convert_label_dict_to_df(labelled_token_ids_dict)
+
+    print("Sanity check pandas csv ...", end="")
+    # Perform sanity check, that the table was created correctly
+    for row_index, row_values in df.iterrows():
+        token_id = row_values.iloc[0]
+        label_pandas = list(
+            row_values.iloc[1:]
+        )  # we exclude the token_id column from the comparison
+        label_dict = list(labelled_token_ids_dict[token_id].values())[:]
+        assert (
+            label_pandas == label_dict
+        ), f"The dataframes are not equal for row {token_id}\n{label_pandas}\n{label_dict}"
     print(" completed.")
 
+    # TODO: Fix the issue with disappearing spaces when exporting DataFrame to CSV.
+    # There's a known problem where no token is classified as "starting with a space".
+
+    # save the dataframe to a csv
+    filename = "spacy_labelled_token_ids.csv"
+    filepath = save_dir / filename
+    df.to_csv(filepath, index=False)
+    print(f"Saved the labelled tokens as CSV to:\n\t{filepath}\n")
+
     print(" END ".center(50, "="))
 
 

diff --git a/src/delphi/eval/token_labelling.py → src/delphi/eval/spacy_token_labelling.py b/src/delphi/eval/token_labelling.py → src/delphi/eval/spacy_token_labelling.py
@@ -1,8 +1,13 @@
+import pickle
+from pathlib import Path
 from typing import Callable, Optional
 
+import pandas as pd
 import spacy
 from spacy.tokens import Doc, Token
 from spacy.util import is_package
+from tqdm.auto import tqdm
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
 # make sure the english language model capabilities are installed by the equivalent of:
 # python -m spacy download en_core_web_sm
@@ -208,3 +213,103 @@ def label_batch_sentences(
             print("\n")
 
     return labelled_sentences
+
+
+def label_tokens_from_tokenizer(
+    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
+) -> tuple[str, dict[int, dict[str, bool]]]:
+    """
+    Labels all tokens in a tokenizer's vocabulary with the corresponding token categories (POS, named entity, etc). Returns two things: 1) `tokens_str`, a string in which each token is written as 'token_id,token_str\n', and 2) `labelled_token_ids_dict`, a dict that maps each token_id (key) to its token labels, which are in turn a dict with the label categories as keys and their boolean values as values.
+
+    Parameters
+    ----------
+    tokenizer : The tokenizer with its tokens to be labelled.
+
+    Returns
+    -------
+    tokens_str, labelled_token_ids_dict
+
+    """
+
+    def decode(
+        tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
+        token_ids: int | list[int],
+    ) -> str:
+        return tokenizer.decode(token_ids, skip_special_tokens=True)
+
+    vocab_size = tokenizer.vocab_size
+
+    # 1) Create a list of all tokens in the tokenizer's vocabulary
+    tokens_str = ""  # will hold all tokens and their ids
+    for i in range(vocab_size):
+        tokens_str += f"{i},{decode(tokenizer, i)}\n"
+
+    # 2) let's label each token
+    labelled_token_ids_dict = {}  # token_id: labels
+    max_token_id = vocab_size  # exclusive upper bound of the token ids to label
+    # we iterate over all token_ids individually
+    for token_id in tqdm(range(0, max_token_id), desc="Labelling tokens"):
+        # decode the token_ids to get a list of tokens, a 'sentence'
+        token = decode(tokenizer, token_id)  # list of tokens == sentence
+        # put the sentence into a list, to make it a batch of sentences
+        sentences = [token]
+        # label the batch of sentences
+        labels = label_batch_sentences(sentences, tokenized=True, verbose=False)
+        # create a dict with the token_ids and their labels
+        # update the labelled_token_ids_dict with the new dict
+        label = labels[0][0]  # first sentence of batch, label of first token
+        labelled_token_ids_dict[token_id] = label
+
+    return tokens_str, labelled_token_ids_dict
+
+
+def import_token_labels(path: str | Path):
+    """
+    Imports token labels from a *.csv file.
+
+    Parameters
+    ----------
+    path : str | Path
+        The path to the file.
+
+    Returns
+    -------
+    dict[int, dict[str, bool]]
+        Returns the labelled tokens dict. Each token_id has its own dict having the labels.
+    """
+    if isinstance(path, str):
+        path = Path(path)
+    # make sure the file_type is compatible
+    file_type = path.suffix
+    assert (
+        file_type == ".csv"
+    ), f"Invalid file type. Allowed: csv, pkl. Got: {file_type}"
+    # make sure file exists
+    if not path.exists():
+        raise FileNotFoundError(f"There is no file under {path}")
+
+    df = pd.read_csv(str(path))
+    categories = list(df.columns[1:])  # excluding first column: token_id
+    loaded_label_dict: dict[int, dict[str, bool]] = {}
+    # go through each row and construct the dict
+    for _, row in df.iterrows():
+        token_id = int(row["token_id"])
+        labels = {cat: bool(row[cat] == 1) for cat in categories}
+        loaded_label_dict[token_id] = labels
+
+    return loaded_label_dict
+
+
+def convert_label_dict_to_df(
+    labelled_token_ids_dict: dict[int, dict[str, bool]]
+) -> pd.DataFrame:
+    """
+    Takes a `labelled_token_ids_dict` and converts it into a Pandas Dataframe.
+    """
+    df = pd.DataFrame(labelled_token_ids_dict.items(), columns=["token_id", "label"])
+    # split the label column into multiple columns
+    df = df.join(pd.DataFrame(df.pop("label").tolist()))
+    # Cast all columns to int (boolean labels become 0/1)
+    df = df.astype(int)
+
+    return df
diff --git a/tests/eval/test_token_labelling.py → tests/eval/test_spacy_token_labelling.py b/tests/eval/test_token_labelling.py → tests/eval/test_spacy_token_labelling.py
@@ -1,9 +1,19 @@
+import pickle
+from pathlib import Path
+
 import pytest
-import spacy
 from spacy.language import Language
 from spacy.tokens import Doc
+from transformers import AutoTokenizer
+
+import delphi.eval.spacy_token_labelling as tl
 
-import delphi.eval.token_labelling as tl
+# skip all tests in this module
+pytestmark = pytest.mark.skip(
+    "tests are slow and we're not using this module currently"
+)
+
+labelled_token_ids_dict: dict[int, dict[str, bool]] = {}
 
 
 @pytest.fixture
@@ -112,3 +122,68 @@ def test_label_batch_sentences(dummy_doc):
     # iterate through tokens in doc
     for token, label in zip(doc, labels[0]):
         assert label == tl.label_single_token(token)
+
+
+def is_valid_structure(obj: dict[int, dict[str, bool]]) -> bool:
+    """
+    Checks whether the obj fits the structure of `dict[int, dict[str, bool]]`. Returns True, if it fits, False otherwise.
+    """
+    if not isinstance(obj, dict):
+        print(f"Main structure is not dict! Instead is type {type(obj)}")
+        return False
+    for key, value in obj.items():
+        if not isinstance(key, int) or not isinstance(value, dict):
+            print(
+                f"Main structure is dict, but its keys are either not int or its values are not dicts. Instead key is type {type(key)} and value is type {type(value)}"
+            )
+            return False
+        for sub_key, sub_value in value.items():
+            if not isinstance(sub_key, str) or not isinstance(sub_value, bool):
+                print(
+                    f"The structure dict[int, dict[X, Y]] is True, but either X is not str or Y is not bool. Instead X is type {type(sub_key)} and Y is type {type(sub_value)}"
+                )
+                return False
+    return True
+
+
+def test_label_tokens_from_tokenizer():
+    """
+    Simple test checking that downloading the tokenizer and labelling all tokens in its vocabulary works.
+    """
+    global labelled_token_ids_dict
+    # get a tokenizer
+    model_name = "delphi-suite/delphi-llama2-100k"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    vocab_size = tokenizer.vocab_size
+
+    tokens_str, labelled_token_ids_dict = tl.label_tokens_from_tokenizer(tokenizer)
+    # count the number of lines in the token_str
+    assert tokens_str.count("\n") == (vocab_size + 1)  # + 1, because of token '\n'
+    assert len(labelled_token_ids_dict.keys()) == vocab_size
+    assert is_valid_structure(labelled_token_ids_dict) == True
+
+
+@pytest.mark.parametrize("path", [Path("temp/token_labels.csv")])
+def test_import_token_labels(path: Path):
+    """
+    Simple test, checking if the import of token labels works.
+
+    Note: Because we want to use pure pytest and not install any extra dependencies (e.g. pytest-dependency), we recreate the `labelled_token_ids_dict` in this test as we did in `test_label_tokens_from_tokenizer`. This duplication is not ideal, but it is the best quick-and-dirty solution for now.
+    """
+    # create the labelled_token_ids_dict
+    model_name = "delphi-suite/delphi-llama2-100k"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    _, labelled_token_ids_dict = tl.label_tokens_from_tokenizer(tokenizer)
+
+    # create the path
+    path.parent.mkdir(parents=True, exist_ok=True)
+    # save the file
+    df = tl.convert_label_dict_to_df(labelled_token_ids_dict)
+    df.to_csv(path, index=False)
+
+    # load the file with our function to be tested
+    loaded_dict = tl.import_token_labels(path)
+
+    # ensure the loaded dict equals the original and that its structure is correct
+    assert loaded_dict == labelled_token_ids_dict
+    assert is_valid_structure(loaded_dict) == True