Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

34 manual token labeling #40

Merged
merged 18 commits into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
339 changes: 332 additions & 7 deletions notebooks/token_labelling.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ chardet==5.2.0
sentencepiece==0.1.99
protobuf==4.25.2
plotly==5.18.0
spacy-transformers==1.3.4
spacy-transformers==1.3.4
pandas==1.3.4
86 changes: 42 additions & 44 deletions scripts/label_all_tokens.py → scripts/spacy_label_all_tokens.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import argparse
import pickle
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

from delphi.constants import STATIC_ASSETS_DIR
from delphi.eval import token_labelling
from delphi.eval import spacy_token_labelling


def tokenize(
Expand All @@ -26,82 +28,78 @@ def main():
# Setup argparse
parser = argparse.ArgumentParser(description="Tokenization and labeling utility.")
parser.add_argument(
"--model_name",
"--model-name",
type=str,
help="Name of the model to use for tokenization and labeling.",
default="delphi-suite/delphi-llama2-100k",
required=False,
)
parser.add_argument(
"--save-dir", type=str, help="Directory to save the results.", required=True
)
args = parser.parse_args()

# Access command-line arguments

# Directory to save the results
save_dir = Path(args.save_dir)
save_dir.mkdir(parents=True, exist_ok=True) # create directory if it does not exist
model_name = args.model_name

print("\n", " LABEL ALL TOKENS ".center(50, "="), "\n")
print(f"You chose the model: {model_name}\n")
print(
f"The language model will be loaded from Huggingface and its tokenizer used to do two things:\n\t1) Create a list of all tokens in the tokenizer's vocabulary.\n\t2) Label each token with its part of speech, dependency, and named entity recognition tags.\nThe respective results will be saved to files located at: '{STATIC_ASSETS_DIR}'\n"
f"The language model will be loaded from Huggingface and its tokenizer used to do two things:\n\t1) Create a list of all tokens in the tokenizer's vocabulary.\n\t2) Label each token with its part of speech, dependency, and named entity recognition tags.\nThe respective results will be saved to files located at: '{save_dir}'\n"
)

# ================ (1) =================
print("(1) Create a list of all tokens in the tokenizer's vocabulary ...")

# Load the tokenizer from Huggingface
tokenizer = AutoTokenizer.from_pretrained(model_name)
vocab_size = tokenizer.vocab_size
print("Loaded the tokenizer.\nThe vocab size is:", vocab_size)
print("Loaded the tokenizer.\nThe vocab size is:", tokenizer.vocab_size)

# Create a list of all tokens in the tokenizer's vocabulary
tokens_str = "" # will hold all tokens and their ids
for i in range(tokenizer.vocab_size):
tokens_str += f"{i},{decode(tokenizer, i)}\n"
(
tokens_str,
labelled_token_ids_dict,
) = spacy_token_labelling.label_tokens_from_tokenizer(tokenizer)

# Save the list of all tokens to a file
filename = "all_tokens_list.txt"
filepath = STATIC_ASSETS_DIR.joinpath(filename)
with open(f"{filepath}", "w", encoding="utf-8") as f:
filepath = save_dir / filename # TODO: use the static files of python module
with open(filepath, "w", encoding="utf-8") as f:
f.write(tokens_str)

print(f"Saved the list of all tokens to:\n\t{filepath}\n")

# ================ (2) =================
print("(2) Label each token ...")

# let's label each token
labelled_token_ids_dict: dict[int, dict[str, bool]] = {} # token_id: labels
max_token_id = tokenizer.vocab_size # stop at which token id, vocab size
# we iterate over all token_ids individually
for token_id in tqdm(range(0, max_token_id), desc="Labelling tokens"):
# decode the token_ids to get a list of tokens, a 'sentence'
tokens = decode(tokenizer, token_id) # list of tokens == sentence
# put the sentence into a list, to make it a batch of sentences
sentences = [tokens]
# label the batch of sentences
labels = token_labelling.label_batch_sentences(
sentences, tokenized=True, verbose=False
)
# create a dict with the token_ids and their labels
# update the labelled_token_ids_dict with the new dict
labelled_token_ids_dict[token_id] = labels[0][0]

# Save the labelled tokens to a file
filename = "labelled_token_ids_dict.pkl"
filepath = STATIC_ASSETS_DIR.joinpath(filename)
with open(f"{filepath}", "wb") as f:
pickle.dump(labelled_token_ids_dict, f)

print(f"Saved the labelled tokens to:\n\t{filepath}\n")

# sanity check that The pickled and the original dict are the same
print("Sanity check ...", end="")
# load pickle
with open(f"{filepath}", "rb") as f:
pickled = pickle.load(f)
# compare
assert labelled_token_ids_dict == pickled
print("\nCreating the CSV ...")

df = spacy_token_labelling.convert_label_dict_to_df(labelled_token_ids_dict)

print("Sanity check pandas csv ...", end="")
# Perform sanity check, that the table was created correctly
for row_index, row_values in df.iterrows():
token_id = row_values.iloc[0]
label_pandas = list(
row_values.iloc[1:]
) # we exclude the token_id from the colum
label_dict = list(labelled_token_ids_dict[token_id].values())[:]
assert (
label_pandas == label_dict
), f"The dataframes are not equal for row {token_id}\n{label_pandas}\n{label_dict}"
print(" completed.")

# TODO: Fix the issue with disappearing spaces when exporting DataFrame to CSV.
# There's a known problem where no token is classified as "starting with a space".

# save the dataframe to a csv
filename = "spacy_labelled_token_ids.csv"
filepath = save_dir / filename
df.to_csv(filepath, index=False)
print(f"Saved the labelled tokens as CSV to:\n\t{filepath}\n")

print(" END ".center(50, "="))


Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import pickle
from pathlib import Path
from typing import Callable, Optional

import pandas as pd
import spacy
from spacy.tokens import Doc, Token
from spacy.util import is_package
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

# make sure the english language model capabilities are installed by the equivalent of:
# python -m spacy download en_core_web_sm
Expand Down Expand Up @@ -208,3 +213,103 @@ def label_batch_sentences(
print("\n")

return labelled_sentences


def label_tokens_from_tokenizer(
    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
) -> tuple[str, dict[int, dict[str, bool]]]:
    """
    Label every token in a tokenizer's vocabulary.

    Each token id in ``range(tokenizer.vocab_size)`` is decoded exactly once
    (with ``skip_special_tokens=True``). The decoded text is then used both
    for the plain-text token listing and as the input to
    `label_batch_sentences`.

    Parameters
    ----------
    tokenizer : PreTrainedTokenizer | PreTrainedTokenizerFast
        The tokenizer whose vocabulary is to be labelled.

    Returns
    -------
    tokens_str : str
        One ``token_id,token_str`` entry per token id, each terminated by a
        newline, in ascending id order.
    labelled_token_ids_dict : dict[int, dict[str, bool]]
        For each token id (key), the label returned by
        `label_batch_sentences` for the first token of the decoded text.
    """
    # Decode each id once and reuse the text for both outputs, so the
    # vocabulary is not run through the tokenizer twice.
    decoded_tokens = [
        tokenizer.decode(token_id, skip_special_tokens=True)
        for token_id in range(tokenizer.vocab_size)
    ]

    # 1) Listing of all tokens and their ids, assembled in a single join
    #    rather than by repeated string concatenation.
    tokens_str = "".join(
        f"{token_id},{token}\n" for token_id, token in enumerate(decoded_tokens)
    )

    # 2) Label each token individually: every decoded token is wrapped in a
    #    one-element batch and the label of its first token is stored.
    # NOTE(review): `labels[0][0]` assumes at least one label is produced for
    # every decoded token (including tokens that decode to an empty string
    # once special tokens are skipped) - confirm against
    # `label_batch_sentences`.
    labelled_token_ids_dict: dict[int, dict[str, bool]] = {}
    for token_id, token in enumerate(tqdm(decoded_tokens, desc="Labelling tokens")):
        labels = label_batch_sentences([token], tokenized=True, verbose=False)
        labelled_token_ids_dict[token_id] = labels[0][0]

    return tokens_str, labelled_token_ids_dict


def import_token_labels(path: str | Path):
"""
Imports token labels from a *.csv file.

Parameters
----------
path : str | Path
The path to the file.

Returns
-------
dict[int, dict[str, bool]]
Returns the labelled tokens dict. Each token_id has its own dict having the labels.
"""
if isinstance(path, str):
path = Path(path)
# make sure the file_type is compatible
file_type = path.suffix
assert (
file_type == ".csv"
), f"Invalid file type. Allowed: csv, pkl. Got: {file_type}"
# make sure file exists
if not path.exists():
raise FileNotFoundError(f"There is no file under {path}")

df = pd.read_csv(str(path))
categories = list(df.columns[1:]) # excluding first column: token_id
loaded_label_dict: dict[int, dict[str, bool]] = {}
# go through each row and construct the dict
for _, row in df.iterrows():
token_id = int(row["token_id"])
labels = {cat: bool(row[cat] == 1) for cat in categories}
loaded_label_dict[token_id] = labels

return loaded_label_dict


def convert_label_dict_to_df(
    labelled_token_ids_dict: dict[int, dict[str, bool]]
) -> pd.DataFrame:
    """
    Turn a `labelled_token_ids_dict` into a Pandas DataFrame.

    The result has one row per token id: a leading ``token_id`` column
    followed by one column per label category, with every column cast to
    an integer dtype.
    """
    # one frame for the ids, one for the per-token label dicts
    id_frame = pd.DataFrame({"token_id": list(labelled_token_ids_dict.keys())})
    label_frame = pd.DataFrame(list(labelled_token_ids_dict.values()))

    # align both frames on their shared positional index
    combined = id_frame.join(label_frame)

    # booleans (and ids) are stored as integers
    return combined.astype(int)
Original file line number Diff line number Diff line change
@@ -1,9 +1,19 @@
import pickle
from pathlib import Path

import pytest
import spacy
from spacy.language import Language
from spacy.tokens import Doc
from transformers import AutoTokenizer

import delphi.eval.spacy_token_labelling as tl

import delphi.eval.token_labelling as tl
# skip all tests in this module
pytestmark = pytest.mark.skip(
"tests are slow and we're not using this module currently"
)

labelled_token_ids_dict: dict[int, dict[str, bool]] = {}


@pytest.fixture
Expand Down Expand Up @@ -112,3 +122,68 @@ def test_label_batch_sentences(dummy_doc):
# iterate through tokens in doc
for token, label in zip(doc, labels[0]):
assert label == tl.label_single_token(token)


def is_valid_structure(obj: dict[int, dict[str, bool]]) -> bool:
    """
    Report whether `obj` has the shape `dict[int, dict[str, bool]]`.

    Returns True when it does. On the first mismatch a short diagnostic is
    printed and False is returned.
    """
    if not isinstance(obj, dict):
        print(f"Main structure is not dict! Instead is type {type(obj)}")
        return False

    for token_id, labels in obj.items():
        outer_ok = isinstance(token_id, int) and isinstance(labels, dict)
        if not outer_ok:
            print(
                f"Main structure is dict, but its keys are either not int or its values are not dicts. Instead key is type {type(token_id)} and value is type {type(labels)}"
            )
            return False

        for category, flag in labels.items():
            if isinstance(category, str) and isinstance(flag, bool):
                continue
            print(
                f"The structure dict[int, dict[X, Y]] is True, but either X is not str or Y is not bool. Instead X is type {type(category)} and Y is type {type(flag)}"
            )
            return False

    return True


def test_label_tokens_from_tokenizer():
    """
    Simple test, checking that downloading the tokenizer and labelling all
    tokens in its vocabulary works.
    """
    global labelled_token_ids_dict

    # get a tokenizer
    tokenizer = AutoTokenizer.from_pretrained("delphi-suite/delphi-llama2-100k")
    expected_size = tokenizer.vocab_size

    tokens_str, labelled_token_ids_dict = tl.label_tokens_from_tokenizer(tokenizer)

    # count the number of lines in tokens_str
    newline_count = tokens_str.count("\n")
    assert newline_count == expected_size + 1  # + 1, because of token '\n'
    assert len(labelled_token_ids_dict) == expected_size
    assert is_valid_structure(labelled_token_ids_dict)


@pytest.mark.parametrize("path", [Path("temp/token_labels.csv")])
def test_import_token_labels(path: Path):
    """
    Simple test, checking that importing token labels works.

    Note: To stay with plain pytest and avoid extra dependencies (e.g.
    pytest-dependency), the `labelled_token_ids_dict` is rebuilt here in the
    same way as in `test_label_tokens_from_tokenizer`. The duplication is not
    ideal, but it is the best quick-and-dirty solution for now.
    """
    # rebuild the labelled_token_ids_dict
    tokenizer = AutoTokenizer.from_pretrained("delphi-suite/delphi-llama2-100k")
    labelled_token_ids_dict = tl.label_tokens_from_tokenizer(tokenizer)[1]

    # make sure the target directory exists, then write the CSV
    path.parent.mkdir(parents=True, exist_ok=True)
    tl.convert_label_dict_to_df(labelled_token_ids_dict).to_csv(path, index=False)

    # load the file with the function under test
    loaded_dict = tl.import_token_labels(path)

    # the round trip must reproduce the dict and keep its structure
    assert loaded_dict == labelled_token_ids_dict
    assert is_valid_structure(loaded_dict)
Loading