34 manual token labeling #40

Merged (18 commits, Mar 13, 2024)
Changes from 11 commits
339 changes: 332 additions & 7 deletions notebooks/token_labelling.ipynb

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion requirements.txt
@@ -5,6 +5,7 @@ tqdm==4.66.1
ipywidgets==8.1.1
nbformat==5.9.2
pytest==7.4.4
pytest-dependency==0.6.0
black==23.12.1
jaxtyping==0.2.25
beartype==0.16.4
@@ -15,4 +16,5 @@ chardet==5.2.0
sentencepiece==0.1.99
protobuf==4.25.2
plotly==5.18.0
spacy-transformers==1.3.4
spacy-transformers==1.3.4
pandas==1.3.4
111 changes: 68 additions & 43 deletions scripts/label_all_tokens.py
@@ -1,6 +1,7 @@
import argparse
import pickle

import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

@@ -26,17 +27,34 @@ def main():
# Setup argparse
parser = argparse.ArgumentParser(description="Tokenization and labeling utility.")
parser.add_argument(
"--model_name",
"--model-name",
type=str,
help="Name of the model to use for tokenization and labeling.",
default="delphi-suite/delphi-llama2-100k",
required=False,
)
parser.add_argument(
"--save-dir", type=str, help="Directory to save the results.", required=True
)
parser.add_argument(
"--output-format",
type=str,
help="Format to save the results in. Options: csv, pkl. Default: csv.",
default="csv",
required=False,
)
args = parser.parse_args()

# Access command-line arguments

# Directory to save the results
SAVE_DIR = Path(args.save_dir)
SAVE_DIR.mkdir(parents=True, exist_ok=True) # create directory if it does not exist
model_name = args.model_name
output_format = args.output_format
assert output_format in [
"csv",
"pkl",
], f"Invalid output format. Allowed: csv, pkl. Got: {output_format}"

print("\n", " LABEL ALL TOKENS ".center(50, "="), "\n")
print(f"You chose the model: {model_name}\n")
@@ -49,58 +67,65 @@ def main():

# Load the tokenizer from Huggingface
tokenizer = AutoTokenizer.from_pretrained(model_name)
vocab_size = tokenizer.vocab_size
print("Loaded the tokenizer.\nThe vocab size is:", vocab_size)
print("Loaded the tokenizer.\nThe vocab size is:", tokenizer.vocab_size)

# Create a list of all tokens in the tokenizer's vocabulary
tokens_str = "" # will hold all tokens and their ids
for i in range(tokenizer.vocab_size):
tokens_str += f"{i},{decode(tokenizer, i)}\n"
tokens_str, labelled_token_ids_dict = token_labelling.label_tokens_from_tokenizer(
tokenizer
)

# Save the list of all tokens to a file
filename = "all_tokens_list.txt"
filepath = STATIC_ASSETS_DIR.joinpath(filename)
with open(f"{filepath}", "w", encoding="utf-8") as f:
filepath = SAVE_DIR / filename # TODO: use the static files of python module
with open(filepath, "w", encoding="utf-8") as f:
f.write(tokens_str)

print(f"Saved the list of all tokens to:\n\t{filepath}\n")

# ================ (2) =================
print("(2) Label each token ...")

# let's label each token
labelled_token_ids_dict: dict[int, dict[str, bool]] = {} # token_id: labels
max_token_id = tokenizer.vocab_size # stop at which token id, vocab size
# we iterate over all token_ids individually
for token_id in tqdm(range(0, max_token_id), desc="Labelling tokens"):
# decode the token_ids to get a list of tokens, a 'sentence'
tokens = decode(tokenizer, token_id) # list of tokens == sentence
# put the sentence into a list, to make it a batch of sentences
sentences = [tokens]
# label the batch of sentences
labels = token_labelling.label_batch_sentences(
sentences, tokenized=True, verbose=False
)
# create a dict with the token_ids and their labels
# update the labelled_token_ids_dict with the new dict
labelled_token_ids_dict[token_id] = labels[0][0]

# Save the labelled tokens to a file
filename = "labelled_token_ids_dict.pkl"
filepath = STATIC_ASSETS_DIR.joinpath(filename)
with open(f"{filepath}", "wb") as f:
pickle.dump(labelled_token_ids_dict, f)

print(f"Saved the labelled tokens to:\n\t{filepath}\n")

# sanity check that the pickled and the original dict are the same
print("Sanity check ...", end="")
# load pickle
with open(f"{filepath}", "rb") as f:
pickled = pickle.load(f)
# compare
assert labelled_token_ids_dict == pickled
print(" completed.")
if output_format == "pkl":
# Save the labelled tokens to a file
filename = "labelled_token_ids.pkl"
filepath = SAVE_DIR / filename
with open(filepath, "wb") as f:
pickle.dump(labelled_token_ids_dict, f)

print(f"Saved the labelled tokens to:\n\t{filepath}\n")

# sanity check that the pickled and the original dict are the same
print("Sanity check ...", end="")
# load pickle
with open(filepath, "rb") as f:
pickled = pickle.load(f)
# compare
assert labelled_token_ids_dict == pickled
print(" completed.")

# ----------- CSV ------------------------
if output_format == "csv":
print("\nCreating the CSV ...")

df = token_labelling.convert_label_dict_to_df(labelled_token_ids_dict)

print("Sanity check pandas csv ...", end="")
# Perform sanity check, that the table was created correctly
for row_index, row_values in df.iterrows():
token_id = row_values.iloc[0]
label_pandas = list(
row_values.iloc[1:]
)  # we exclude the token_id column
label_dict = list(labelled_token_ids_dict[token_id].values())[:]
assert (
label_pandas == label_dict
), f"The dataframes are not equal for row {token_id}\n{label_pandas}\n{label_dict}"
print(" completed.")

# save the dataframe to a csv
filename = "labelled_token_ids.csv"
filepath = SAVE_DIR / filename
df.to_csv(filepath, index=False)
print(f"Saved the labelled tokens as CSV to:\n\t{filepath}\n")

print(" END ".center(50, "="))

113 changes: 113 additions & 0 deletions src/delphi/eval/token_labelling.py
@@ -1,8 +1,13 @@
import pickle
from pathlib import Path
from typing import Callable, Optional

import pandas as pd
import spacy
from spacy.tokens import Doc, Token
from spacy.util import is_package
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

# make sure the english language model capabilities are installed by the equivalent of:
# python -m spacy download en_core_web_sm
@@ -208,3 +213,111 @@ def label_batch_sentences(
print("\n")

return labelled_sentences


def label_tokens_from_tokenizer(
tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
) -> tuple[str, dict[int, dict[str, bool]]]:
"""
Labels all tokens in a tokenizer's vocabulary with the corresponding token categories (POS, named entity, etc.). Returns two things: 1) `tokens_str`, a string in which each line is 'token_id,token_str\n', and 2) `labelled_token_ids_dict`, a dict mapping each token_id (key) to its token labels, which are in turn a dict with the label categories as keys and booleans as values.

Parameters
----------
tokenizer : The tokenizer with its tokens to be labelled.

Returns
-------
tokens_str, labelled_token_ids_dict

"""

def decode(
tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
token_ids: int | list[int],
) -> str:
return tokenizer.decode(token_ids, skip_special_tokens=True)

vocab_size = tokenizer.vocab_size

# 1) Create a list of all tokens in the tokenizer's vocabulary
tokens_str = "" # will hold all tokens and their ids
for i in range(vocab_size):
tokens_str += f"{i},{decode(tokenizer, i)}\n"

# 2) let's label each token
labelled_token_ids_dict = {} # token_id: labels
max_token_id = vocab_size # stop at which token id, vocab size
# we iterate over all token_ids individually
for token_id in tqdm(range(0, max_token_id), desc="Labelling tokens"):
# decode the token_id to get the token string
token = decode(tokenizer, token_id)
# wrap the single token string in a list, making it a batch of one 'sentence'
sentences = [token]
# label the batch of sentences
labels = label_batch_sentences(sentences, tokenized=True, verbose=False)
# create a dict with the token_ids and their labels
# update the labelled_token_ids_dict with the new dict
label = labels[0][0] # first sentence of batch, label of first token
labelled_token_ids_dict[token_id] = label

return tokens_str, labelled_token_ids_dict


def import_token_labels(path: str | Path):
"""
Imports token labels from a file. May be a .pkl or a .csv

Parameters
----------
path : str | Path
The path to the file.

Returns
-------
dict[int, dict[str, bool]]
Returns the labelled tokens dict. Each token_id maps to its own dict of labels.
"""
global labelled_token_ids_dict
if isinstance(path, str):
path = Path(path)
# make sure the file_type is compatible
file_type = path.suffix
assert file_type in [
".csv",
".pkl",
], f"Invalid file type. Allowed: csv, pkl. Got: {file_type}"
# make sure file exists
if not path.exists():
raise FileNotFoundError(f"There is no file under {path}")
# load the file if CSV
if file_type == ".csv":
df = pd.read_csv(str(path))
categories = list(df.columns[1:]) # excluding first column: token_id
loaded_label_dict: dict[int, dict[str, bool]] = {}
# go through each row and construct the dict
for _, row in df.iterrows():
token_id = int(row["token_id"])
labels = {cat: bool(row[cat] == 1) for cat in categories}
loaded_label_dict[token_id] = labels

# load the file if a pickle
elif file_type == ".pkl":
with open(path, "rb") as f:
loaded_label_dict = pickle.load(f)

return loaded_label_dict


def convert_label_dict_to_df(
labelled_token_ids_dict: dict[int, dict[str, bool]]
) -> pd.DataFrame:
"""
Takes a `labelled_token_ids_dict` and converts it into a Pandas Dataframe.
"""
df = pd.DataFrame(labelled_token_ids_dict.items(), columns=["token_id", "label"])
# split the label column into multiple columns
df = df.join(pd.DataFrame(df.pop("label").tolist()))
# Change datatype of columns to int
df = df.astype(int)

return df
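
Taken together, the three new helpers form a small export/import pipeline for token labels. Below is a minimal usage sketch, not part of this PR; the model name matches the script's default and the CSV path is illustrative.

# Minimal usage sketch (not part of this PR). The model name matches the
# script's default; the CSV path is illustrative.
from pathlib import Path

from transformers import AutoTokenizer

from delphi.eval.token_labelling import (
    convert_label_dict_to_df,
    import_token_labels,
    label_tokens_from_tokenizer,
)

tokenizer = AutoTokenizer.from_pretrained("delphi-suite/delphi-llama2-100k")
tokens_str, labels = label_tokens_from_tokenizer(tokenizer)

out_path = Path("labelled_token_ids.csv")
convert_label_dict_to_df(labels).to_csv(out_path, index=False)

# The CSV round-trips back to the original dict, mirroring the PR's own test.
assert import_token_labels(out_path) == labels
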
79 changes: 78 additions & 1 deletion tests/eval/test_token_labelling.py
Contributor: this file should be renamed to spacy_* as well

@@ -1,10 +1,15 @@
import pickle
from pathlib import Path

import pytest
import spacy
from spacy.language import Language
from spacy.tokens import Doc
from transformers import AutoTokenizer

import delphi.eval.token_labelling as tl
Contributor: How did you rename the files? If you do right click and rename in VSCode, it should also rename all references.


labelled_token_ids_dict: dict[int, dict[str, bool]] = {}


@pytest.fixture
def dummy_doc() -> tuple[str, Doc, dict[str, bool]]:
Expand Down Expand Up @@ -112,3 +117,75 @@ def test_label_batch_sentences(dummy_doc):
# iterate through tokens in doc
for token, label in zip(doc, labels[0]):
assert label == tl.label_single_token(token)


def is_valid_structure(obj: dict[int, dict[str, bool]]) -> bool:
"""
Checks whether obj fits the structure of `dict[int, dict[str, bool]]`. Returns True if it fits, False otherwise.
"""
if not isinstance(obj, dict):
print(f"Main structure is not dict! Instead is type {type(obj)}")
return False
for key, value in obj.items():
if not isinstance(key, int) or not isinstance(value, dict):
print(
f"Main structure is dict, but its keys are either not int or its values are not dicts. Instead key is type {type(key)} and value is type {type(value)}"
)
return False
for sub_key, sub_value in value.items():
if not isinstance(sub_key, str) or not isinstance(sub_value, bool):
print(
f"The structure dict[int, dict[X, Y]] is True, but either X is not str or Y is not bool. Instead X is type {type(sub_key)} and Y is type {type(sub_value)}"
)
return False
return True
Comment on lines +127 to +146

Contributor: beartype is doing this automatically

Collaborator (Author): Ha, nice. But weirdly enough I needed this to catch a bug, because beartype did not throw an error when the type was dict[int, dict[str, np.array[bool]]].
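
To illustrate the point under discussion, a minimal sketch, not from this PR, assuming beartype's default O(1) strategy, which samples only one key/value pair per container at a decorated call boundary; an exhaustive walk such as is_valid_structure checks every entry instead.

# Hedged sketch, not from this PR: why a sampling-based runtime checker can
# miss nested element-type mismatches that an exhaustive check catches.
# Assumes beartype's default strategy of checking one arbitrary pair per dict.
import numpy as np
from beartype import beartype


@beartype
def consume_labels(labels: dict[int, dict[str, bool]]) -> int:
    return len(labels)


good = {0: {"Is Noun": True, "Is Verb": False}}
mixed = {0: {"Is Noun": True, "Is Verb": np.bool_(False)}}  # np.bool_ is not bool

consume_labels(good)  # always passes
# consume_labels(mixed)  # may pass or raise, depending on which pair beartype samples
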



@pytest.mark.dependency()
def test_label_tokens_from_tokenizer():
"""
Simple test, checking whether downloading the tokenizer and labelling all tokens in its vocabulary works.
"""
global labelled_token_ids_dict
# get a tokenizer
model_name = "delphi-suite/delphi-llama2-100k"
tokenizer = AutoTokenizer.from_pretrained(model_name)
vocab_size = tokenizer.vocab_size

tokens_str, labelled_token_ids_dict = tl.label_tokens_from_tokenizer(tokenizer)
# count the number of lines in the token_str
assert tokens_str.count("\n") == (vocab_size + 1) # + 1, because of token '\n'
assert len(labelled_token_ids_dict.keys()) == vocab_size
assert is_valid_structure(labelled_token_ids_dict) == True


@pytest.mark.dependency(depends=["test_label_tokens_from_tokenizer"])
Contributor: what does it do?

Collaborator (Author): This decorator has the test it depends on run first, whose results can then be used in the dependent test. So here, we first create a dict with the token labels in one test and use those results to test subsequent functions of our library.

Contributor: I see. I don't think this is good practice; I suggest you use pytest fixtures instead (built in, they don't require additional packages): https://docs.pytest.org/en/6.2.x/fixture.html

Collaborator (Author): In general I disagree about good practice. In bigger projects with large amounts of tests, you would not want to waste compute on tests that depend on other tests anyway. Here we do not have a large test set, so if you prefer I can go with fixtures and have the same code from test_label_tokens_from_tokenizer a second time.

@pytest.mark.parametrize(
"path", [Path("temp/token_labels.csv"), Path("temp/token_labels.pkl")]
)
def test_import_token_labels(path: Path):
global labelled_token_ids_dict
assert (
labelled_token_ids_dict is not None
), "It should be filled for the test to run. Check test-dependency."
assert (
labelled_token_ids_dict != {}
), "It should be filled for the test to run. Check test-dependency."
# create the path
path.parent.mkdir(parents=True, exist_ok=True)
# save the file
if path.suffix == ".pkl":
with open(path, "wb") as file:
pickle.dump(labelled_token_ids_dict, file)
elif path.suffix == ".csv":
df = tl.convert_label_dict_to_df(labelled_token_ids_dict)
df.to_csv(path, index=False)
else:
raise ValueError("The file ending is incorrect.")

# load the file with our function to be tested
loaded_dict = tl.import_token_labels(path)

# assure that the structure is correct
assert loaded_dict == labelled_token_ids_dict
assert is_valid_structure(loaded_dict) == True
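
For reference, a minimal sketch, not part of this PR, of the fixture-based alternative suggested in the thread above: a module-scoped fixture labels the vocabulary once per test module and is reused by every test that requests it, replacing both the module-level global and the pytest-dependency ordering. Fixture and test names are illustrative.

# Hedged sketch, not part of this PR: module-scoped fixture alternative to
# pytest-dependency. Fixture and test names are illustrative.
import pytest
from transformers import AutoTokenizer

import delphi.eval.token_labelling as tl


@pytest.fixture(scope="module")
def labelled_tokens() -> dict[int, dict[str, bool]]:
    # computed once per module, then cached by pytest and shared across tests
    tokenizer = AutoTokenizer.from_pretrained("delphi-suite/delphi-llama2-100k")
    _, labelled = tl.label_tokens_from_tokenizer(tokenizer)
    return labelled


def test_import_token_labels_csv(labelled_tokens, tmp_path):
    path = tmp_path / "token_labels.csv"
    tl.convert_label_dict_to_df(labelled_tokens).to_csv(path, index=False)
    assert tl.import_token_labels(path) == labelled_tokens
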