From bd4a88b3c1c92a0d89dfe68fbfaf2830a19009de Mon Sep 17 00:00:00 2001 From: Joshua We <80349780+joshuawe@users.noreply.github.com> Date: Sat, 17 Feb 2024 03:56:28 +0100 Subject: [PATCH] add spacy token labelling (#21) * add token labelling * add explanation function * add notebook * test * swtich off dependency labels + add spacy to requirements * small improvements * improve notebook explanation * fix errors * add notebook * test * swtich off dependency labels + add spacy to requirements * small improvements * improve notebook explanation * fix errors * complete UPOS tags for token labels * add tests * update requirements for delphi tokenizer * added token label script * add the files containing token information/labels * small enhancements suggested for PR * rebasing * improve optional downloading of spacy language model * bugfix: handle tokens empty string '' * add argparse for label_all_tokens.py script * add tokenized dicts * update notebook * undo __init__ * change spacy model from "trf" to "sm" * bug fix --- notebooks/token_labelling.ipynb | 435 ++++++++++++++++++++ requirements.txt | 6 +- scripts/label_all_tokens.py | 110 +++++ src/delphi/eval/all_tokens_list.txt | Bin 0 -> 45121 bytes src/delphi/eval/labelled_token_ids_dict.pkl | Bin 0 -> 274517 bytes src/delphi/eval/token_labelling.py | 210 ++++++++++ tests/eval/test_token_labelling.py | 114 +++++ 7 files changed, 874 insertions(+), 1 deletion(-) create mode 100644 notebooks/token_labelling.ipynb create mode 100644 scripts/label_all_tokens.py create mode 100644 src/delphi/eval/all_tokens_list.txt create mode 100644 src/delphi/eval/labelled_token_ids_dict.pkl create mode 100644 src/delphi/eval/token_labelling.py create mode 100644 tests/eval/test_token_labelling.py diff --git a/notebooks/token_labelling.ipynb b/notebooks/token_labelling.ipynb new file mode 100644 index 00000000..45423d8c --- /dev/null +++ b/notebooks/token_labelling.ipynb @@ -0,0 +1,435 @@ +{ + "cells": [ + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "# Giving tokens a label - How to categorize tokens\n", + "\n", + "\n", + "The first part of this Notebook contains elements that explain how to label tokens and how the functions work.\n", + "\n", + "The second part shows how all tokens are labelled that are used for our delphi language models.3\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "# autoreload\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from pprint import pprint \n", + "\n", + "import spacy\n", + "from tqdm.auto import tqdm\n", + "\n", + "import delphi\n", + "\n", + "from delphi.eval import token_labelling" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# 1) How to use the token labelling functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We analyze a simple sentence and receive the respective tokens with their analyzed attributes. \n", + "The grammatical/linguistic analysis is done by a model provided by spaCy for the English language." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Peter \t PROPN \t nsubj \t PERSON\n", + "is \t AUX \t ROOT \t \n", + "a \t DET \t det \t \n", + "person \t NOUN \t attr \t \n" + ] + } + ], + "source": [ + "# Load the english model\n", + "nlp = spacy.load(\"en_core_web_sm\")\n", + "\n", + "# Create a Doc object from a given text\n", + "doc = nlp(\"Peter is a person\")\n", + "\n", + "token = doc[0]\n", + "for tok in doc:\n", + " print(tok,\"\\t\", tok.pos_, \"\\t\", tok.dep_, \"\\t\", tok.ent_type_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's get the label for our custom token that we just printed." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'Capitalized': True,\n", + " 'Is Adjective': False,\n", + " 'Is Adposition': False,\n", + " 'Is Adverb': False,\n", + " 'Is Auxiliary': False,\n", + " 'Is Coordinating conjuction': False,\n", + " 'Is Determiner': False,\n", + " 'Is Interjunction': False,\n", + " 'Is Named Entity': True,\n", + " 'Is Noun': False,\n", + " 'Is Numeral': False,\n", + " 'Is Other': False,\n", + " 'Is Particle': False,\n", + " 'Is Pronoun': False,\n", + " 'Is Proper Noun': True,\n", + " 'Is Punctuation': False,\n", + " 'Is Subordinating conjuction': False,\n", + " 'Is Symbol': False,\n", + " 'Is Verb': False,\n", + " 'Starts with space': False}\n" + ] + } + ], + "source": [ + "from delphi.eval import token_labelling\n", + "\n", + "label = token_labelling.label_single_token(token)\n", + "pprint(label)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's get an understanding of what the labels acutally mean.\n", + "Use this function to receive an explanation for a single token." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------- Explanation of token labels --------\n", + "Token text: Peter\n", + "Token dependency: nominal subject\n", + "Token POS: proper noun\n", + "---------------- Token labels ---------------\n", + " 0 Starts with space False\n", + " 1 Capitalized True\n", + " 2 Is Adjective False\n", + " 3 Is Adposition False\n", + " 4 Is Adverb False\n", + " 5 Is Auxiliary False\n", + " 6 Is Coordinating conjuction False\n", + " 7 Is Determiner False\n", + " 8 Is Interjunction False\n", + " 9 Is Noun False\n", + " 10 Is Numeral False\n", + " 11 Is Particle False\n", + " 12 Is Pronoun False\n", + " 13 Is Proper Noun True\n", + " 14 Is Punctuation False\n", + " 15 Is Subordinating conjuction False\n", + " 16 Is Symbol False\n", + " 17 Is Verb False\n", + " 18 Is Other False\n", + " 19 Is Named Entity True\n" + ] + } + ], + "source": [ + "token_labelling.explain_token_labels(token)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you are interested in all the possible labels a token can have, that spaCy is capable of assigning, then call the same function but without any argument:\n", + "```Python\n", + ">>> token_labelling.explain_token_labels()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Batched token labelling\n", + "Next, let us analyze a batch of sentences and have them labelled.\n", + "> In the example below the input sentences are not yet tokenized, so spaCy uses its internal tokenizer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Token: Peter\n", + "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", + "False | True | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | True \n", + "---\n", + "Token: is\n", + "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", + "False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", + "---\n", + "Token: a\n", + "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", + "False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False \n", + "---\n", + "Token: person\n", + "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating 
conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", + "False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False \n", + "---\n", + "Token: .\n", + "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", + "False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False \n", + "---\n", + "\n", + "\n", + "5\n", + "[{'Starts with space': False, 'Capitalized': True, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': False, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': True, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': True}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': True, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': False, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': True, 'Is Interjunction': False, 'Is Noun': False, 
'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': True, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': False, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': True, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}]\n" + ] + } + ], + "source": [ + "sentences = [\n", + " \"Peter is a person.\"\n", + "]\n", + "labels = token_labelling.label_batch_sentences(sentences, tokenized=False, verbose=True)\n", + "\n", + "print(len(labels[0]))\n", + "print(labels[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now with our own tokenization. E.g. the one from our TinyStories models." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5\n", + "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': True, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': True, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" + ] + } + ], + "source": [ + "sentences = [\n", + " [\"This \", \"is \", \"a \", \"sentence\", \".\"]\n", + "]\n", + "labelled_sentences = token_labelling.label_batch_sentences(sentences, tokenized=True, verbose=False)\n", + "\n", + "print(len(labelled_sentences[0]))\n", + "print(labelled_sentences[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2) Labelling all tokens in the dataset\n", + "\n", + "Now we want to label all the tokens that our 
tokenizer knows - its entire vocabulary.\n", + "\n", + "Using thy script in `scripts/label_all_tokens.py` we get the files:\n", + "- `src\\delphi\\eval\\all_tokens_list.txt`\n", + "- `src\\delphi\\eval\\labelled_token_ids_dict.pkl`\n", + "\n", + "Let's load the tokenizer so that we can look at the labelled tokens.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\joshu\\anaconda3\\envs\\delphi2\\lib\\site-packages\\transformers\\utils\\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " _torch_pytree._register_pytree_node(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The vocab size is: 4096\n" + ] + } + ], + "source": [ + "# Get all the tokens of the tokenizer\n", + "from transformers import AutoTokenizer, PreTrainedTokenizer\n", + "\n", + "\n", + "# Decode a sentence\n", + "def decode(tokenizer: PreTrainedTokenizer, token_ids: list[int]) -> str:\n", + " return tokenizer.decode(token_ids, skip_special_tokens=True)\n", + "\n", + "model = \"delphi-suite/delphi-llama2-100k\"\n", + "tokenizer = AutoTokenizer.from_pretrained(model)\n", + "vocab_size = tokenizer.vocab_size\n", + "print(\"The vocab size is:\", vocab_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the pickle." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "path = \"../src/delphi/eval/labelled_token_ids_dict.pkl\"\n", + "# load \n", + "with open(path, \"rb\") as f:\n", + " labelled_token_ids_dict = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Look at some random tokens and their labels" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The token id is: 1143\n", + "The decoded token is: has\n", + "The label is:\n", + "{'Capitalized': False,\n", + " 'Is Adjective': False,\n", + " 'Is Adposition': False,\n", + " 'Is Adverb': False,\n", + " 'Is Auxiliary': False,\n", + " 'Is Coordinating conjuction': False,\n", + " 'Is Determiner': False,\n", + " 'Is Interjunction': True,\n", + " 'Is Named Entity': False,\n", + " 'Is Noun': False,\n", + " 'Is Numeral': False,\n", + " 'Is Other': False,\n", + " 'Is Particle': False,\n", + " 'Is Pronoun': False,\n", + " 'Is Proper Noun': False,\n", + " 'Is Punctuation': False,\n", + " 'Is Subordinating conjuction': False,\n", + " 'Is Symbol': False,\n", + " 'Is Verb': False,\n", + " 'Starts with space': False}\n" + ] + } + ], + "source": [ + "import random\n", + "from pprint import pprint\n", + "# Get a random token id between 0 and 4000\n", + "token_id = random.randint(0, 4000)\n", + "# decode the token id\n", + "decoded_token = decode(tokenizer, [token_id])\n", + "# get the corresponding label\n", + "label = labelled_token_ids_dict[token_id]\n", + "# print the results\n", + "print(\"The token id is:\", token_id)\n", + "print(\"The decoded token is:\", decoded_token)\n", + "print(\"The label is:\")\n", + "pprint(label)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv_tinyevals", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements.txt b/requirements.txt index 65b457a4..5fdc84c0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,8 @@ black==23.12.1 jaxtyping==0.2.25 beartype==0.16.4 pre-commit==3.6.0 -isort==5.13.2 \ No newline at end of file +isort==5.13.2 +spacy==3.7.2 +chardet==5.2.0 +sentencepiece==0.1.99 +protobuf==4.25.2 \ No newline at end of file diff --git a/scripts/label_all_tokens.py b/scripts/label_all_tokens.py new file mode 100644 index 00000000..01bf4cf1 --- /dev/null +++ b/scripts/label_all_tokens.py @@ -0,0 +1,110 @@ +import argparse +import pickle +from pathlib import Path + +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + +from delphi.eval import token_labelling + + +def tokenize( + tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, sample_txt: str +) -> int: + # supposedly this can be different than prepending the bos token id + return tokenizer.encode(tokenizer.bos_token + sample_txt, return_tensors="pt")[0] + + +# Decode a sentence +def decode( + tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, token_ids: int | list[int] +) -> str: + return tokenizer.decode(token_ids, skip_special_tokens=True) + + +def main(): + # Setup argparse + parser = argparse.ArgumentParser(description="Tokenization and labeling utility.") + parser.add_argument( + "--model_name", + type=str, + help="Name of the model to use for tokenization and labeling.", + default="delphi-suite/delphi-llama2-100k", + required=False, + ) + args = parser.parse_args() + + # Access command-line arguments + # Directory to save the results + SAVE_DIR = Path("src/delphi/eval/") + model_name = args.model_name + + print("\n", " LABEL ALL TOKENS ".center(50, 
"="), "\n") + print(f"You chose the model: {model_name}\n") + print( + f"The language model will be loaded from Huggingface and its tokenizer used to do two things:\n\t1) Create a list of all tokens in the tokenizer's vocabulary.\n\t2) Label each token with its part of speech, dependency, and named entity recognition tags.\nThe respective results will be saved to files located at: '{SAVE_DIR}'\n" + ) + + # ================ (1) ================= + print("(1) Create a list of all tokens in the tokenizer's vocabulary ...") + + # Load the tokenizer from Huggingface + tokenizer = AutoTokenizer.from_pretrained(model_name) + vocab_size = tokenizer.vocab_size + print("Loaded the tokenizer.\nThe vocab size is:", vocab_size) + + # Create a list of all tokens in the tokenizer's vocabulary + tokens_str = "" # will hold all tokens and their ids + for i in range(tokenizer.vocab_size): + tokens_str += f"{i},{decode(tokenizer, i)}\n" + + # Save the list of all tokens to a file + filename = "all_tokens_list.txt" + filepath = SAVE_DIR / filename + with open(filepath, "w", encoding="utf-8") as f: + f.write(tokens_str) + + print(f"Saved the list of all tokens to:\n\t{filepath}\n") + + # ================ (2) ================= + print("(2) Label each token ...") + + # let's label each token + labelled_token_ids_dict: dict[int, dict[str, bool]] = {} # token_id: labels + max_token_id = tokenizer.vocab_size # stop at which token id, vocab size + # we iterate over all token_ids individually + for token_id in tqdm(range(0, max_token_id), desc="Labelling tokens"): + # decode the token_ids to get a list of tokens, a 'sentence' + tokens = decode(tokenizer, token_id) # list of tokens == sentence + # put the sentence into a list, to make it a batch of sentences + sentences = [tokens] + # label the batch of sentences + labels = token_labelling.label_batch_sentences( + sentences, tokenized=True, verbose=False + ) + # create a dict with the token_ids and their labels + # update the 
labelled_token_ids_dict with the new dict + labelled_token_ids_dict[token_id] = labels[0][0] + + # Save the labelled tokens to a file + filename = "labelled_token_ids_dict.pkl" + filepath = SAVE_DIR / filename + with open(filepath, "wb") as f: + pickle.dump(labelled_token_ids_dict, f) + + print(f"Saved the labelled tokens to:\n\t{filepath}\n") + + # sanity check that The pickled and the original dict are the same + print("Sanity check ...", end="") + # load pickle + with open(filepath, "rb") as f: + pickled = pickle.load(f) + # compare + assert labelled_token_ids_dict == pickled + print(" completed.") + + print(" END ".center(50, "=")) + + +if __name__ == "__main__": + main() diff --git a/src/delphi/eval/all_tokens_list.txt b/src/delphi/eval/all_tokens_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..438dddae5a703d7c5bd4efc3b840708916eb9dc7 GIT binary patch literal 45121 zcmZ9V>3Snanw`BbvgLhWpBaxmUUJ$+Y>Aw)-Bz{KtyW7dO_ydg-CjWuBvAzdXn-iH zz1Je&6#&vh?@|k%2_65pmA9#EIaG?!ynR_?z%I?f%$@AI!Qx{^1An?oWL9 z!J_+na`p7E8OEn_gDG6>i!y^lkTtcIqm)ipR?|7@;UGR7N3jmZ}ZuAe}~Uy z_jmamy1&P>z3BcvpI6;K;B(UbLq4b7KjL%N{bN4oKK`Qn4L`z_w+ zMfXqnyz2fLpOfyl`J8tDoX=VJFZi5y|B}x|_pkWuJFYWZcE7{t(EWglc+vf9KCikL zd``M6KBwJ;&sjI+bKcGPTy%3j`)VB8cN%wnvPP>1@=dAm; ze9pUn$LFH^A)kHs@A+JIAMrVKFRAty-H-UZ>OSUk(tX0`wEHoiv+gH+&bv?fTy#I> zv+q9RbJ_ij&!PLA)_T!>!RJ-?b3P~CmwZmUulSsGzt88q`vsqi?iHVX_cfo(?hp7J zx<90mUv%H_dDZ<9pOfyFd``Px@j2`M1E2HmKk~Wg{u7^l_n-M(cK?OXq5H2i@{1SU z%FR`G?Pk)g-Aub1H?wZzX5QVpS#)=9`tIJ%vfH{Dy4P&zVsG4Bbvrkc?!nEp+q;={ z2RHNX=w{LV+D+e`+$_7Zo1uGTLtneNx$54!nRHKXrro!0X5BkC^X@x0i|)U<>AU~# zX4(A@H$(S7+0g6$mz%5Z|Nfuy@yhhmwr9LDy|wnxVOwFmGTpZA8Lv#|ZF|Nm(}mlf@yc}Mwr9LD-MRMA zqg!FTGX1*k8Lv$5ZhOWn)5qJM@yhh{+Cyh=h4IRC`L<`gG9AC|8Lv$DZ+pfo1A?|^ zyfRRzJ)of##w&w~wr9LD$Y^`UD}#@=XS^~fsXYLt6~-$Am$qlTGN5UD#w!Dzwr9LD z0BU>Ml_#JmdnVwh721_2V5#kCSDt{Uwx?Zr0;bxYcI64Usy!g96~-%ruC`~qG6-vX z#w&xewr9LDNNan>D}%P$1KwI;yfT<;d&Vn+yS8V%GT3W-#w&xr+5-YxVZ8E)!3hSS 
z{WM+~Gtl;oSH>E&J>!+J3AJa$;AF($1mn!+JA#KliWh_bCGhP{c0;f$t;5{Td0f6xA1e*fkO#mOYq*01w! z593ZSEt_n6>3D*5d4xMBpl+4lK^^2j!N$PB6A%Y6Pp~puh;{Aa!D>qy%KM&fNsYZz>pMs|C zK07AeAv>nshRaQ{Osj0?-HunCVw(=!b_$SmBs5O>I%spJU}(j|OfgTqByTqbLmRJ$ ziL#2OAn1`AonoZ6-VgkI>UA+wR_zq@Y>OyoDbP7#2B+Anx7O1XGJ>@`o$FQ$!h|bL%uG%H+G*Xc=2IC3(m$Ath};qr(oqiPl>%+XUDADc|6wY-d3D~ zl(UbIv3ipOmU;kFEY;_1z{qu~2aN1J42Z0;`%@6Y>-#8}sdsj^DGuwzWc$13jzLN3`ep-$@{U~>ChrpfIxQ_R$M=e+<6=t-&9dzu25C+`JZ z?pNLfxa`w!0L;#I0x=is5y(_c1T%RTP;&rHQ&3aq()5qMB(OPqRdHi2100{&`vRW! z-r#4G%YdJaH3@#sE4v5?`n;YO3_aebUjw2q_g)h>?jtu;#w9HQj2^_-6d>KEOM|4d zT@xhnJn8M&sNHj!diJRR)Yo+)9W1R4rhXt8aP+nh+-wG_)?8@@s`lbxhPUTa%rH@p z?U0#%+zbnKxY$fHfJGfkCo{0cY4)!f_`2q%Gpy9v3jwgFItYmU1~;7nu&34jF-HTk zYe6y76RZLrW!jaRnt`>qbsU~$4imlJ2DEKx@fm2lu07ywl|7T-e+JxEcGMYo`>E%G zw@0xs196AEHGsPS=^1d_-sI4MhqbL{Anral0C7*&41oJ`FEVD>DFHD9aOrh35XZ6f zof)9J2TYLEoE457xgT2Q?j={f{ zlxBc$YfqnHrKoTV@V`$30PR0B@OOePXW$R-dj|Y&q?8%x;}YrFGXR)70>Fz81pprp z?|-dMJp+a-Av^s%WcKITMBl`xwD!o$I;QB1ctXhA1GW&^D}VxG945czHtXA zyw~8&Kp}7ABY?w8wp^CLWQIjMOBgc{_>CJZTKWh0yH3LbzzdH&2Y=KoPcR38hiaxd z7`$J}kaJM@V9m`jZeMumIWXKEf{i&iT<02gxKExgV_;+al52Z!fJ zE%6*A9z^*Z7@n!?IVil3Rh?t)uG9K+FnDzVIB2ms*_EKhl6fVmE2=Rk98 zV&F+T<$VFD{51!h+g+WpFhwv2ow+6U@!XyaXx;^QbFg`S2omO?^9&Qr0q3=?3pig{ zfnXCtwc0@ESzydT=SsqxgU<6qyB+YP?Q;dH{+eF_=c}Veat=D5G&ytq%8=7s*V6h0 zpO0P>Y_0;kIc9QW9|f96Au|V>YpV!gKGc0d=2Pt59Bkg6v`%xt`FKCp2DUOU+qxqr zbF!$4O9LNF#FDMUKr9k##PXw|LHadu1Y3}9#c6G8#V>V9# zm`7UoNM zqJ_3I54pf}p6U9&+XB?S$?GhD?Vd_qfLSa6ZL$EhM;c%OW_O!B^a9jgWUmFVy*^Om z3w=}^l?9l6ik(;h+H~{OsKl`Vw))@;aC_F$EkNz9F92rY zyzC*7MORtiw`$fFK=uso78uYk>?I2TYtt>j>%{|r>+zV{6Ry4wSY7RXL9iODxd5z} zwXXwKFAu&eVBOiV7l3uEGr9n%pVvVE6?nR9V!CbtMjs163o+{Hda$cB3nuiumGl8y^#el zU0ug8EkNn98WJm7-9{UlOMucFDR%)%X$}ihpT)-mrYq@b0Zt#{a4f(n9o1cQak*>( zP>EQG}005!C&T(@<7hh)qKyO=#m~~U^w*a0; zD+6?1Z&C-qlce6l_%M{Z0G}V}04>1hU7iAbY77>@6OQqj!RO_aU6?&E7z4k+uu@HB zfffMu6OyQabn`7Q)q_zDq$LPoR~PC5seQ2rr008EvInG8qh3=FPJQq^Ru5E92RmU8 
zQ1M_bU|y-E9-P|3Jt)no)=h6Dso&HaO74x;dyq;GurTjlrz`eg^*a5n2dl^Ywlo&? zne=i`yloFwZ}cR3z^cLR0qaF>=|O62cn?&+sj~p;?aG?%0qWlC0MvbqQx8%Bc`nJd zzg<^<>;da3wE$ReH#)vOVEv`xN)K9hvC=(o{V3NHuLpyY9xHpcM#ZZ&172wsI&TkR zxBF^|9?*t4dQq#n?Sbv1M*-S%I|1B2SVzFt9T%)&C%q!!R;2i>u>l>={DeYwI6q4VCzg+A)Nu-ek7&#z;?gME^tfGXZrwLURccc zK=(111G-eUcL%y}(;a~BN4AK-&9Tr~QAH1OH(LK5Ojk0UHfdvJS=ZL&lx z<7L6^N@nQ+Zgoj;%RBoF7}kr%w+FP}`tsQqcSM7??LC6JBxx&*S& zn8!d1Z>Y^Bh<&tLmjL#hv+R;SC8qV348jt$u0umh;QBU(d)EZ`4vUaR{PfyizOfiIdJ3>6n~y>CW=EG zOHu6OrkjA|?E^l{64Se;AqIeqLlNc}0P|h*B?q8Pv-7M2P~LnyKEwc=FZ=I*r_~R3 ztN~C{dsfQ;tnZ%7)Zi}n9KiLVQN;kVcX^iqi+s%!4IulTR~`U1Ahm#1pe6@>W}12c z)>}>5fEm7v_cR#jn%f;fc0!v0U{n8jAdr2%GnyabI0G1&bpYPy$N1g@_^wVofNs2` z7=i(S-^!5#2;bts3;_HqH@aNf#sPHyoZdTtZhDda*Z{s)QuqMAFAv(00e~Mw_yEGU z415mY+vv$sALk;^1HcKk-3j;zO!A6AoIaD(;{e7FcBui3ztQO#0Qn_X2IGki-Di#q z3=aUDs7p_B0O4;8CkGI&eXvAe3Oa$(0L0hXLavhbpj}S9G63(-d_k~n-ek}$Kc3=a z48VJr8-Z>e%E7SrF}DHTub%tb0LCx+V1se5&eZ_O#g(4V0LoX!o&zW+AL>)aKKtB& zoRAZE4#50cGd+OwYYpN6%(-eB9iX4{1i^V-4W!e$ItK$#zdqzmLAz)g0Q=>v%0hWUpw)|bceEBzKfrjF)?a#`?x*~?x7fZrN9 z4d7j>&ps;6`V{QD1J8qEzE1Y7UJ<5R*1 zl+`H7$Gc=-_yS-a4;5UZe!v$@`i3I33{EIax*us%CFC8Z?SzFLV%4a5$cj$c8dxmZygV4q8lh(Z5e|n<>Ww&8;B<{KUEwT!53L z_sk(ZkxvSysZwxIL_$L)@}m1J*+hz0$;MMCwB~0bj7_^0oIsAQ&Z7QybuCySS>!yK zkv2_Nfk4F&PoT@GK2cB|Yc;cJZ$>52n8XG|rLxd({P{JElqVhtwU>8S|GG@C~ zxw-5^-59z!J>N~Aexs{{vku5iXra-;!m zc^@2s1xid$aY>++1Io*^2x+wL0c${o&o|apfHgUS^unefij(kjX`Y=V80uGe@G!Ox zyb%|h-^~UZhEoj0j2(~(k$|b)Kre$jz8b&)xO>@-GDDq39`rW{OQa@tW1CjxZ}UYj zR}Y#Z{p3(&!3&a?Hr32-`la>+Nm2$j(tp(_B@UzjeY}&{C<5zr36)Ao^s&#Of*?}C zl9>nljH;#L0)AZMQgd$p6C zR=qV0Vr0gTrVZ*xU7`9F0!hNq10WEBhSUTU@>&A8fN>W zuOffs1~?NB@znVpkIu1>Tmuf-JjCCo2^ffKtqF}>Z)Bh=yb99wDPb(DvhXZ$_G>2; zlKcoq;<#+#tx(7n4h9^N^9aq;MPQJVxd2)ijG6wWG$CvhgI7in4((H-7d9Taf<6e( z@{aHaPEh(F1aiAlvUUZ1&?|BuoQzM$GW3CAP4qz@x~eQEC3JF!IC_+Ek>6?3DRRT` zS-zCu?Ft6@CN%(me4HD=9yon^J)D9JFUIo!FYG)^n=1@8%f~4ZvE1izy64?tiavKkTKyJ#sUBMs+gMD@aFI-v*hN^jP zxFbPm9sv5N3Q2~I!xn~|Rhy8=+q|G0a;Y|eL%#Dx63y=A5L-2`2rKi 
zW2J~&Xv9ZJx@y3fz#ccv`%rnc*;my~O!(ap;?)^AA!8j)j5LyzK%2;IwSUyk(kl0+ryFwK8xUE)}F(pTbcQ{=^L-U&4FbrOse@lJ~fClE<9hZCseAt5yfNdNJQCUD8G4W%j&v}QnQ=hd8|i*PaL)4P`W3MV2Xg;BtJVfq|kRRYbL?~nBC@6>MRDiEAs+- z6H1unEKfqg7kR~aBC5XVpz-09emM%9x`$eBGDynVAfg`M$b}SkI!%}*KAl>yUD7EW z%P{pp0S0!0t`g-1MfkE_zyKzJ7CY^hMmiK3R?bCnOvr5{Vqq^}*xo*0L%UN+N`AsU zAvmLuF}4vtFigVZya623Oo9?JI3qsxM1m=vR7m#r&@cp}#24 zSasS>Ixz8VzcXQ?HhhcS^jfZKq(ZI4VIl-wk=<6R2G~ zwr;I{P4^_Mp@{AAP=1d>HX=j&gIc!qA;oO;TB}q&n6yjkt zFfFIOz`q}eO_QcVU?YlSXTa6rVbaua(M?RB zGQ4x^*f3G~4+`;Cd7d~qX1efZ8CQgDQq)%r6)VRho@%`f5>jH(I*2qhb`W?7zop(7 zXD80g6c(zdfrG9qHl0F3w+(=h5Eedt4GMb5IK|Zz0y1%3P>i4R&^owTmN?l{_(!Ev zMTimXt<%kv?46TXyHd6HGzR4=_nu<&)$Uza!_Huz^JBSUD)?U2<&^N58{`U-yS1r) zkCeGjs)TQit6()L*Xjy!XIt`a2I)XEI(X^}AL^F+Kliv> z*^AXm5@%*=MwL~IVP??IT3s+_I^LH%!Z_R3h*D13Th)e;s{<%kDh|(2;jJ08v#Az? zbs)slE1nP4n)ZNoDr{}yLrHrE-@H}~Ldn6!@TC&Ocvs2a5*#~vSp)2RaR{?|60}2J zFOijsL}b~Od7QyKW6)R40snki@sJ9|*K}R)2=|a>&$B^3l4BhI8Sc-~hf}P0NR=p7 zEJXx6&5V~!lP~yO>G}#5pUCW4DQXr?-`G$kT8t+>hkL%P>l%XC#8$G(MF-x9^)m|J zte)yc^LhCJ&*xB=gMBh+&C%YlR1JKYBigKYQO6j+B4;{+RA^Jphy$Jngn{5l-&{fC z#|^V{*oSnWy;)J?H!Z2Hu5n$Sp~^;bzVKL`tL~_ti(o74(0*tUWmO0M4^C#@6(UmF zm@-7vvf=fWL7=2~6g{pU%gtB$_?T8v_(<9z=vVr9c4V1Uh;C&H7?SvDb4che`HQ(J zU=$ts9THl#4y*+7q4J}1C`flWPk_^dyl9Lm=kU-WVTW4CbL4?n$|3QRaOG2vunH&B+w3hFp(h?ZAqsbh#tc7~ZHlocj z<@VR3;o!V%sdbf?_+GgTFyF)(Reb3r2pdBIW(!HwV7^W6MFr+wtsy_tZioGNOWd35 zGF8ej$@JzB!nh{T;dpo@%&HcX;lo&*1+;=0&fDPLWVB8_=1tru6`9vbD=5h{J72`4 zFjFa68hin%G>4^J0!5lsn-2iF>^6B6gQ{5gR5JAPlO({^Ykue0hk{MR%sdIyf_gbo zh_cOimC81oRa3ZmI>b3wx_PePUh(E<2Z5k^Q>j*JF@DXeMdb@v<*9-mWtf$S{@CI!q5}?3r+k2 zwjq`-|3!glgR2ZzC=$JG?$`p-xeL_boWw4zf5JO-hcvaJRf7%gjlNYuR`&*9 zfxAXAim45@RE-*3WxuXmoK#h#=Y7$m%F*xajtWPUOZ zCDo+RL3=F-=yd`MC8g{5>WWIo{-Cau{6?z7*lJf{UJw012ML{3my(#Tu_h&^O;r|X zD>QwGFWkdIMX#wAf2`ytO##!NdWvDg6(n~{(yykZ%xb9o9 zq7wBSVD~VQL5%K!YSd)>+H;%Lsb_+Xu4{Cohl;*v53Pg4!ldF*WrmfsHWACg7iUXh$ty901@3Bn)lK}Mj)*AllV~Cb)~vB zX|9|J9SQxY&G?C%w#ZAR>shi;>3SC!!bLn>s7IkIPO?8kLg(ivg@l+?Fro++IxAY{ 
zATvCu&;x8}$jG)i){0-@;pcNuI8k8*6P=(CT#Xf}g-w_u@fgL=*{ww_hRlDB;B=jvS`rZZKMq72c7%hk-9*|P_!o86@T)y;Br zqZGBXWK69I#k1&J%MVdOdtJFKCA4(rI>hc2A5smiiRIc2@1}Mjq}zDpO9-j7&Q}U) znUUljAfqIhl+rruD^CUw?TRea(_X*dZ_J*{LqnOF%>{V9bkJyO+xbp(gn4o(G{i7c z!Y+li8k?Y6ajle?&Zfe4&uE2@1Qor^D993*=rL($^|jj!a;vXJe-@LX#Fk!@b8x?^ zR$-vH*KHJ1r7gO{hjh@WZ6C{vTf##SeFA>fw(E%681aQ_OWUL5meIBhgDbi{S1L~3 zEj6BZgMdi6<*9Lqa0Id!`niq&v4noUex6~<5Hc;;JhdexM2M2qg*x0%9+JOSiAyXi zniS(2PgiBZLBK4#iGofMa;V8Q;>%6ppkL%c;2-34X?qyxJSLJA=aLff3NR2lfI8T~ zq|Rr&fTU*H-;gV@uKL`!?PBl`o@KBH`#3lxZ7b5n5o{iuLS6MH4I6L}1d^ko5(slH z;U5e@ZVv-php3h85<7?VqS?)>%0Q$I^DYn&iAAp^`8@Y?_$T9Gs&&y)%4^DX@8i`k z@qcKZbaWU8mTGb86239#oui#%RZf|@U8h|!f`8S0H4X@>+NJZA9z)?SoubAP)$OkK z?F@1*(ZzByHhOicU_{k!&~If8Fb-ZsjF-AyoZ7L@mF|*X4}Gt+6?Uy_*`*RohErWe_d`-%{A9@ZA-j6!|&==M`Wc zRU0WH<8o09cFyR|$vP5OR?`GZEufN?z?N&bHD*Qms_gfmq^hAU>?w!(iL-f ztMjV&&9VUeo@rKfXjNuVt>{-uZLbCO;LPWaa1TwC3W0h^Y8osl{w4WL*YXSW^P0SP zm01e`@I=APFZ7~QLJ5Vy7AtU24typ}o3jb5Nfox|hkpnh(tQW`=kr!IW_&^x1N8Id z&pZ$E`JzV6l?C56w?kzx@T%?x_mH&;*(wgcZPA$WV3Y=VUbtr;;#DB5`3jpW68^k? 
z=S8WPdQrINlvM2i^I#^UY#St>aksG%W32W3vd>NAS?Y!9qCPh650xl92iza!dF6bm z8U|!_47McWUvi?t;h)r~v&!L8sSHpMGSB>MYNYi+$jAJ6o*egQSB(Yv;GpDK#|)y) zQ9B%QR_&+%Tw+8A_=i;%o*N1Z`mD%M6|wKxE(!_VRM&xpwiyE*AR$r!%_CJu+`1n; zbZptT0Ty~FX{%ylnum2?)Dk<7nF2nZ8ZJuM<~Sb|v6IH=l;9pP7)OzBAW}6rC9Kv( zPNYcc-l%|*!&|!@FG$(>nTcyq3=vQ3>`-!?8!S+Q(HZg1vGC1S?}5G8tBiOi4bOKt z2Tz8^qfFwUJu0L_y_kn#w{aK{r^MTt9z&Nz(y7A3d_(4&)GI%@@`iagicoBfG)1zm z+DJ-SfCJ@5o(9(05UjA9tbG+QAdITEr^DB|)6+!zqz#EPLmBsjyoueKFGRKRtSI6^ zKDc}dMNC#CxfJ0Y)gzRk00QHr%J>GOSFM`<$&?9p*4%LfR!+;aPgBHMZBGH|C?8M5 znf+n7wjK{>2MgYg*KX~KeL1^%AC3m_jp!>gPbf0i@R;m2Wh*wl1>(V|C+mf_%h)zp zqRg$Cfs+@}=Q)5Cqc47ec%)!ThPQ`G%kvGVAfvMsO=)MqnKC?3WB-)(w_1OKd=7Ue zOOTO~VyowcdeHjRPF=f-Rye}JnCdk!k3)|+538lM;CBK8cuud5FNT?N6Gh}8Ej@xl zjW&%FO_8Zi@uDcgcKFBDy1G7CmGmdL$GX#!aTNLV5F174=coXUBGs#3o_f*!*`=ur zzmhLT3FkdGxt^II8qo8go|m~Sj?Z@oeO2~3nJ2FW?J#v*W6EA9c+~H@lKY!F-`v`% z38py6v0!J%u-X~-gTL}X5YOkEfE(U1NUr_@?T}~5o!}i*VtEyahumTw6xKOjOh0lt z!&xl>MOerAHEIj_M#s&JDkOb0S!vN%~x_>Bo%#3<;aAsMsNrSaV@p!lvJS*iKRSg%LK6-PoxABl&xN? 
zR7vvn#aT`i@e7)okuMpstaJ_E1Zg_~v{1kG1t!|3ub?>u2e~37gcFt?j7*O(%+Vxd zjj_?#@pok=~A7vp$W@Xo?8)X*RL1RE6 z)5>#Ci0D;DSlLTRvPVoyDdSBk^=N7HK#LDN= zoRT~vyrcXsPbBLg;K(o$=USb3?UWGt9bz~b^P!Vf)k8W^Z~6nAV_*~DF&;vY=xI<6 ziN2JamLeKdD(|*V38V-xw%-OX8bt}MMs(S#3|ymCJjZgBzCo15brgD+=B!yvo#!TV!xz`!tER#~epqGO@oP>5O2nTYoN$D$2XG#fX z#fQloQrRpZQUgaEqe#>_)2Ni0866n1_q|*(F9zpq4W&8A#8O5oDQla@38s8CWUnh} zN9=bKDCb8>?@{Eo3>el`AURqvxqB|jJ43Un2`HyfF@bQ>-b$JAvvVGVlR18QC@2Tn zk0vT|<`BDTW)jEI7qeT2+f;bMH~oP;AJl__$rGg>j!4mzGeb!1}&i2S`Ure%=kr zsW^iptjy5T1kPblBp2Y#s%c>yKoSQ;Ei+SD{T;^HI0{Xvb(+*4q=U)HZfcTgPAPJ) zhhsG_;nfWo&u%)Z()K!$`_c9CKDaq$QcK3%1a~ww!-%e^MhRO{K{RD^+NpJT=cuEW zBXlF{1t6YezBm-8H7nIo1d^Mm;whUGEVSX^VtaX zqts!r>wOsqrOYTcQrnJs6;n-MAJ&OT9YQ~!Rk8rxJz;vG2lg4^)*cED!cR$^K|*g#|v@qctXmVohIkQDQLz}So2v}QxP>KI;-tC zg?}*b)~Pb)kDHULj5&g*x`Nub^dObYHA8^I71xAC)iE>d6;D_pGdX#$2m2(ALrF1V zjkfT`$OkD-8CIptq@{Db1Je6=AS#&&p=viTSySgY*Ob*eC3CMK zD)q}4s>=N{=flxxRlrqKct6gdPxZh;3OOrWRm$v>)ivOvV{%no0MG7}M}vvpfaGwgAXqsZ&7 zC&~h8sU~RYO>u^cit2-w`=X|Kze&a$O9S;*f|;`874@F*Qf1xP3pdm_6J_Ir+y{4v z7Kqw}-FDsToKO;Z!PFa+v_6DXTygYlS_)fcjv|6R%2$MN=~7Tp((!2`$cXtSs>eBm zo@5lQu9*<9_BlqJ!BTdkPWM&Pm+*&NjgY+=I`GO$n(0$15fsE%QO;~^S5Q#W?6OH! 
z%c`1d3J<3yaF0)_s@b%7uG4p4#L}p19_^;WW{@2JCBo)JW*#mZ>GG*;PUbUZ!4l)D zvRQFqyEtBuDv*qla|)sReLMx_&32re3@6|!1(+(FNfx*pEoL_kLq!o@w~E9Uav(mQn5&d_#j3<>cIK)roF3>FNsr8jG;p3OWhwT;uR{&SL%FU{Rb{WA=Mt3U@W64TpG`) zY=TPXM-nQjFK9@vO*NVU%HSz39f|hQm2u6~avJL82+Ce&C0h=|8*@CC5>5lJTyI2F z=IChlW4kZO@8TTNz7bwhbcM4bUWLxPRS@k`s^Hl3;2|fNfcn4#E~;^{dIb3BCPOUM zO34K|DUb7(t>ZyQfZtt^k(HOnfs9DTW^yh+Oboc1lQn~p4i)0gAf%7d`{ARf=baiF z0;ck|6h7ZFP>>tIzr;u>l73=%sU8bfx^3M6QmT5=Ju)crV6#DoN>(X(#zj$Yzeki*`?kY)t-G*y>pUa(K_EBlE`N- z^?O;jG>?Mk400Eht9kyoWjYi+n+x=wF4lUZ42Y6szdD6Z)jhL1M2xhm=bze{T(VUu zW3`{_wvwRCer3SBcps{sK|w)H*)uCmTJ0|2nq`V9*?%PmmvCj#AWKk^llmz*3Rbs; zk2tPXh=OMfL*72==*ptFoGX?qJtzyhW`v5%;Hhj+QDyUmS7Z=CZg*kV2Z!qjON>Lu z%#rK6BH52cR_pAVi?)MW=d3J1DZ{8#b%i48omt(`Mvvfm%fNYiCMb$Y#DuGL1*Ffc zA(j`0p*~IK#HC*!%IQ$~+$P!Nc<4!yh}VUhPV|!$aaRtziu2VzgYVMcaTJ{xPqbn& zSA->t*BTw4=~d=la+H=1W0>cFp6=8hP3ExESG$^X$rpSU7AUVLgr9hcKR(m1Z(EO;UU(Ijce~}vNUiDfalQBW8~3H!F6<_YFXE)&eStv zCPR{eN}*S$T-!`hi>6dUf7$?&R8a=_xLoT!MGK0d9bU*g8?h;1p18CuV?~-o<f-{p?l-aLS_TJ2r*F zXtK9y45iVGfOvg{(aCBR!77i&G9@ZjAia$frb2pu6Zor;#_BusnRB^u+8_(jDg@4* zV54i8NlT|n`n-OwS(VbxyU8z#kTM&Vqo`)WwxTVi(zS>ZKj12*fs{$#MK{1s!lw;m zl3oQ`a=L2W7;0kKvFyf!K!P7%N3C@9UU^@rDM|VygV(HAOX-AoT6!yxc znaQaSpUY*&(4r8J3&;$W51Soh~|j5m}+u2g!R zaeYqCh^KcFp&{b zy(na~uRN)1c*^$qG_L~dJ@#+lymnRv*ER+h__PB04>P|*{KTj zvU5F34kk=&siOtVL{6fPVJ>R3j7q1;uH}}-Y*ig_(pNbWLUN70w6#g3JD$l4!%0~; zo>H)q@nzN02d8wg*Hli?u`BD+9a=JB0VicLXPOgMB3E2@qIM6il0$+H0ZnRTTj8;i zX%;WBZCz>iOf9CEEOym}nrTd1O?%)d*6z$ZshY+XL~O%u_>c@i-Gn^0V{#>C<~PMA zOxXAJlux5`4V2VRGatQ@vudboZ558eS7Lfzo{BiGrUZnXU5uAZlP#c-`m8E9Wu0%u z7wmRUMW{6@>RN4>6r`?bnm`pbqc%AhuZbbUTn=V>P0p#^7HT4B8Q%_Sx~?!oCG{Pt zr92zdbbhSSJVn$RkYaL$)c4h^T(9>~uM0I1AN!0jlPb*=rT^3n)YRr|L@mNmraf#- zm+oD+p?oFO%*!{rD2i7|4Qa#&a3$Y7QF|vNJ76r?Mox|YQDx`GWvA);mL6A6-KOeu zG&7G^mFZGX{dJxYrb6&Ri>J;ZEA7O1NmpDTtK?PTt8E!a71b#Ib2g0Cl8UbO`&6vt zioYIEJ*W9gHEiJ0zl7=a^e`5}(^MXmwazGzT58zjdxOzxs!8RfJ>f01JTk0`YV^{H zDb-X{-|>zVRbTewlR{dpGr?Fl?L_O^4qD6FawRO#zB(qHbr=05d1|VeASsE52&+du 
zF+X90&b5FmU!W}7O{!O3Z6~atU4b2}RRcV)R3pSHt!ccMX(!4Wg+ zav;3fyb(p#k=}ATL|p`kQ~5@1X*FcWBiKcl~y*0$V;0Ex61v|1T$P4{4cc z$YbhejW|+(4a;~gv~`yiQ7BTy>FtWIFMB(<+Us4jW@@jQ5RtvSXvs5{@Rc*S>mE=Q z>?Sk1_VFqzMv4;`#7=9COjVXz@inT5aD?LPHj=OSnp5)zW}w;#ysIG-K}q&5hm5VT z1Wz%Ow|HuaU*-RZ$eFOzx6LDPU1WxRlS`__k=3wsF5dE{xw0;h{CFryz!H+G0rJ!w zJoU19?h3J)(Oa(wP2qBtab21aKh*RH2H5CMhveI{WHQW=)`qE=GT#I-uAKZ`@`ft0 zfxEQs+R143inJAD+cq)$st#J__p9ZF4Zf>d*s>-sf3=^6*VH_YgQK8Ks{aDj#w9^8mU5H8> zIjPmCNEph{jG7oGMAyU04~Rz$#=ON zJ{XG#j(t=Ow@q1jGnZyoSX&3Wl*V>fqi2N5>IX(#;rENL)>~Ydv$GIIs4= z8FQdAdl_-d;G$~o^(Fqca&GV*Z_(A85rHLKRM5>piM>rlH^oH9158%KE(rw`bw6Z< zNseMvoM0*3>1xjT_fG5v=8li#La@v84PdiYx2?sYG}69RJuuoAIR!#PvlO3DNjGhl z8i3L;O$i=c|IwmM4QFW8rJw0VW$_1CZOiP9x+<*3T0k*( z3cCSr8En^phUvfeU4#`ctL|=#i142*8AixN5{e8LSLcV-fMa_^XpLd!7)-U@{Ld~K zL~S=_BCiakaX_orWV1#T!D>Kg6ci578hMVKz34uw#(~xFPRi~_<=rTKfYgrFK2X{z z4ytQJ6Lz;I#_ww5x7A*78k(l+a5`VOYPDtodIJ5;CTmpaeHWQ$tr%VD5ry8rY_A5N zy^bCS@qs)(RySW^a_b_|FJCd0m3g4-f0Jx+JiUq?F5gR4QKGpI$LdRbW0kYK0YzsF zJVy1(7b<;~ksiuIxCZJ_JqW6L{M0dfsD{er#}{G;xq{%5;xzeq3aJ8ohCXUvr zW~GQsg7>UfG6lw}7wwA~T;m50()%r02Vbz{n_4J%D3W6v z452!e0jV5(I2|GmpyFB8D>aV;_o{xyc{Aya42i+KQ{e)l)n`Hh!0#fIbw%}6?$l5($NdnhS)=}{Ze z<*gqM8Iz~XTRuc8OPOGx7O$pEP&+1K`KDISKCPm>{BDYP3eMW9%2QKr?gFJ%lII~R zR*??z1WSs_adHe+IY>7(SA@%)cpfw;D`z0UWCw>hGa>b%r2J{6L&;t$cpsHBN-i*U z7>|L?_-8X70+UhQ`aqQF)tMYX(fZ}Jb8+a5`8ic%iW~?ns>iU@C}$`0$th+`m_nT^ zRhqYT9+l-RMw1$c&>o);gV4k;X%fU`R5ZslsIwTOs~hI_FxjJi0mqnBT#_@|?T6dHEwld@4WBbEjEbI`w zr3gJ!1~^KymG;9qLt>G5l-+uR-%Gx+$D!(Y@RV^@$YQmQ4Snr!7suy;!W09AUariL z?f<_kL1k@3m}O5@4y7;PraAMfoyN12c~YN_gXs%FVZ`v#w`*xLYN7jv$`IQbE3Pp8 zAz>9ep{}G!DbmCD=}P9uN`b>(r>B`=phypEq^~sHY)>i#&cY5KQ{T8}NEtjD+ERK@ zZ4Gfb9FfEVKWVF>A1!WH)!u(wqpf^nK;fwvqEh_1D%^FmeSQ(>>u!8&*sHmToU2MI zom7cBtF|V`fQ*vFLNKpHoqv;8?vf()46JhiW-949iu5p-1f)dg-WfboBq2unrFJvZ zz$qQ<)jWI`H6!9o7$g}Bia6iWc}|75@}E`MO_GROD&Lc@_yf-ug}UV7?e`)W2BZi}M@= zvP*0W{BR`FV$7Qo3--I-OdbX&Y+Z0hBa6X6agfP?MHkd}dIH3*cI zxlimy?Jc76ruD<=s%rz&wG&OA4l2z}OSE8(;_x9ZrLcVlXpgWzpyvQBnhKwJWb){Bqo 
z9ksLu9-UZ?O$Qh0%RDB%zl`OoD2UJC9mM% zq}lxX0!PPAMcQ888}T{=*v*L~n68oS@kwDXWV7Y+DsM-V!}}2Ek*rP~5pg+Ov~DXE z@{=v?7(8y*hRD0E+Az=Z<+)+}C@DGN$m z!MosAPopQ6u?Y_KKcrVp$!W(7jG}JmKL^Bd;RsGmi@;@M-OYCKyI4LEz2plC!9tI! z_Z>`|y>W9h#KZBBs6$6dtnpR9)6@(XN_wieudu89UB4ub>l8`@cBwchjW|qC)P=km zJ!&-#pVgWj>~|#EDG?NTXHj20OBsA5_52VaF#?n5@zLxXOq$lEh&#gL=XD`6D!18W zHSiLCVgc9lCi|PaKxEBUxu7?PG@(4#Qg9WzRs~PYR-%7?$lOs=epDAkUM%F&-i!$5 zF`zOAkTFH33pvQ4+4?~y8=KBZGS)NsIS!-49louq(k0`a>h{Kw4KHQ(6~%2I; zW|lTX?|j3O?Hq}tI05)c>`~T?W#J0SCPGPR-P9BXc9^;%Aj?T8$M>zPLTy-#gp11K z&qtMW9Im>Aip;CQYz*Y&-JrI&hv(Xd*^rFo31K#tpAxKrA9{UO=m(eYuSA}ujM{5MYwLH>Ino(j=^`f2)xW9qgbh=8&4uRqJpzxgv?_Uf zZv2iZyrv|oY8hTjKDqTbow8jfmU00+Q1qo-OSL?aymjRQ;J9rPTfX6~;87Pql+5EA zR*;}#e$_%rCG!qjl#8Qi{<1UA7ILe9`on?|Q(S%iNDWAtH z2y;b0pSHlEPwT0om!gbvDjPC`>b$}D$dH}hwoUDF;Y=}2y}@;*u_^N>3hsjZn zpvtZ4>N!5u0n3pSD(kDX-hZ)d_L_eaKq`TM8l57y)>E$)xcCXn0?ePkVo_@$9DaiF z_!pfy3zBc^EU-ULdCXQQQ{-;R&f(>`ROab$j+2^9!MTcfTUi{krC(Zh6bXTY0okp$X^a#tE>lqJT)Tj0vQzC_SS*`BAqhhoLQ1waU| zjJ@i6RKn@Ou$|qeA}7`NkJV=(yT|(=8lJ0lX)s+8q}02@atPe>-z4|}ime=sOOEUC z75BR8cw34Tb4Ly=yU5~U^9DLW?!t9Kuz6`&j<;($9rgG42hV^|hVL>@u}@ieJlY99t&3& zBIlAM8U??~e2#f;1=k}d^r@eCbynA^vWwNJujUM+QW{)#Q=Lx+y^6dxgk*y#&#!4Y zVpw@TLbNJEu$Yci2djcZS!1cB&gzV=3LZk*-b7e}00^a=3moFdI*yw%@2J;6k*t zD!8%c<%y zB1Ju$4h@mf{oh}^?|z*3lga##00|0Qw45=>v^+$1tlXDNmV>rN2P*c_qC~`rCyFsC zqpEDbUI-#P-zWZd+445Nk%h}o56@K&kKsS3N5EqKLppp}`gWF*p~&0PpIci&VkpW| z!LZnOV^@I3_;+ThPzJazlqB=(f;L)boM7Bt6k@#(UjxeI7f;(iGh)a&!?C7NYFKElKIzjITZExi@Iiie}Az>74aJfS66n?^IE67e&Yd* zE9wiGM3&pgIix%i6{zB8a5Vey&9jaK!Hz?0$#$sVCt}RQ;hf!S78US2Mq03T;q&Gb zD&qIw&7`-g;wPi~{;Q(_t&gkaSNRxfU*MJT2+yx7g)#mX&QR1(o-DzD3!t0D9{(3Q zGm@v>kJ9Ea7k8&o_PaKXlcjR)>Sj+4Nr`Az zLoWdxMRL^qhs|0dfv+}o%XujCKOcA1h7i~rm?~$Ji>H{?**Sx#pfYA^{NIHy75E>w zWA}l*2n}_d`ty%sntqe7fGorL3&^aJEUuFNbqnN``jfo& z?ob(tzs6rKlKvImT>Gi-epTbq3%HEthyln7>6MUlfpk|Vul9w}i2J-ITy}64Athe; zx`yl2`x_XQ=~44f{FojPpRC9MkIcoD+p9xlOyYe$pXSbzQ-u1eatK6IT?+nPGKKpo z`1fCER|WNWWKJ{oCQAODAeKy@ihtVb{k0YTlPuNlM)v<$k+zGan{gX2XWV@v4Xo>- 
z>VLGog%2YS3#`NPF>Tm{t_|9|UoV8HRCDn0$6a4fzX7G18ZB78N_r1R*%(i`Wi63u+ zhi0}v4+gpM&XqB7E`al?GNkO<>JHV#H*_ih`cExy55*~N35mI`IzzQ}rOMjZ@sQgZ z4=K~XSi8Zwz_(c+i7zfl^?Xa{&HuFGG#q}Sv$faPKcgKX+gu!ibN%~PWpLa#6}vbg zU`zHdXA=qe?=Pp$2XMi=6v^-9l`nD&MgUOFsIfBwc%oE`$+frTUc;7pKeG-JJx$z^uK%NdSac)|C2x0#b#x>WOi~9ckoeQuzXK7u|(>X-| zmKt{Pb^d#rkat2r#L81=17Mr-urL`+8L!6q0I!bu7fT#szuq_B(Fp-$WokwMzbS)? z$;pg>iY%QHfLC9)!bN+9(Am3^L~~r$qCuuh6}iA8-!LSs7*&s{qvqmu?9SL{goAt` z%8Zx0bnG)jR!dk6S1ysX3$dq)#Dg;exJF%sup{QqnE-9Z2#ci1+6il2haKImsBjf_ zro~s7?ohaDAxXWxr4)JBTyD+~q0^ag}jq6JQCCMc&8CtYdg0W@G4K5+ByIS0kjyNsgz_6*+ z<(z<@`ENWZ(X%1gNaFeC;9FkGmD}OIni9?r=Ba#HrvxyrmlCdTm0+zef@LQFFlkT< zQT+<8hOi#chvi{@EHk9TSLMB7vb4a>|dw5ZcmHe zT)7>+SbHJ}%L~`!S4MX5^Bp1eQVQDD zRsF9~EL2NET`T~W>nTT17lF47Y268LA*ZcZg}KT@aSiv?cfpHlfBt6~ZIz?d3Y-z} zWqd-(KT&TYFV}ESezP4I7oR9Pl&11OgYgRgb(H0~Eat9kJI?`iIhiQ?@j4p%T+E#T z*Hj0N*4AlVIRxr(kg#0{!el^7JFfI!_r1<=7a>SxeR^-I1?E!TRS8Gc z{|542D$dq=?4>Z*e)E(8t&;w?#De1g6{>-p!MMsERcrf(&vTzuzxjv1wQ_#03vD%D z*u~vfZ@<^bu-5jWApmzv;}VWg|4$aE^>s*#EM)B40BfaO{T|*rB%E{(0IO)HYGEy= zx{a5GwhX@VB#>53rAwv2Si7>CgUPWhJj>W5z7Os(fp;1W;@UIFA1Z>l-hA(U;Vz<; zTHJv1@V+oJ+=oF;r308(FTpoW)~I*oD|PU|+8+7I1w=?a(|r8EP{Tt65|ruY-;x4i z-99ro>>#o-MutK8f+uv5l_~lV?FIQpm)C!4HjXh>%ts6iMINI9=bQ++xqf53YktIh zAXf8=IAj`(*3tRi{%h&X!8bcHZTAAtT%qj^{rn zXUl)o%ogX#Ee!N3+pF%=Y$x4Mvz>NdW;=5dCdbXY&$C^0zmaXVBT>r28aW z%xt!^?yGG1M+n(6B|6*Qf9uBi%kFox9sIw2jJUt({xDn2TDFsJn(egvy=-UQ?`J#j v{wUi;ca?45y~uXiO|l)jxmz$j%l4`pvYm9xY^U8K+gaCVJMVt$!w>!+%KLai literal 0 HcmV?d00001 diff --git a/src/delphi/eval/labelled_token_ids_dict.pkl b/src/delphi/eval/labelled_token_ids_dict.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5fe96a3904c3ef3c78e4e34cfa453e812f68eb99 GIT binary patch literal 274517 zcmbu|b97zX7Qo@uwr$(Ck=j<;lOk=?q&9PE+qP}nwrv|PZN{CW?=|Cc4&xA$)0>DAQ7)2mg|=Iy;&`}$jd{;$8Sk{&)D-t9cS zJiPt?dQ|^EDCgz>2d#a*{`pH`{Qs;{d*9#xsqnx5(YKw4chk21zl-4iyDI+e_iWzw zcf)@F%PQXOy#oGf9o>EqvJ-wMI}ht?6?=?LL;gmgMW 
zIUS*$jxbI~Sf?YL(-Ge3h~RWYbUGqA9g&@mC{9OIrz4uv5#8yC;dI3GbHwr&f@2}X ze-!-wYJ>An8$x0s6c$2bAq*D6Vj&zB!eb!<79##f;qRyph^RV3OScr;+X#Y_N zoSV`Ai{SVF6((RVV*D4uzeWXaLrg5h!a{5;#KA&bEX2b?d@LluLP9JgG6{aJ*Uf;a zPHYnXwekjz>LmYB2wdfoVj&q8l4BtS5CZlvDgUDocrH?5AvG4#SPFiYd%1v^Olv6w z-VFvGhjcCkKO6hWfHtIeA-LL+25v(JEM&w&CM;yeLKZA!#X>eLWXD1dEabEy__=sW z@{eji-wL^~kQ)nmu#guE`LK{53k9%H5DSHD2!B6!2d;93u}}mHMX^u}3&pWe0t+So z5&T@7q8N9BrLa&M3uUlS77OLDP#y~vuu#z?$ll&B(5YI$F38=s;D0U`t`!Sg4DIdRVBBg$7t? zh=oS~2sTgt1Mff@W1-1^6#hQJ2zYXD`X7bBy=aDo=2&Q9DcGq|9+rZg8r2dDt+3#* z6zq(Hr=?(L99mBj94ySm!aOX@$HD?E zEX2YhEG)*t5-cpm!ZIu@$HEFMti-}9EUd=D8Z4~E!a6Lh$HE3IY{bGQENsTY7A$PV z!Zs{y$HERQ?8L$@EbPX@9xUv|!agkQ$HDh$HEOP+{D5yEZoMz9W30%!aXe9 z$HD_FJjB8yEIh`-6D&N%!ZR#9$HEINyu`vQEWF0T8!WuV!aFRy$HE6Je8j>hEPTeo z7c6|m!Z$2@$HEUR{KUd9D!97^!$dGn1m{EuBVoFi;~tU|p{xlTH*?%WTN5@ed%1@( z5~dqQ?qQ9DaX7sqBE*BVJAAH83~C_7kBX7 zquVE3Tt0J;VI+(J9@9t|I}*#9urnjEtqD6b5{DCUIT4Q&@r6+5NCF`O=ZK%_eMW!2 zdF7tag7AMOlATFSWKGza)Wn=f!il7uNMau|t!jYXi_?e0146Ry%Mmk_RU#5kO}g$SG@HW$P^ zLa1AjSBSu^2zbq)u_O76gv^Mm;mmI&jIAhOB#f;nD1>@A3kl(BIE}3+%!wjK!Z@5o zjf8aMU#rXMQMe)Bkr82Bt(#7VP{4v3lU^T>=0ErQI!+btO+}DQJoVt zga~p*{99p6_L@e*7@f6*2y#;Gj74ou)Zs*3PSoQ>eNHs6ChVkHLm^y+mN70G389XQ z#++!viKd)rW=+_ck>=KfoiuB~2@fIEanVu;bzHRKgu|M!Gb5hXgq;~_&51UgXv>Lq zM#8uhc^L^~$!RZyIyyTD;VL?%Bd(rDI$9HULd)BluoGIHIN`$yUru!9L>EqU;!l(Yr;-|_vS<&PV_YruIh!H#`hCKUA6iP;i_sGYv%wV)U6oEi9tfB zJ2KdsuoIm_tO+~O>1QNd#f6+}IgNyIQipP47$=4cp$@GPLa5_nq>=d7j08T=A7!6# zb-Fg%NEjz|3@65NVjL&Nb7BG~CURmDCnj@Z3MZy=Vw#aKrrC5OVa$;koS4apS)7>7 zi8-8@%ZYiMn9qp?oLI<-MVweHgsU}G(#+LTv_uH?n!1z|%Q&%|6Dv5ek`t?}2|HWY z)k3(63uEnEBZRBCFoxDzBVlaCI!>(T#0E}mKC3&Q3FqrDb{&D%lyII*7-2RLz%6Nfl)m=i}hag-CsIB}d4CpdAE6Q?+F+DI5T z2WO0gadU8%6X!T_o)Z^1agh_3jD$>Tz=vLekH#(=3FD+*;lx!=T;s%bPTb(cO-|h6 z#BEO8;ly1|+_NU^Je%FOChUBy@PHE!Iq`@Sk2&!~2=$}xQz2YE>Kd!oGfq6`#0yTm zG!n81xH?gKWuN%hHQbR3LcCT!8s9v6CpVfN(gnDg%(1cW??uHmJ{JP5uOtf zgiy~&L?P5O5{VO$IT3{uQH_LgmlMrM7)wrcPQ>6uOd(u_mW+k}b}h&+V8;?d9a^zD 
z5r-3TIT4Q&@i~z|h#)7`&U0!)A=Hzah!cr9k%SXTIgyML$vKgN6Dc{7iW8|hk%kj# zIgySN={b>s6B&gFvgFt~*UBV>y5wZ$L>5kDPgKhgnCkQ8HvBQD1X0^L{5})8wulB60 z!ilDwXvT@=oM^!b4^Fh?L@Og<-2FO?gmL%lDMXOv*Um{}Ya!I-w+$!Sa-tn4yg1R` zny@n?9jpmEXE`0M2|KOuwkGVfq7x^4IN{5Q&PKwR>|Kn6G158!Ci4z=sK;4)EbZs9P~Y2z4t)S`ao%&L|5aaLEbs zG=8*^FfIaPI5CzJUbf7oK!m-r3u!Aouz0ZCnj-XvJmRwoFYV!!)a$MrgCB$ zC#G{^1}A26ViqT6b7BrB=5k^lC+2fv0Vfu6Vi6}6TM#x^vn3XU&8_QFPAsz~>?{Jy ztqD7YZv`h-a$*%HR&!zvC)RRe9VgZcp1l zdQx|AVkakdabhiEEs=E`&NRZU~`{ zialnwgnBGqbK;E<>W;h>Lfw&foOsWP51jbOiBFvP z%!x0Y_{xcIMnV=uSD)&Bw{(!87Go+A_XT>av~KcQVXGu zi!?&0<0365(s3fa5U$3;_%M+{2=!QGcj~jPWW=7Gbg%mqAMr5aiTjXdT^pACwg(Bw>4qs7Dyi<)D^t15b6rvj}!fkgz2oBtl`93PORg^ zdQNQM#70hR;>2c7Y!O1e2y7KXy$EdM#CA^X;KWW&>=HsfBfEuA&&VE5?B&EhPVDEz z0Zts`#34=`=EM^o&m)0xXp<>oVd%0dz`p$B%}_zx>oqWNEqwzLry%xgl}5+#~5*c!ilGxc*cq6 zLa1x_3nA16_$4P^apJW#VdseJjWuECi0dsU-f`kRCq8iEBPTuyp$_oRLZ}1$3n#vE z;u|NvbK(alehQ(U)L%lVC)F(^e-(xh>Q)5jL6uOisk&L~KsP;Y3_c#N$MK zA=H_gz(^P$GZPA-&eTMlNX&^OoJh)vWSmIOi4;PpLo1~a>bOY7iPT2I7~pA)gz+3D zEho}(B0VQEa3UioGI1g^C$b2k&XKG_sBuc>2&aJ8n&Ik>C4*yDsyw_>~yu3BL{N}s@qiJX{ZBxE>UeXKB9h#-g4&K6~g zHDPCqGF1rmj7;OibWY6R#7s`i;>2uD%;CgbBk^~D`#GWp{?gh!BVkPT`9?xo;i_sa zFcQXAEab!@PAulc5>71T#4;mcoYdt;!Z@ibII)rwt2nWm6KgoJmJ{nZv7QqfII)ov zn>ewV6I(d3l@r@Iv7Hk;II+{3u(RXaWlh-G@$Kfs9!~5PBFNpZosR4iLcMj}&xr#< zs5^2{2z5seapEv1j&R~ACysIAI44eU;v^?d389Y8(?SFpop!3$8Ee8$vY)jk?6l&X z5bCiwFGP@IVW%S(IB}5^mpE~m6IVEKl@r%Eah(%4giuH4O(E3Ld5aUbIdO**cR6v7 z6ZbjsfD;cn@rVb`6JLc;59c=_)Wi9m6F)fdlM}x<;TGzj_g|=oGnf$SNewQ9x)mWf5t0+3 zI1!o?VK@<%6XA@6@so-0La0M4f)K7k%h-yDM#A{{NF+`~=0p@B)WaE72v@^t9E)h2 zh|Y-^La4_grV#3}h{cK6oQT7TxSWW`iTIpIz=?#MNW_W6oJhinq?|~`iR7F}!HJZd zNX3cNoJhlow46xCiS(Svz=@33gq_p)OxA>*qx8(2$ij)NoXEzB?3~EKiJY9sC4_pH zlbaKHgivpk@^T^{C-QTm04EA^q7WwvbD{_*igKbDCyH~T1Sd*zq7)}ebD|6<%5tI{ zC(3i8f{`$uUsN;_#%FeSPE;}y(h*me{3{y?V@IlRqADkiDsN=&WRSB@US5K9O3<+Q!N$& z=bs-Y)zX4+{vnKuKr2o-IN`~O)|_a=iME_*#|bY^wC6+zPITmiHzzuA!iN*SoaoGn zE}ZDfiEf7Gj#$dCURmD 
zCnj@Z3MZy=Vj3r=b7F=N>gb#)ggQEBabh+n=5S&zC+2ZtJ|`A%Vj(9MabhtimT+Pz zCzf$yIVVhAPTc0i9ZuZk#62N`%n>`2dS3{2jy&MRLry&6#A8l8 z;lxu;JQE_wxUiF{&xKIO#S2cn6Y)8bKnV4u zCKN(Fsfjp|SO|4Tk_e&hNKzy5uS0AMA+y3-NE>4+Kdg@7g;L>^A$ zT{w2 zCmM315hogRq6sIOa-tb0nscHBCpiB6pG;e;mIE6XQ5Bo)Z%|G0{lKC)}=H{4>c&7(d~j%!w(Sn97N1oS4pu8Jw8OiCLVO&51dj zn9GTIoS4sv1)NyOiA9`P%!wtOSjvfIoLJ6@6`WYfiB+6fErfcnwMGb6do5WI1HQK$ z_{!H>BVk++*KuM!CpK_mBPTX-VlyYUaAKge1igsbQ@hSqjY?BK*sPVC~uZcgms z#9mJ924Qo&t4(i}Dt#xy&^iKCo2#);#cIKhdNoH%7o*g3>LZB5uY z#6H7`vz$1`iSwMez=?~TxFm#nS-vcUdRe~0iL0Er#)<2kxWS2=M#8uV+%gizMc_6k z?g*id&bvaWqw^jo?sMV+CmwR*5horCp`O$yLZ~P8DJPzB;<*s&v3MbbdMsXY;uR-e zbK(sr-g4p{C*E`711COm;u9x6bK(mpzH;IlC%$vy2Pb}V;uj~}!tnQBa3VM-LU1A^ zCqi)|G$+DvA}lAuaUwh?B5)!iCn9koGAE*NA}S}MaUwb=VsIiRCt`6THYeh6A}%N5 zaUwn^5^y3RClYZYF(;C6A}J@5aUwYXD9jU;Hibld% z!QG97v4U6PL}ek=Gg3te^^8>IL^UH}oRR8A!Z;%}I8l=mwK!3m6LmOImlO3kQJ)hH zIMI+3jX2Sm6HPeLloQQ3(Od|1?Q9{0x^{YSq9rF;Vp!EEIM(*hZDY> z=*)>OoaoAlZk*`Oi5{Hj$%$T^=*@{foaoDmew^sfi2+RE2=!PD<-{;f4CllMBOzN9SKsIvDTI1PMhT&wkfwiFs;bf}2|~x8J|yC#uX>TS6=l zLR~l(3K2N#{T$)_j}T-{3jFiU1Aa8jA|ceRSj>qfLIm!J^G`>NPu@#8v5XVTIkAEh zD><=>6RU+#ht?V))N!$v6YDs!o)a55v5^y-II)=%TR5?m6WchkT?loK><~hoBRe^< zixay!v4<0TIkAru`#Eud69+kQNC;oNA=G1Wf)gh>aS9W@ zY28m_#Qh8>&T`@$C(d)?0w*qV;u0q=bK;5+>Ri1lggRHRapF2BZgAozCvI`#HYe_I z;w~rdapFEF9&q9zCmwO)F(;mI;wdMdapE~AUU1?iCth*lH7DM1;w>lMapFBEK5*hA zCq8lFvk>ad!51Oan}e^M_{NFvocO_spPcx`3AeEPRT!KI&WRA52+4_1oCwW{Fq{a> ziEx|aA_^y>av~ZhqH`h!Ct`9U7AInJA`U0wav~ll;&UPa zClYcZ5hoIJA_*swav~Wgk_(|;M^Xr(UPn@LB9%2^=Sv)^tqD6{;z+}Zw46xCiS(Sv zAcQ)!G76y%txTNA%!w?V$jXUqoXF0J9Gu9>iCmn>&51mm$jga*oXF3K0-Pwwi9(zx z%!wkLC@O@y{1y{JU4Dymq68;Oa-tL`N^_zNC(3f794E?iq5>x>a>AVxl{itE6ID1- zl@rxCQJoVtgizPcnnI{+XDuPrt*9-8x)pUeQCA3cN9qZo?nr%3G~h%-BVlpP-_H@t z>FeIeNEk2JH|9hWPBi62Gfp(;L<>%MaH1tAT5-Z5ggV(hg-|DZYfiM`L|aa@abh?pMsQ*zCq{8%G$+Pz zVyqDA=o}}6Iy%R5Vge^7a$*uECUas6C#G^@8YiZ6Vg@H>a$*)IW?K_>E)2}EChS}o zn9GTIoR}|!x@s*DLS3~Ma$*rD7IR_=Czf(z87G!=Vg)Bwa$*%HR&!zvC)RReoe=6| 
zUoV6@**9=vBPTX-VlyYUaAGSbwsB%RCw6dRCnt7sVmBxDaAGef_HklACk_aqu386$ zP*<%(oH)#hBb+$OiDR5N&WRJ8ILV1qoH)&iGn_cfiF2GdZ%x>FHoIU=*m*X)D1^G? zToOWEaxQb?3MZ~|;u0LJ|`Y<;vpvqVNA21M#6Z%^A{)F!u|993w3k`6G9!G!8sAany~XQ5z?Bl z^Dq&L6QMZ~h7(~q5snk#IT3*q5jhcw6OlO)MF@4WM-@Vy?9n(8of9!Q5t9?KI1!r@ zaX1l|6Y)3^pA!is6B#*?i4&PQk%bdkIgyPM**TGe6FE7Nixasyk%tp`IgyVO`8iR569t7(Zw?9x zq23%6=0p)r6y-!QP81hHJtHNAP|rw7PL$$AX-<^kL|IOh<3xE*RNzEKPPlWT5+^Ei zq6#Ofa-te1s&k?SCu(w{7AI<36L!vi>sS+Z&VK6(p)387vD>RS_b#-f2WVP`BF za-tC@8grrvCz^7i87G=^q6H^BIMI?5t%Oi#szV5Mrh0OsH7D9w5H^1xZCeY%=8uAH z#|bY^wC6+zPITmiHzzuA!iN*SoaoGnE<&j5MOPtQ)eE^W5cual2fPrfn-J<&bQi)^ zD~xvzdT^pACwg(BHz)cCp`MYxLa1k?A1C^AVgM%wa$=AW>KPd2WY!p>v=6l=oHWB*i6 zOyk6KPR!uMOis+=#B5H?5kj5pbA?bR`#es}=fna|Eab!@PAulc5>71T#4=7S=fnz5 ztQ10>W~+oyr`c*wtl`93PORg^dLh&^vOx&-jBMn@CQfYT#1>9$<-|4%!sdDBb_>Gh zdFKvJ?Bv8QPVDBy9wF3ewpR#sn(gDnej(HyIUt0(BL_Kgh!cl7afA~`g;3APF(K45 za-0(dU34g-{RY8BUz##5qo!=fnk0T;#+hBO!6|uMao;95J1L ze%i%lBVqhx;tD6Oa^e~%u5;oBCvI}$7AJ0V;tnV7a^fB*?sMXS5Uwh?G0h$dp|0SM zIPsVhPmF|AEmy1AQzKy<&SyfXXXLpM>KS>#iI<#s#fjIPc*BXeoOs8H_d=*c>w^&L z(E7-UPeQ0W@>vLVN4{|4D<{5j;yWjP2%(;lpF*f-NJbQ ziO52zGc^h)qH-b{C!%vA1}9>2A{HlNb0UtBFjlR&M#5ON;&CFr5b8`#AcQ*25^^FD zClYfa2`7?rA{i%=3!#pS6hf%uA|)qMaUwM*(r_XzC(>~uJts17A|odiCmn>&51mm$jga*oXF3K0-Pwwi9(zx%!wkLD9VXqoG8wT z5<;lAC?$nZZ&6Aa3F9JA+DI4|figm<$D*td>ai%tiSnGNz=?{SaOXrNPE_VZ6;4#; zL^V!S=R^%o)Z|1hPSoZ^9ZuBcL_JQ_=R^ZeG~`4hPBi936HYYcL^Dn_=R^xmcyOX6 zCt7jB!3j@JwB|$`PPFAjJ5G3UqP-C6eP;(D)cekkobcvECr z6WxVS2Y3%5)B)a;6TLXmn-hIF(U%kbgiudve<9S9I)D=cg-~~7kPzyQ4CcfTPWW-c z$%&zy7{-a=oEX80k(?OCiP4-G!-=t+7{`h6oS49gi9)EW)+8a+RckUQrf^~^C#G2w zb}pVxwFSKLfw&FoY>8YJ)GFfiG7^d&xr$^ILL`ZoH)#hBb+$OiDR5N&WRJ8ILV1q zoH)&iGn_cfiF2Gd&xs41xX6i1oVd)1E1bB>iEEs=&WRhGxM@w;*>~QuChTk)Z*$@f zC+-TN-mcvfLcLwP&xr?|c*u!IoOsNMC!Bc7iD#U6&WRVCc*%)ZoOsQNH=KCOiFcfM z&xsG5_{fP*ocPR%FG8pnfv-ZS7lCh__|AzRocPI!Uz~7@z~6ttiQv|RoxN5FYr@W6 zDA|fXuaUwD&qHrQAC!$#sc9K21HDM>&V{jrSCt`6T zHYeh6A}%N5aUwn^5^y4+5bBLmA|ccprNo>_VkG3-kOA*?^>ajXI-)xrF`WOr*EOk; 
zFn${{87Go+A_XT>av~KcQgb2=C(?2v9VgOrA_FHfav~EaG7F)uT3LipSFNm^$i|86 zoXEk6oSewTiQJsXV@=rE@#VE9?CkjRaUwq_3UHzzCkk<*un_8cQA7xJy(r3wVw@<> zi4vSBDTI1PN(rH!ko@D>i3yyT$cagun9PYOoS4dqX`GnO zi5Z-j$%$E66(Yz5(ay)a z%Y;xbh|4*#f)gt_v5FI`IkAQlYdNuw6YDv#ffE~rQ0K@dA+$N-o7R0ZM%=e>Vk;-M zabi0sc5q^+HDTv*VwVu=;My&OHn`mPaAGef_HklACk_bVYDSD}`#~YpGjfO%hdFUX z2v=ia+@u^8LOmA8IB}d4CpdAE6Q?+FniFR@ah4P3IB}j67lcsf$VDO4IdX{;mpO5T z6IVHLjT6^7af1^#IdO{fc#6L&drj}!Mf@qiN#Iq`@Sk2&##6Hhtuj1$i}@q!aC zIq^ye^=kH72=!|Ah7)hC2|Leb@2m+shhy(K@qrT`Iq``TpE>b`6JL#l@$v4PkuW|L zeK!)uR{Rh`ovA;CP-p5dPPj$nufpI&a888aL`Y7A;zVdpgyBS3PK4t`cuqv%L_|(R z;zVRlMBzkKA=HI0nh@&37hMQ-D`E(tZbeK^#NtG3PQ>9vTu#K}M0`#p;6y?p)X|wp z2z7KO=0p-sB;`ahP9*0<3QnZtL@G|C=0qA!q~%09PNe5V22Nz;L?%vT=0p}wWaUIQ zPGsjq4o>9cL@rL`=0qM&!n6SK&FdkM`;6z1ExC^09_DVvilfAN$FwRI7BVnA8s+_3C ziRzrF!HJrjsKtreM#2~ub&P~DF6wfk9w+K^q5&rwa-tC@8grrvCz^7i87G=^q6H^B zIMI?5tvKP}geNCjTM&MZ@cti?OW_N6SFJP9yK3855Y9h@vGBF!L_1D+aiTpZI&h*R zC%ie)i4#7Y@a05jPITc!S59=}M0ZZ~;6zVO^x{NsPW0hKUrzM1AZ#uI{VfQai@*RO z)K7>5g-|~s4&uaMP7L9MA19og7|MxZoEXlD5u6ywiBX&w&51Fb7|V%qoEXoE37nY7 ziAkK8%!w(Sn97N1oS4pu8Jw8OiCLVO&51djn9GTIoS4sv1)NyOiA9`P%!wtOSjvfI zoLJ6@6+)uOkPAP_HA0IB}R0M>uhm6UR7loD(NFagr0K zIB}X2XEptPCVqqBThW##1l?D<-{{iJmkPDJHIG)_e4L<~;ELzJWj;tL;_AE zhLLzbJp zSvZlE6WKVCofA1Yk&_d-IFXwZc{q`m6ZtripA!W*QIHdbI8m4rML1EE6UD3vJJ%M9 zTN8F()?0!TB{@-w6Qwy(h7)BuQH~SkIZ=TV6*=L~iAtQP%!w+Ts49edyH-sI^>(c~ zCu(q_CMRlfqBbY$aH6gd>bR&UggP$jbD{w!8gil$CmM622`8Fzq8TTebE1VcVP~V{ zVNKZCD7EB7D^55#;mL{CoM^*|ww!3k2`^5x7eZaZI|!k!;2nifx58Tpbt^h?!iN*S zoaoGnE}ZDfiEfqv6>TWII)%!>o~EV6B{_O zkrSIZu~`Up1>YistMY5ik*%EA#)<8m*ujaNLa1kCmk{b1+0BVPoY>2WeVo|Oi36NC zD1|FO z6OTFZgcDCW@r)DCIq`xMFFEl_2z9-9ErhFjVNCWnoOsKLcbs_7i4UCk$cay!_{@nf zocPL#Z`OpJ4>!JB6Lvn__`!*vocP5Fx5)hc7n}&ri4dF!DTKQGh7v+uenWF23@5^J zA{-~eb0PvKB61=UCn9qq3MZlpq0W(LLa1{jIwxXqA|@wdaUwP+;&37^C*pA;J|_}z zA|WRdaUwA%l5ip^Cz5d@xe)5Al|l%0)k?{URGdi7i8P!@%ZYTHNY9B3oXE(DOq|Hf 
zi7cGR%86{8$j*rzoXE+ET%5?wi9DRh%ZYqKs273!LZ}yk0z#-;QBVkVD++OC4@S(N^_zNC(3f794E?iq5>x>a>AVxl{itE6ID1-l@rxC zQJoVtI8l=mwK!3m6LmOImlO4jgnZiX{BuL!&k@b(i0*X6a5`c-9kHCg?)8m?@dLmH zoM_02Mx1CYgnB`2B7}NDY|4pdoMRG!pVuDoEXN5;hY%3iIJQb#fj0J z7{iINoEXQ6@tl~ziHV$;#EHqAn8JyvoR}try8KQTLS24maAGDWW^rORC+2WsE+^)3 zVm>DpaAF}R7I9)RCzfzxDJPb3VmT*PaAGAVRtcf5ovVdV*UmMZSj&lZoLJ9^4V>7> ziA_SN<6^TA>bTg#iLIR2#)<8m*ujaNoY=*Q-JICNiM^cICxkjj_6woTkprAK$caOo zILwJ7La1lts1WKIImU_OoH)UWlbkrkiPM}o!-=z;ILC?eoVdV=i=4Q`iOWK$ll_Vi z>SVvliEEs=&WRhGxXFoIoVd-2JDj*HggQF!389Y8`Ku8^i8n&1JMvZtbw}PA3AqaB>Sev}g;00ogAnSDeB{I@PJHIX z7fyUN62?jWW=+_+!tvdjuycju2Pb}V;uj~}qVV@$a3VM-LU1A^Cqi)|G$+DvA}lAu zaUwh?B5)!iCn9koGAE*NA}S}MaUwb=VsIiRCt`6THYeh6A}%N5aUwn^5^y3RClYZY zu@LIrZxSKYyWgaoNXCieoJhfml$=P#iPW4(!-=$qQnJ z)b%1OC$e!OJ125*A}1$uaUwS-@)!wu><{=_=ifhg$?wnCI`bL{i4vSB$%#@zsAr@!C&~z+p474d#DtVt-TwS?Izp(e za=ba^IZ=TV6*=L~iAtQP%!w+Ts49dyS*i)4j?U_wsKJSvoTw#)dN^wfp&rgUoT$r* zdYq`wi3Xf#$caXrXv~QwLa3v&sSxVuY{rS^oM<70dMrGIP>)4RPPF2LgA<;dXw8W> zoM_95cAW6yM0-wj;6z7Gcypo?Cww^J%ZbjM=wc+~Aj8$Al&(g?c#zSJ6Wuw{gA+YD z(aT5}XQa20FwRIHPW0tOKTh=L!~jkVMF)&Yr@VqFt%`Ft2JTgOB~y*2|FFx&WRn?gq@D; z2Web$7XxY*B$1J;CpT*!>LdKdn||4IbDcXh}<;VLc; z8wq1v9O1-KP8{RJaZa2t62`bVX(Ws@a!LsG{^GO{uJ#wkj+`+P#*Um7!c|B9ZiSyC z>fbpM@FOSA83|)6&Kn77g{w5XU`^PmS{H>-&&VYqTn(pjEG`S7Zp9TL)UCKGM3Akp zGo06iP`BbbCvI@!rVv4ng`F+REh8ZT?rJr=ErfbF?+D?lBmZi}-^m_u5&5nVu3GUo z5oEo%XC#a{a-S0qIPp*jSHo%S$Ri4+Nm zXE43w#491xaq*fHZ#eN*2#ewLb3}4FA_sop_TOLl`MSTeLpc8;0?){MA=Jb9ffFBv qP>;nYJB0OEe6~Ya&&U@}eC5P9PJHLY4^I5##4l^Y#&*p&^Zx)TH9C<1 literal 0 HcmV?d00001 diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py new file mode 100644 index 00000000..80673e03 --- /dev/null +++ b/src/delphi/eval/token_labelling.py @@ -0,0 +1,210 @@ +from typing import Callable, Optional + +import spacy +from spacy.tokens import Doc, Token +from spacy.util import is_package + +# make sure the english language model capabilities are installed by the equivalent of: +# python -m spacy 
download en_core_web_sm +# Should be run once, initially. Download only starts if not already installed. +SPACY_MODEL = "en_core_web_sm" # small: "en_core_web_sm", large: "en_core_web_trf" +NLP = None # global var to hold the language model +if not is_package(SPACY_MODEL): + spacy.cli.download(SPACY_MODEL, False, False) + + +TOKEN_LABELS: dict[str, Callable] = { + # --- custom categories --- + "Starts with space": (lambda token: token.text.startswith(" ")), # bool + "Capitalized": (lambda token: token.text[0].isupper()), # bool + # --- POS (part-of-speech) categories --- + # They include the Universal POS tags (https://universaldependencies.org/u/pos/) + # -> "POS Tag": (lambda token: token.pos_), # 'NOUN', 'VB', .. + "Is Adjective": (lambda token: token.pos_ == "ADJ"), + "Is Adposition": (lambda token: token.pos_ == "ADP"), + "Is Adverb": (lambda token: token.pos_ == "ADV"), + "Is Auxiliary": (lambda token: token.pos_ == "AUX"), + "Is Coordinating conjuction": (lambda token: token.pos_ == "CCONJ"), + "Is Determiner": (lambda token: token.pos_ == "DET"), + "Is Interjunction": (lambda token: token.pos_ == "INTJ"), + "Is Noun": (lambda token: token.pos_ == "NOUN"), + "Is Numeral": (lambda token: token.pos_ == "NUM"), + "Is Particle": (lambda token: token.pos_ == "PART"), + "Is Pronoun": (lambda token: token.pos_ == "PRON"), + "Is Proper Noun": (lambda token: token.pos_ == "PROPN"), + "Is Punctuation": (lambda token: token.pos_ == "PUNCT"), + "Is Subordinating conjuction": (lambda token: token.pos_ == "SCONJ"), + "Is Symbol": (lambda token: token.pos_ == "SYM"), + "Is Verb": (lambda token: token.pos_ == "VERB"), + "Is Other": (lambda token: token.pos_ == "X"), + # --- dependency categories --- + # -> "Dependency": (lambda token: token.dep_), # 'nsubj', 'ROOT', 'dobj', .. 
+ # "Is Subject": (lambda token: token.dep_ == "nsubj"), + # "Is Object": (lambda token: token.dep_ == "dobj"), + # "Is Root": ( + # lambda token: token.dep_ == "ROOT" + # ), # root of the sentence (often a verb) + # "Is auxiliary": (lambda token: token.dep_ == "aux"), + # --- Named entity recognition (NER) categories --- + # "Named Entity Type": (lambda token: token.ent_type_), # '', 'PERSON', 'ORG', 'GPE', .. + "Is Named Entity": (lambda token: token.ent_type_ != ""), +} + + +def explain_token_labels(token: Optional[Token] = None) -> None: + """ + Prints the explanation of a specific token's labels or of ALL + possible labels (POS, dependency, NER, ...), if no token is provided. + + Parameters + ---------- + token : Optional[Token], optional + The token, whose labels should be explained. If None, all labels + possible labels are explained, by default None. + """ + if token is not None: + # get token labels + labels = label_single_token(token) + print(" Explanation of token labels ".center(45, "-")) + print("Token text:".ljust(20), token.text) + print("Token dependency:".ljust(20), spacy.glossary.explain(token.dep_)) + print("Token POS:".ljust(20), spacy.glossary.explain(token.pos_)) + print(" Token labels ".center(45, "-")) + for i, (label_name, value) in enumerate(labels.items()): + print(f" {i:2} ", label_name.ljust(20), value) + + else: + glossary = spacy.glossary.GLOSSARY + print( + f"Explanation of all {len(glossary.keys())} token labels (POS, dependency, NER, ...):" + ) + for label, key in glossary.items(): + print(" ", label.ljust(10), key) + + +def label_single_token(token: Token | None) -> dict[str, bool]: + """ + Labels a single token. A token, that has been analyzed by the spaCy + library. + + Parameters + ---------- + token : Token | None + The token to be labelled. + + Returns + ------- + dict[str, bool] + Returns a dictionary with the token's labels as keys and their + corresponding boolean values. 
+ """ + labels = dict() # The dict holding labels of a single token + # if token is None, then it is a '' empty strong token or similar + if token is None: + for label_name, category_check in TOKEN_LABELS.items(): + labels[label_name] = False + labels["Is Other"] = True + return labels + # all other cases / normal tokens + for label_name, category_check in TOKEN_LABELS.items(): + labels[label_name] = category_check(token) + return labels + + +def label_sentence(tokens: Doc | list[Token]) -> list[dict[str, bool]]: + """ + Labels spaCy Tokens in a sentence. Takes the context of the token into account + for dependency labels (e.g. subject, object, ...), IF dependency labels are turned on. + + Parameters + ---------- + tokens : list[Token] + A list of tokens. + + Returns + ------- + list[dict[str, bool]] + Returns a list of the tokens' labels. + """ + labelled_tokens = list() # list holding labels for all tokens of sentence + # if the list is empty it is because token is '' empty string or similar + if len(tokens) == 0: + labels = label_single_token(None) + labelled_tokens.append(labels) + return labelled_tokens + # in all other cases + for token in tokens: + labels = label_single_token(token) + labelled_tokens.append(labels) + return labelled_tokens + + +def label_batch_sentences( + sentences: list[str] | list[list[str]], + tokenized: bool = True, + verbose: bool = False, +) -> list[list[dict[str, bool]]]: + """ + Labels tokens in a sentence batchwise. Takes the context of the token into + account for dependency labels (e.g. subject, object, ...). + + Parameters + ---------- + sentences : list + A batch/list of sentences, each being a list of tokens. + tokenized : bool, optional + Whether the sentences are already tokenized, by default True. If the sentences + are full strings and not lists of tokens, then set to False. If true then `sentences` must be list[list[str]]. 
+ verbose : bool, optional + Whether to print the tokens and their labels to the console, by default False. + + Returns + ------- + list[list[dict[str, bool]] + Returns a list of sentences. Each sentence contains a list of its + corresponding token length where each entry provides the labels/categories + for the token. Sentence -> Token -> Labels + """ + global NLP, SPACY_MODEL + + if NLP is None: + # Load english language model + NLP = spacy.load(SPACY_MODEL) + # labelled tokens, list holding sentences holding tokens holding corresponding token labels + labelled_sentences: list[list[dict[str, bool]]] = list() + + # go through each sentence in the batch + for sentence in sentences: + if tokenized: + # sentence is a list of tokens + doc = Doc(NLP.vocab, words=sentence) # type: ignore + # Apply the spaCy pipeline, except for the tokenizer + for name, proc in NLP.pipeline: + if name != "tokenizer": + doc = proc(doc) + else: + # sentence is a single string + doc = NLP(sentence) # type: ignore + + labelled_tokens = list() # list holding labels for all tokens of sentence + labelled_tokens = label_sentence(doc) + + # print the token and its labels to console + if verbose is True: + # go through each token in the sentence + for token, labelled_token in zip(doc, labelled_tokens): + print(f"Token: {token}") + print(" | ".join(list(TOKEN_LABELS.keys()))) + printable = [ + str(l).ljust(len(name)) for name, l in labelled_token.items() + ] + printable = " | ".join(printable) + print(printable) + print("---") + # add current sentence's tokens' labels to the list + labelled_sentences.append(labelled_tokens) + + if verbose is True: + print("\n") + + return labelled_sentences diff --git a/tests/eval/test_token_labelling.py b/tests/eval/test_token_labelling.py new file mode 100644 index 00000000..a727ddc0 --- /dev/null +++ b/tests/eval/test_token_labelling.py @@ -0,0 +1,114 @@ +import pytest +import spacy +from spacy.language import Language +from spacy.tokens import Doc + +import 
delphi.eval.token_labelling as tl


@pytest.fixture
def dummy_doc() -> tuple[str, Doc, dict[str, bool]]:
    """
    Create a dummy Doc (list of Tokens) with specific attributes for testing purposes.

    Returns the raw sentence text, the hand-built Doc, and the expected label
    dict for its first token ("Peter").
    """
    nlp_dummy = Language()

    # Assume we're creating a dummy token with specific attributes
    words = ["Peter", "is", "a", "person"]
    spaces = [True, True, True, True]  # every token is followed by a space
    pos_tags = ["PROPN", "AUX", "DET", "NOUN"]  # Part-of-speech tag
    dep_tags = ["nsubj", "ROOT", "det", "attr"]  # Dependency tag
    ner_tags = ["PERSON", "", "", ""]  # Named entity tag

    # Ensure the length of pos_tags and dep_tags matches the length of words
    assert len(words) == len(pos_tags) == len(dep_tags) == len(ner_tags)

    # Create a Doc with one dummy token
    doc = Doc(nlp_dummy.vocab, words=words, spaces=spaces)

    # Manually set POS, dependency and NER tags
    for token, pos, dep, ner_tag in zip(doc, pos_tags, dep_tags, ner_tags):
        token.pos_, token.dep_, token.ent_type_ = pos, dep, ner_tag

    # Token labels for "Peter" in the dummy doc — keys must match
    # tl.TOKEN_LABELS exactly (including the "conjuction" spellings).
    PETER_TOKEN_LABEL = {
        "Starts with space": False,
        "Capitalized": True,
        "Is Adjective": False,
        "Is Adposition": False,
        "Is Adverb": False,
        "Is Auxiliary": False,
        "Is Coordinating conjuction": False,
        "Is Determiner": False,
        "Is Interjunction": False,
        "Is Noun": False,
        "Is Numeral": False,
        "Is Particle": False,
        "Is Pronoun": False,
        "Is Proper Noun": True,
        "Is Punctuation": False,
        "Is Subordinating conjuction": False,
        "Is Symbol": False,
        "Is Verb": False,
        "Is Other": False,
        "Is Named Entity": True,
    }
    text = " ".join(words)
    return text, doc, PETER_TOKEN_LABEL


def test_explain_token_labels(dummy_doc):
    """
    Test the explain_token_labels function.

    Smoke test only: asserts the function runs without raising, both for the
    no-argument (full glossary) case and for a concrete token.
    """
    # explain all labels
    tl.explain_token_labels()
    # print explanations for the first token in doc
    text, doc, PETER_TOKEN_LABEL = dummy_doc
    tl.explain_token_labels(doc[0])


def test_label_single_token(dummy_doc):
    """
    Test the label_single_token function.
    """
    # create a dummy token
    text, doc, PETER_TOKEN_LABEL = dummy_doc
    token = doc[0]
    # label the token
    labels = tl.label_single_token(token)
    # check if the labels are correct
    assert labels == PETER_TOKEN_LABEL


def test_label_sentence(dummy_doc):
    """
    Test the label_sentence function.
    """
    text, doc, PETER_TOKEN_LABEL = dummy_doc
    # label the sentence
    labels = tl.label_sentence(doc)
    # assert the first token is labeled correctly
    assert labels[0] == PETER_TOKEN_LABEL
    # iterate through tokens in doc
    for token, label in zip(doc, labels):
        assert label == tl.label_single_token(token)


def test_label_batch_sentences(dummy_doc):
    """
    Test the label_batch_sentences function.
    """
    # NOTE(review): this assumes the real en_core_web_sm model labels "Peter"
    # exactly like the hand-set dummy tags — confirm if the model version changes.
    # create a batch of sentences
    text, doc, PETER_TOKEN_LABEL = dummy_doc
    text = text.split(" ")
    batch = [text, text, text]
    # label the batch
    labels = tl.label_batch_sentences(batch, tokenized=True)
    # assert the first token is labeled correctly
    assert labels[0][0] == PETER_TOKEN_LABEL
    assert labels[1][0] == PETER_TOKEN_LABEL
    assert labels[2][0] == PETER_TOKEN_LABEL
    # iterate through tokens in doc
    for token, label in zip(doc, labels[0]):
        assert label == tl.label_single_token(token)