From c8e516b3d7551ccc54e31e726b030f2cf91d664e Mon Sep 17 00:00:00 2001
From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com>
Date: Thu, 1 Feb 2024 17:25:50 +0100
Subject: [PATCH 01/29] add token labelling

---
 src/delphi/eval/token_labelling.py | 114 +++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 src/delphi/eval/token_labelling.py

diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py
new file mode 100644
index 00000000..ae15f639
--- /dev/null
+++ b/src/delphi/eval/token_labelling.py
@@ -0,0 +1,114 @@
+"""
+This script creates labels for tokens in a sentence.
+It takes the context of the token into account.
+Additionally, it can visualize the sentences and their part-of-speech (POS) tags.
+"""
+
+from typing import List
+
+import spacy  # pylint: disable=import-error
+from spacy.tokens import Doc  # pylint: disable=import-error
+
+
+# make sure the english language model capabilities are installed by the equivalent of:
+# python -m spacy download en_core_web_sm
+# Should be run once, initially. Download only starts if not already installed.
+spacy.cli.download("en_core_web_sm", False, False, "-q")
+
+
+CATEGORIES = {
+    "Starts with space": (lambda token: token.text.startswith(" ")),  # bool
+    "Capitalized": (lambda token: token.text[0].isupper()),  # bool
+    "POS Tag": (lambda token: token.pos_),  # 'NOUN', 'VB', ..
+    "Is Noun": (lambda token: token.pos_ == "NOUN"),  # redundant
+    "Is Verb": (lambda token: "VB" in token.tag_),  # redundant
+    "Is Adjective": (lambda token: token.pos_ == "ADJ"),  # redundant
+    "Is Adverb": (lambda token: token.pos_ == "ADV"),  # redundant
+    "Named Entity Type": (
+        lambda token: token.ent_type_ if token.ent_type_ != "" else False
+    ),  # False, 'PERSON', 'ORG', 'GPE', ..
+    "Dependency": (lambda token: token.dep_),  # 'nsubj', 'ROOT', 'dobj', ..
+}
+
+
+def label_tokens(sentences: List, tokenized: bool=True, verbose: bool = False) -> List[List]:
+    """
+    Labels tokens in a sentence. Takes the context of the token into account.
+
+    Parameters
+    ----------
+    sentences : List
+        A batch/list of sentences, each being a list of tokens.
+    tokenized : bool, optional
+        Whether the sentences are already tokenized, by default True. If the sentences
+        are full strings and not lists of tokens, then set to False.
+    verbose : bool, optional
+        Whether to print the tokens and their labels to the console, by default False.
+
+    Returns
+    -------
+    List[List]
+        Returns a list of sentences. Each sentence contains a list of the same
+        length as its tokens, where each entry holds the labels/categories for
+        the corresponding token. Sentence -> Token -> Labels
+    """
+    assert isinstance(sentences, list)
+    # Load english language model
+    nlp = spacy.load("en_core_web_sm")
+    # labelled tokens, List holding sentences holding tokens holding corresponding token labels
+    labelled_sentences = list()
+
+    for sentence in sentences:
+        if tokenized:
+            # sentence is a list of tokens
+            doc = Doc(nlp.vocab, words=sentence)
+            # Apply the spaCy pipeline, except for the tokenizer
+            for name, proc in nlp.pipeline:
+                if name != "tokenizer":
+                    doc = proc(doc)
+        else:
+            # sentence is a single string
+            doc = nlp(sentence)
+
+        labelled_tokens = list()  # List holding labels for all tokens of sentence
+
+        for token in doc:
+            labels = list()  # The list holding labels of a single token
+            for _, category_check in CATEGORIES.items():
+                label = category_check(token)
+                labels.append(label)
+            # add the current token's labels to the list
+            labelled_tokens.append(labels)
+
+            # print the token and its labels to console
+            if verbose is True:
+                print(f"Token: {token.text}")
+                print(" | ".join(list(CATEGORIES.keys())))
+                printable = [
+                    str(l).ljust(len(cname))
+                    for l, cname in zip(labels, CATEGORIES.keys())
+                ]
+                printable = " | ".join(printable)
+                print(printable)
+                print("---")
+        # add current sentence's tokens' labels to the list
+        labelled_sentences.append(labelled_tokens)
+
+        if verbose is True:
+            print("\n")
+
+    return labelled_sentences
+
+
+if __name__ == "__main__":
+    # result = label_tokens(
+    #     ["Hi, my name is Joshua.".split(" "), "The highway is full of car s, Peter.".split(" ")],
+    #     tokenized=True,
+    #     verbose=True,
+    # )
+    result = label_tokens(
+        ["Hi, my name is Joshua.", "The highway is full of car s, Peter."],
+        tokenized=False,
+        verbose=True,
+    )
+    print(result)
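A minimal usage sketch of the `label_tokens` helper added in this patch. It assumes the package is importable as `delphi.eval.token_labelling` (matching the file path in the diff) and that the spaCy English model downloaded by the script is available; the sentences are illustrative:

    from delphi.eval.token_labelling import label_tokens

    # Raw strings: with tokenized=False the spaCy pipeline tokenizes internally.
    labelled = label_tokens(
        ["The cat sat on the mat.", "Anna visited Berlin."],
        tokenized=False,
    )
    # One entry per token, ordered like CATEGORIES:
    # [starts with space, capitalized, POS tag, is noun, is verb, ...]
    print(labelled[0][1])  # labels for "cat"
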
From f57210c9047d6f5f49eb213346ebe2d21e5ab1f6 Mon Sep 17 00:00:00 2001
From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com>
Date: Fri, 2 Feb 2024 10:37:04 +0100
Subject: [PATCH 02/29] add explanation function

---
 src/delphi/eval/token_labelling.py | 100 ++++++++++++++++++++++-----
 1 file changed, 85 insertions(+), 15 deletions(-)

diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py
index ae15f639..8d1ce6d8 100644
--- a/src/delphi/eval/token_labelling.py
+++ b/src/delphi/eval/token_labelling.py
@@ -4,11 +4,12 @@
 Additionally, it can visualize the sentences and their part-of-speech (POS) tags.
 """
 
-from typing import List
+from pprint import pprint
+from typing import List, Optional
 
 import spacy  # pylint: disable=import-error
 from spacy.tokens import Doc  # pylint: disable=import-error
-
+from spacy.tokens import Token
 
 # make sure the english language model capabilities are installed by the equivalent of:
 # python -m spacy download en_core_web_sm
 # Should be run once, initially. Download only starts if not already installed.
 spacy.cli.download("en_core_web_sm", False, False, "-q")
 
 
 CATEGORIES = {
+    # custom categories
     "Starts with space": (lambda token: token.text.startswith(" ")),  # bool
     "Capitalized": (lambda token: token.text[0].isupper()),  # bool
-    "POS Tag": (lambda token: token.pos_),  # 'NOUN', 'VB', ..
+    # POS (part-of-speech) categories
+    # "POS Tag": (lambda token: token.pos_),  # 'NOUN', 'VB', ..
"Is Noun": (lambda token: token.pos_ == "NOUN"), # redundant - "Is Verb": (lambda token: "VB" in token.tag_), # redundant + "Is Pronoun": (lambda token: token.pos_ == "PRON"), # redundant "Is Adjective": (lambda token: token.pos_ == "ADJ"), # redundant + "Is Verb": (lambda token: "VB" in token.tag_), # redundant "Is Adverb": (lambda token: token.pos_ == "ADV"), # redundant - "Named Entity Type": ( - lambda token: token.ent_type_ if token.ent_type_ != "" else token.ent_type_ - ), # False, 'PERSON', 'ORG', 'GPE', .. - "Dependency": (lambda token: token.dep_), # 'nsubj', 'ROOT', 'dobj', .. + "Is Preposition": (lambda token: token.pos_ == "ADP"), # redundant + "Is Conjunction": (lambda token: token.pos_ == "CONJ"), # redundant + "Is Interjunction": (lambda token: token.pos_ == "INTJ"), # redundant + # dependency categories + # "Dependency": (lambda token: token.dep_), # 'nsubj', 'ROOT', 'dobj', .. + "Is Subject": (lambda token: token.dep_ == "nsubj"), + "Is Object": (lambda token: token.dep_ == "dobj"), + "Is Root": ( + lambda token: token.dep_ == "ROOT" + ), # root of the sentence (often a verb) + "Is auxiliary": (lambda token: token.dep_ == "aux"), # redundant + # Named entity recognition (NER) categories + # "Named Entity Type": (lambda token: token.ent_type_), # '', 'PERSON', 'ORG', 'GPE', .. + "Is Named Entity": (lambda token: token.ent_type_ != ""), } -def label_tokens(sentences: List, tokenized: bool=True, verbose: bool = False) -> List[List]: +def explain_token_labels(token: Optional[Token] = None) -> None: + """ + Prints the explanation of a specific token's labels or of ALL + possible labels (POS, dependency, NER, ...), if no token is provided. + + Parameters + ---------- + token : Optional[Token], optional + The token, whose labels should be explained. If None, all labels + possible labels are explained, by default None. + """ + if token is not None: + # get token labels + labels = label_single_token(token) + print(" Explanation of token labels ".center(45, "-")) + print("Token text:".ljust(20), token.text) + print("Token dependency:".ljust(20), spacy.glossary.explain(token.dep_)) + print("Token POS:".ljust(20), spacy.glossary.explain(token.pos_)) + print(" Token labels ".center(45, "-")) + for i, (label, value) in enumerate(zip(CATEGORIES.keys(), labels)): + print(f" {i:2} ", label.ljust(20), value) + + else: + glossary = spacy.glossary.GLOSSARY + print( + f"Explanation of all {len(glossary.keys())} token labels (POS, dependency, NER, ...):" + ) + for label, key in glossary.items(): + print(" ", label.ljust(10), key) + + +def label_single_token(token: Token) -> List: + """ + Labels a single token. + + Parameters + ---------- + token : Token + The token to be labelled. + + Returns + ------- + List + The labels of the token. + """ + assert isinstance(token, Token) + labels = list() # The list holding labels of a single token + for _, category_check in CATEGORIES.items(): + label = category_check(token) + labels.append(label) + return labels + + +def label_batch_token( + sentences: List, tokenized: bool = True, verbose: bool = False +) -> List[List]: """ - Labels tokens in a sentence. Takes the context of the token into account. + Labels tokens in a sentence batchwise. Takes the context of the token into + account for dependency labels (e.g. subject, object, ...). Parameters ---------- sentences : List A batch/list of sentences, each being a list of tokens. tokenized : bool, optional - Whether the sentences are already tokenized, by default True. 
From 32d498a9f7d885e1f2eae180ae1ab45d4dbab865 Mon Sep 17 00:00:00 2001
From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com>
Date: Fri, 2 Feb 2024 17:38:36 +0100
Subject: [PATCH 03/29] add notebook

---
 notebooks/token_labelling.ipynb | 586 ++++++++++++++++++++++++++++++++
 1 file changed, 586 insertions(+)
 create mode 100644 notebooks/token_labelling.ipynb

diff --git a/notebooks/token_labelling.ipynb b/notebooks/token_labelling.ipynb
new file mode 100644
index 00000000..cf4515ef
--- /dev/null
+++ b/notebooks/token_labelling.ipynb
@@ -0,0 +1,586 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# How to label tokens"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# autoreload\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import spacy\n",
+    "\n",
+    "import token_labelling"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We analyze a simple sentence and receive the respective tokens with their analyzed attributes. \n",
+    "The grammatical/linguistic analysis is done by a model provided by spaCy for the English language."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "This\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load the english model\n",
+    "nlp = spacy.load(\"en_core_web_sm\")\n",
+    "\n",
+    "# Create a Doc object from a given text\n",
+    "doc = nlp(\"This is a dummy sentence for testing.\")\n",
+    "\n",
+    "token = doc[0]\n",
+    "print(token)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's get the label for our custom token that we just printed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[False, True, False, True, False, False, False, False, False, False, True, False, False, False, False]\n"
+     ]
+    }
+   ],
+   "source": [
+    "label = token_labelling.label_single_token(token)\n",
+    "print(label)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's get an understanding of what the labels actually mean.\n",
+    "Use this function to receive an explanation for a single token."
+ ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------- Explanation of token labels --------\n", + "Token text: This\n", + "Token dependency: nominal subject\n", + "Token POS: pronoun\n", + "---------------- Token labels ---------------\n", + " 0 Starts with space False\n", + " 1 Capitalized True\n", + " 2 Is Noun False\n", + " 3 Is Pronoun True\n", + " 4 Is Adjective False\n", + " 5 Is Verb False\n", + " 6 Is Adverb False\n", + " 7 Is Preposition False\n", + " 8 Is Conjunction False\n", + " 9 Is Interjunction False\n", + " 10 Is Subject True\n", + " 11 Is Object False\n", + " 12 Is Root False\n", + " 13 Is auxiliary False\n", + " 14 Is Named Entity False\n" + ] + } + ], + "source": [ + "token_labelling.explain_token_labels(token)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you are interested in all the possible labels a token can have, that spaCy is capable of assigning, then call the same function but without any argument." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Explanation of all 302 token labels (POS, dependency, NER, ...):\n", + " ADJ adjective\n", + " ADP adposition\n", + " ADV adverb\n", + " AUX auxiliary\n", + " CONJ conjunction\n", + " CCONJ coordinating conjunction\n", + " DET determiner\n", + " INTJ interjection\n", + " NOUN noun\n", + " NUM numeral\n", + " PART particle\n", + " PRON pronoun\n", + " PROPN proper noun\n", + " PUNCT punctuation\n", + " SCONJ subordinating conjunction\n", + " SYM symbol\n", + " VERB verb\n", + " X other\n", + " EOL end of line\n", + " SPACE space\n", + " . 
punctuation mark, sentence closer\n", + " , punctuation mark, comma\n", + " -LRB- left round bracket\n", + " -RRB- right round bracket\n", + " `` opening quotation mark\n", + " \"\" closing quotation mark\n", + " '' closing quotation mark\n", + " : punctuation mark, colon or ellipsis\n", + " $ symbol, currency\n", + " # symbol, number sign\n", + " AFX affix\n", + " CC conjunction, coordinating\n", + " CD cardinal number\n", + " DT determiner\n", + " EX existential there\n", + " FW foreign word\n", + " HYPH punctuation mark, hyphen\n", + " IN conjunction, subordinating or preposition\n", + " JJ adjective (English), other noun-modifier (Chinese)\n", + " JJR adjective, comparative\n", + " JJS adjective, superlative\n", + " LS list item marker\n", + " MD verb, modal auxiliary\n", + " NIL missing tag\n", + " NN noun, singular or mass\n", + " NNP noun, proper singular\n", + " NNPS noun, proper plural\n", + " NNS noun, plural\n", + " PDT predeterminer\n", + " POS possessive ending\n", + " PRP pronoun, personal\n", + " PRP$ pronoun, possessive\n", + " RB adverb\n", + " RBR adverb, comparative\n", + " RBS adverb, superlative\n", + " RP adverb, particle\n", + " TO infinitival \"to\"\n", + " UH interjection\n", + " VB verb, base form\n", + " VBD verb, past tense\n", + " VBG verb, gerund or present participle\n", + " VBN verb, past participle\n", + " VBP verb, non-3rd person singular present\n", + " VBZ verb, 3rd person singular present\n", + " WDT wh-determiner\n", + " WP wh-pronoun, personal\n", + " WP$ wh-pronoun, possessive\n", + " WRB wh-adverb\n", + " SP space (English), sentence-final particle (Chinese)\n", + " ADD email\n", + " NFP superfluous punctuation\n", + " GW additional word in multi-word expression\n", + " XX unknown\n", + " BES auxiliary \"be\"\n", + " HVS forms of \"have\"\n", + " _SP whitespace\n", + " $( other sentence-internal punctuation mark\n", + " $, comma\n", + " $. 
sentence-final punctuation mark\n", + " ADJA adjective, attributive\n", + " ADJD adjective, adverbial or predicative\n", + " APPO postposition\n", + " APPR preposition; circumposition left\n", + " APPRART preposition with article\n", + " APZR circumposition right\n", + " ART definite or indefinite article\n", + " CARD cardinal number\n", + " FM foreign language material\n", + " ITJ interjection\n", + " KOKOM comparative conjunction\n", + " KON coordinate conjunction\n", + " KOUI subordinate conjunction with \"zu\" and infinitive\n", + " KOUS subordinate conjunction with sentence\n", + " NE proper noun\n", + " NNE proper noun\n", + " PAV pronominal adverb\n", + " PROAV pronominal adverb\n", + " PDAT attributive demonstrative pronoun\n", + " PDS substituting demonstrative pronoun\n", + " PIAT attributive indefinite pronoun without determiner\n", + " PIDAT attributive indefinite pronoun with determiner\n", + " PIS substituting indefinite pronoun\n", + " PPER non-reflexive personal pronoun\n", + " PPOSAT attributive possessive pronoun\n", + " PPOSS substituting possessive pronoun\n", + " PRELAT attributive relative pronoun\n", + " PRELS substituting relative pronoun\n", + " PRF reflexive personal pronoun\n", + " PTKA particle with adjective or adverb\n", + " PTKANT answer particle\n", + " PTKNEG negative particle\n", + " PTKVZ separable verbal particle\n", + " PTKZU \"zu\" before infinitive\n", + " PWAT attributive interrogative pronoun\n", + " PWAV adverbial interrogative or relative pronoun\n", + " PWS substituting interrogative pronoun\n", + " TRUNC word remnant\n", + " VAFIN finite verb, auxiliary\n", + " VAIMP imperative, auxiliary\n", + " VAINF infinitive, auxiliary\n", + " VAPP perfect participle, auxiliary\n", + " VMFIN finite verb, modal\n", + " VMINF infinitive, modal\n", + " VMPP perfect participle, modal\n", + " VVFIN finite verb, full\n", + " VVIMP imperative, full\n", + " VVINF infinitive, full\n", + " VVIZU infinitive with \"zu\", full\n", + " VVPP perfect participle, full\n", + " XY non-word containing non-letter\n", + " AD adverb\n", + " AS aspect marker\n", + " BA 把 in ba-construction\n", + " CS subordinating conjunction\n", + " DEC 的 in a relative clause\n", + " DEG associative 的\n", + " DER 得 in V-de const. 
and V-de-R\n", + " DEV 地 before VP\n", + " ETC for words 等, 等等\n", + " IJ interjection\n", + " LB 被 in long bei-const\n", + " LC localizer\n", + " M measure word\n", + " MSP other particle\n", + " NR proper noun\n", + " NT temporal noun\n", + " OD ordinal number\n", + " ON onomatopoeia\n", + " P preposition excluding 把 and 被\n", + " PN pronoun\n", + " PU punctuation\n", + " SB 被 in short bei-const\n", + " VA predicative adjective\n", + " VC 是 (copula)\n", + " VE 有 as the main verb\n", + " VV other verb\n", + " NP noun phrase\n", + " PP prepositional phrase\n", + " VP verb phrase\n", + " ADVP adverb phrase\n", + " ADJP adjective phrase\n", + " SBAR subordinating conjunction\n", + " PRT particle\n", + " PNP prepositional noun phrase\n", + " acl clausal modifier of noun (adjectival clause)\n", + " acomp adjectival complement\n", + " advcl adverbial clause modifier\n", + " advmod adverbial modifier\n", + " agent agent\n", + " amod adjectival modifier\n", + " appos appositional modifier\n", + " attr attribute\n", + " aux auxiliary\n", + " auxpass auxiliary (passive)\n", + " case case marking\n", + " cc coordinating conjunction\n", + " ccomp clausal complement\n", + " clf classifier\n", + " complm complementizer\n", + " compound compound\n", + " conj conjunct\n", + " cop copula\n", + " csubj clausal subject\n", + " csubjpass clausal subject (passive)\n", + " dative dative\n", + " dep unclassified dependent\n", + " det determiner\n", + " discourse discourse element\n", + " dislocated dislocated elements\n", + " dobj direct object\n", + " expl expletive\n", + " fixed fixed multiword expression\n", + " flat flat multiword expression\n", + " goeswith goes with\n", + " hmod modifier in hyphenation\n", + " hyph hyphen\n", + " infmod infinitival modifier\n", + " intj interjection\n", + " iobj indirect object\n", + " list list\n", + " mark marker\n", + " meta meta modifier\n", + " neg negation modifier\n", + " nmod modifier of nominal\n", + " nn noun compound modifier\n", + " npadvmod noun phrase as adverbial modifier\n", + " nsubj nominal subject\n", + " nsubjpass nominal subject (passive)\n", + " nounmod modifier of nominal\n", + " npmod noun phrase as adverbial modifier\n", + " num number modifier\n", + " number number compound modifier\n", + " nummod numeric modifier\n", + " oprd object predicate\n", + " obj object\n", + " obl oblique nominal\n", + " orphan orphan\n", + " parataxis parataxis\n", + " partmod participal modifier\n", + " pcomp complement of preposition\n", + " pobj object of preposition\n", + " poss possession modifier\n", + " possessive possessive modifier\n", + " preconj pre-correlative conjunction\n", + " prep prepositional modifier\n", + " prt particle\n", + " punct punctuation\n", + " quantmod modifier of quantifier\n", + " rcmod relative clause modifier\n", + " relcl relative clause modifier\n", + " reparandum overridden disfluency\n", + " root root\n", + " ROOT root\n", + " vocative vocative\n", + " xcomp open clausal complement\n", + " ac adpositional case marker\n", + " adc adjective component\n", + " ag genitive attribute\n", + " ams measure argument of adjective\n", + " app apposition\n", + " avc adverbial phrase component\n", + " cd coordinating conjunction\n", + " cj conjunct\n", + " cm comparative conjunction\n", + " cp complementizer\n", + " cvc collocational verb construction\n", + " da dative\n", + " dh discourse-level head\n", + " dm discourse marker\n", + " ep expletive es\n", + " hd head\n", + " ju junctor\n", + " mnr postnominal modifier\n", + " mo modifier\n", + " 
ng negation\n", + " nk noun kernel element\n", + " nmc numerical component\n", + " oa accusative object\n", + " oc clausal object\n", + " og genitive object\n", + " op prepositional object\n", + " par parenthetical element\n", + " pd predicate\n", + " pg phrasal genitive\n", + " ph placeholder\n", + " pm morphological particle\n", + " pnc proper noun component\n", + " rc relative clause\n", + " re repeated element\n", + " rs reported speech\n", + " sb subject\n", + " sbp passivized subject (PP)\n", + " sp subject or predicate\n", + " svp separable verb prefix\n", + " uc unit component\n", + " vo vocative\n", + " PERSON People, including fictional\n", + " NORP Nationalities or religious or political groups\n", + " FACILITY Buildings, airports, highways, bridges, etc.\n", + " FAC Buildings, airports, highways, bridges, etc.\n", + " ORG Companies, agencies, institutions, etc.\n", + " GPE Countries, cities, states\n", + " LOC Non-GPE locations, mountain ranges, bodies of water\n", + " PRODUCT Objects, vehicles, foods, etc. (not services)\n", + " EVENT Named hurricanes, battles, wars, sports events, etc.\n", + " WORK_OF_ART Titles of books, songs, etc.\n", + " LAW Named documents made into laws.\n", + " LANGUAGE Any named language\n", + " DATE Absolute or relative dates or periods\n", + " TIME Times smaller than a day\n", + " PERCENT Percentage, including \"%\"\n", + " MONEY Monetary values, including unit\n", + " QUANTITY Measurements, as of weight or distance\n", + " ORDINAL \"first\", \"second\", etc.\n", + " CARDINAL Numerals that do not fall under another type\n", + " PER Named person or family.\n", + " MISC Miscellaneous entities, e.g. events, nationalities, products or works of art\n", + " EVT Festivals, cultural events, sports events, weather phenomena, wars, etc.\n", + " PROD Product, i.e. artificially produced entities including speeches, radio shows, programming languages, contracts, laws and ideas\n", + " DRV Words (and phrases?) that are dervied from a name, but not a name in themselves, e.g. 'Oslo-mannen' ('the man from Oslo')\n", + " GPE_LOC Geo-political entity, with a locative sense, e.g. 'John lives in Spain'\n", + " GPE_ORG Geo-political entity, with an organisation sense, e.g. 'Spain declined to meet with Belgium'\n" + ] + } + ], + "source": [ + "token_labelling.explain_token_labels()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, let us analyze a batch of sentences and have them labelled.\n", + "> In this example the input sentences are not yet tokenized, so spaCy uses its internal tokenizer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Token: This\n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | True | False | True | False | False | False | False | False | False | True | False | False | False | False \n", + "---\n", + "Token: is\n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | False | False | False | True | False | False | False | False | False | False | True | False | False \n", + "---\n", + "Token: a\n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", + "---\n", + "Token: sentence\n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | True | False | False | False | False | False | False | False | False | False | False | False | False \n", + "---\n", + "Token: .\n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", + "---\n", + "\n", + "\n", + "5\n", + "[[False, True, False, True, False, False, False, False, False, False, True, False, False, False, False], [False, False, False, False, False, True, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]]\n" + ] + } + ], + "source": [ + "sentences = [\n", + " \"This is a sentence.\"\n", + "]\n", + "labels = token_labelling.label_batch_token(sentences, tokenized=False, verbose=True)\n", + "\n", + "print(len(labels[0]))\n", + "print(labels[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now with our own tokenization. E.g. the one from our TinyStories models." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Token: This \n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | True | True | False | False | False | False | False | False | False | False | False | True | False | False \n", + "---\n", + "Token: is \n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | False | False | False | False | True | False | False | False | False | False | False | False | False \n", + "---\n", + "Token: a \n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | False | False | True | False | False | False | False | False | False | False | False | False | False \n", + "---\n", + "Token: sentence\n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | True | False | False | False | False | False | False | False | False | True | False | False | False \n", + "---\n", + "Token: .\n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", + "---\n", + "\n", + "\n", + "5\n", + "[[False, True, True, False, False, False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, True, False, False, False, False, False, False, False, False], [False, False, False, False, True, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, True, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]]\n" + ] + } + ], + "source": [ + "sentences = [\n", + " [\"This \", \"is \", \"a \", \"sentence\", \".\"]\n", + "]\n", + "labels = token_labelling.label_batch_token(sentences, tokenized=True, verbose=False)\n", + "\n", + "print(len(labels[0]))\n", + "print(labels[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv_tinyevals", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From e9f5c117991e853e70706351c22f7bb763cc198f Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Thu, 8 Feb 2024 21:27:27 +0100 Subject: [PATCH 
04/29] test

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 65b457a4..f62b329b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,5 @@ black==23.12.1
 jaxtyping==0.2.25
 beartype==0.16.4
 pre-commit==3.6.0
-isort==5.13.2
\ No newline at end of file
+isort==5.13.2
+spacy
\ No newline at end of file
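Because the new `spacy` requirement does not bundle the English model, a guarded one-time download keeps fresh environments working. This is a sketch of the same idea the script implements via `spacy.cli.download`:

    import spacy

    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        # model missing: fetch it once, then load normally
        spacy.cli.download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")
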
- "Is Subject": (lambda token: token.dep_ == "nsubj"), - "Is Object": (lambda token: token.dep_ == "dobj"), - "Is Root": ( - lambda token: token.dep_ == "ROOT" - ), # root of the sentence (often a verb) - "Is auxiliary": (lambda token: token.dep_ == "aux"), # redundant - # Named entity recognition (NER) categories + # --- dependency categories --- + # -> "Dependency": (lambda token: token.dep_), # 'nsubj', 'ROOT', 'dobj', .. + # "Is Subject": (lambda token: token.dep_ == "nsubj"), + # "Is Object": (lambda token: token.dep_ == "dobj"), + # "Is Root": ( + # lambda token: token.dep_ == "ROOT" + # ), # root of the sentence (often a verb) + # "Is auxiliary": (lambda token: token.dep_ == "aux"), # redundant + # --- Named entity recognition (NER) categories --- # "Named Entity Type": (lambda token: token.ent_type_), # '', 'PERSON', 'ORG', 'GPE', .. "Is Named Entity": (lambda token: token.ent_type_ != ""), } @@ -64,8 +64,8 @@ def explain_token_labels(token: Optional[Token] = None) -> None: print("Token dependency:".ljust(20), spacy.glossary.explain(token.dep_)) print("Token POS:".ljust(20), spacy.glossary.explain(token.pos_)) print(" Token labels ".center(45, "-")) - for i, (label, value) in enumerate(zip(CATEGORIES.keys(), labels)): - print(f" {i:2} ", label.ljust(20), value) + for i, (label_name, value) in enumerate(labels.items()): + print(f" {i:2} ", label_name.ljust(20), value) else: glossary = spacy.glossary.GLOSSARY @@ -76,7 +76,7 @@ def explain_token_labels(token: Optional[Token] = None) -> None: print(" ", label.ljust(10), key) -def label_single_token(token: Token) -> List: +def label_single_token(token: Token) -> dict[str, bool]: """ Labels a single token. @@ -87,27 +87,27 @@ def label_single_token(token: Token) -> List: Returns ------- - List - The labels of the token. + dict[str, bool] + Returns a dictionary with the token's labels as keys and their + corresponding boolean values. """ assert isinstance(token, Token) - labels = list() # The list holding labels of a single token - for _, category_check in CATEGORIES.items(): - label = category_check(token) - labels.append(label) + labels = dict() # The list holding labels of a single token + for label_name, category_check in TOKEN_LABELS.items(): + labels[label_name] = category_check(token) return labels def label_batch_token( - sentences: List, tokenized: bool = True, verbose: bool = False -) -> List[List]: + sentences: list, tokenized: bool = True, verbose: bool = False +) -> list[list]: """ Labels tokens in a sentence batchwise. Takes the context of the token into account for dependency labels (e.g. subject, object, ...). Parameters ---------- - sentences : List + sentences : list A batch/list of sentences, each being a list of tokens. tokenized : bool, optional Whether the sentences are already tokenized, by default True. If the sentences @@ -117,7 +117,7 @@ def label_batch_token( Returns ------- - List[List] + list[list] Returns a list of sentences. Each sentence contains a list of its corresponding token length where each entry provides the labels/categories for the token. 
     """
     assert isinstance(sentences, list)
     # Load english language model
     nlp = spacy.load("en_core_web_sm")
-    # labelled tokens, List holding sentences holding tokens holding corresponding token labels
+    # labelled tokens, list holding sentences holding tokens holding corresponding token labels
     labelled_sentences = list()
 
     for sentence in sentences:
@@ -140,11 +140,11 @@ def label_batch_token(
             # sentence is a single string
             doc = nlp(sentence)
 
-        labelled_tokens = list()  # List holding labels for all tokens of sentence
+        labelled_tokens = list()  # list holding labels for all tokens of sentence
 
         for token in doc:
             labels = list()  # The list holding labels of a single token
-            for _, category_check in CATEGORIES.items():
+            for _, category_check in TOKEN_LABELS.items():
                 label = category_check(token)
                 labels.append(label)
             # add the current token's labels to the list
@@ -153,10 +153,10 @@ def label_batch_token(
             # print the token and its labels to console
             if verbose is True:
                 print(f"Token: {token.text}")
-                print(" | ".join(list(CATEGORIES.keys())))
+                print(" | ".join(list(TOKEN_LABELS.keys())))
                 printable = [
                     str(l).ljust(len(cname))
-                    for l, cname in zip(labels, CATEGORIES.keys())
+                    for l, cname in zip(labels, TOKEN_LABELS.keys())
                 ]
                 printable = " | ".join(printable)
                 print(printable)
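With this patch, `label_single_token` returns a dict keyed by label name instead of a bare list, which makes filtering for active categories straightforward. A sketch under the same installation assumptions as above (the sentence is illustrative):

    import spacy
    from delphi.eval.token_labelling import label_single_token

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Anna reads books.")

    labels = label_single_token(doc[0])  # {'Starts with space': False, 'Capitalized': True, ...}
    active = [name for name, on in labels.items() if on]
    print(active)  # e.g. ['Capitalized', 'Is Named Entity']
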
From ab3be1955a1e3ba65037b7ad4055d41e3252591c Mon Sep 17 00:00:00 2001
From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com>
Date: Thu, 8 Feb 2024 22:14:07 +0100
Subject: [PATCH 06/29] small improvements

---
 src/delphi/eval/token_labelling.py | 63 +++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 19 deletions(-)

diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py
index c21e8040..c64e6b01 100644
--- a/src/delphi/eval/token_labelling.py
+++ b/src/delphi/eval/token_labelling.py
@@ -4,8 +4,7 @@
 Additionally, it can visualize the sentences and their part-of-speech (POS) tags.
 """
 
-from pprint import pprint
-from typing import Callable, Optional
+from typing import Callable, Optional, Union
 
 import spacy  # pylint: disable=import-error
 from spacy.tokens import Doc  # pylint: disable=import-error
 from spacy.tokens import Token
@@ -45,7 +44,7 @@
 }
 
 
-def explain_token_labels(token: Optional[Token] = None) -> None:
+def explain_Token_labels(token: Optional[Token] = None) -> None:
     """
     Prints the explanation of a specific token's labels or of ALL
     possible labels (POS, dependency, NER, ...), if no token is provided.
@@ -58,7 +57,7 @@
     """
     if token is not None:
         # get token labels
-        labels = label_single_token(token)
+        labels = label_single_Token(token)
         print(" Explanation of token labels ".center(45, "-"))
@@ -76,9 +75,10 @@
         print("   ", label.ljust(10), key)
 
 
-def label_single_token(token: Token) -> dict[str, bool]:
+def label_single_Token(token: Token) -> dict[str, bool]:
     """
-    Labels a single token.
+    Labels a single token. A token that has been analyzed by the spaCy
+    library.
 
     Parameters
     ----------
@@ -98,7 +98,29 @@
     return labels
 
 
+def label_sentence(tokens: Union[Doc, list[Token]]) -> list[dict[str, bool]]:
+    """
+    Labels spaCy Tokens in a sentence. Takes the context of the token into account
+    for dependency labels (e.g. subject, object, ...), IF dependency labels are turned on.
+
+    Parameters
+    ----------
+    tokens : list[Token]
+        A list of tokens.
+
+    Returns
+    -------
+    list[dict[str, bool]]
+        Returns a list of the tokens' labels.
+    """
+    labelled_tokens = list()  # list holding labels for all tokens of sentence
+    for token in tokens:
+        labels = label_single_Token(token)
+        labelled_tokens.append(labels)
+    return labelled_tokens
+
+
-def label_batch_token(
+def label_batch_sentences(
     sentences: list, tokenized: bool = True, verbose: bool = False
 ) -> list[list]:
     """
     Labels tokens in a sentence batchwise. Takes the context of the token into
     account for dependency labels (e.g. subject, object, ...).
 
     Parameters
     ----------
     sentences : list
         A batch/list of sentences, each being a list of tokens.
     tokenized : bool, optional
         Whether the sentences are already tokenized, by default True. If the sentences
         are full strings and not lists of tokens, then set to False.
     verbose : bool, optional
         Whether to print the tokens and their labels to the console, by default False.
 
     Returns
     -------
-    list[list]
+    list[list[dict[str, bool]]]
         Returns a list of sentences. Each sentence contains a list of the same
         length as its tokens, where each entry holds the labels/categories for
         the corresponding token. Sentence -> Token -> Labels
     """
     assert isinstance(sentences, list)
     # Load english language model
     nlp = spacy.load("en_core_web_sm")
     # labelled tokens, list holding sentences holding tokens holding corresponding token labels
-    labelled_sentences = list()
+    labelled_sentences: list[list[dict[str, bool]]] = list()
 
+    # go through each sentence in the batch
     for sentence in sentences:
         if tokenized:
             # sentence is a list of tokens
             doc = Doc(nlp.vocab, words=sentence)
@@ -140,22 +164,24 @@
             # sentence is a single string
             doc = nlp(sentence)
 
         labelled_tokens = list()  # list holding labels for all tokens of sentence
+        labelled_tokens = label_sentence(doc)
+
+        # go through each token in the sentence
+        for token, labelled_token in zip(doc, labelled_tokens):
+            # labelled_token = label_single_Token(token)
+            # labels = list()  # The list holding labels of a single token
+            # for _, category_check in TOKEN_LABELS.items():
+            #     label = category_check(token)
+            #     labels.append(label)
             # add the current token's labels to the list
-            labelled_tokens.append(labels)
+            # labelled_tokens.append(labelled_token)
 
             # print the token and its labels to console
             if verbose is True:
-                print(f"Token: {token.text}")
+                print(f"Token: {token}")
                 print(" | ".join(list(TOKEN_LABELS.keys())))
                 printable = [
-                    str(l).ljust(len(cname))
-                    for l, cname in zip(labels, TOKEN_LABELS.keys())
+                    str(l).ljust(len(name)) for name, l in labelled_token.items()
                 ]
                 printable = " | ".join(printable)
                 print(printable)
From 3c0894750ac70e34161dcf23eee1c68e09e3d657 Mon Sep 17 00:00:00 2001
From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com>
Date: Thu, 8 Feb 2024 23:40:42 +0100
Subject: [PATCH 07/29] improve notebook explanation

---
 notebooks/token_labelling.ipynb    | 575 ++++++++++------------------
 requirements.txt                   |   1 +
 src/delphi/eval/token_labelling.py |   6 +-
 3 files changed, 205 insertions(+), 377 deletions(-)

diff --git a/notebooks/token_labelling.ipynb b/notebooks/token_labelling.ipynb
index cf4515ef..2e200922 100644
--- a/notebooks/token_labelling.ipynb
+++ b/notebooks/token_labelling.ipynb
@@ -4,22 +4,43 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# How to label tokens"
+    "# Giving tokens a label - How to categorize tokens\n",
+    "\n",
+    "\n",
+    "The first part of this notebook explains how to label tokens and how the functions work.\n",
+    "\n",
+    "The second part shows how all the tokens used for our delphi language models are labelled.\n",
+    "\n",
+    "# 1) How to use the token labelling functions"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 90,
    "metadata": {},
"outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "# autoreload\n", "%load_ext autoreload\n", "%autoreload 2\n", "\n", + "from pprint import pprint \n", + "\n", "import spacy\n", + "from tqdm.auto import tqdm\n", "\n", - "import token_labelling" + "import delphi\n", + "\n", + "# from delphi.eval import token_labelling" ] }, { @@ -32,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -63,20 +84,34 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[False, True, False, True, False, False, False, False, False, False, True, False, False, False, False]\n" + "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the package via spacy.load('en_core_web_sm')\n", + "{'Capitalized': True,\n", + " 'Is Adjective': False,\n", + " 'Is Adverb': False,\n", + " 'Is Conjunction': False,\n", + " 'Is Interjunction': False,\n", + " 'Is Named Entity': False,\n", + " 'Is Noun': False,\n", + " 'Is Preposition': False,\n", + " 'Is Pronoun': True,\n", + " 'Is Verb': False,\n", + " 'Starts with space': False}\n" ] } ], "source": [ + "from delphi.eval import token_labelling\n", + "\n", "label = token_labelling.label_single_token(token)\n", - "print(label)" + "pprint(label)" ] }, { @@ -89,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -111,11 +146,7 @@ " 7 Is Preposition False\n", " 8 Is Conjunction False\n", " 9 Is Interjunction False\n", - " 10 Is Subject True\n", - " 11 Is Object False\n", - " 12 Is Root False\n", - " 13 Is auxiliary False\n", - " 14 Is Named Entity False\n" + " 10 Is Named Entity False\n" ] } ], @@ -127,339 +158,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "If you are interested in all the possible labels a token can have, that spaCy is capable of assigning, then call the same function but without any argument." - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Explanation of all 302 token labels (POS, dependency, NER, ...):\n", - " ADJ adjective\n", - " ADP adposition\n", - " ADV adverb\n", - " AUX auxiliary\n", - " CONJ conjunction\n", - " CCONJ coordinating conjunction\n", - " DET determiner\n", - " INTJ interjection\n", - " NOUN noun\n", - " NUM numeral\n", - " PART particle\n", - " PRON pronoun\n", - " PROPN proper noun\n", - " PUNCT punctuation\n", - " SCONJ subordinating conjunction\n", - " SYM symbol\n", - " VERB verb\n", - " X other\n", - " EOL end of line\n", - " SPACE space\n", - " . 
punctuation mark, sentence closer\n", - " , punctuation mark, comma\n", - " -LRB- left round bracket\n", - " -RRB- right round bracket\n", - " `` opening quotation mark\n", - " \"\" closing quotation mark\n", - " '' closing quotation mark\n", - " : punctuation mark, colon or ellipsis\n", - " $ symbol, currency\n", - " # symbol, number sign\n", - " AFX affix\n", - " CC conjunction, coordinating\n", - " CD cardinal number\n", - " DT determiner\n", - " EX existential there\n", - " FW foreign word\n", - " HYPH punctuation mark, hyphen\n", - " IN conjunction, subordinating or preposition\n", - " JJ adjective (English), other noun-modifier (Chinese)\n", - " JJR adjective, comparative\n", - " JJS adjective, superlative\n", - " LS list item marker\n", - " MD verb, modal auxiliary\n", - " NIL missing tag\n", - " NN noun, singular or mass\n", - " NNP noun, proper singular\n", - " NNPS noun, proper plural\n", - " NNS noun, plural\n", - " PDT predeterminer\n", - " POS possessive ending\n", - " PRP pronoun, personal\n", - " PRP$ pronoun, possessive\n", - " RB adverb\n", - " RBR adverb, comparative\n", - " RBS adverb, superlative\n", - " RP adverb, particle\n", - " TO infinitival \"to\"\n", - " UH interjection\n", - " VB verb, base form\n", - " VBD verb, past tense\n", - " VBG verb, gerund or present participle\n", - " VBN verb, past participle\n", - " VBP verb, non-3rd person singular present\n", - " VBZ verb, 3rd person singular present\n", - " WDT wh-determiner\n", - " WP wh-pronoun, personal\n", - " WP$ wh-pronoun, possessive\n", - " WRB wh-adverb\n", - " SP space (English), sentence-final particle (Chinese)\n", - " ADD email\n", - " NFP superfluous punctuation\n", - " GW additional word in multi-word expression\n", - " XX unknown\n", - " BES auxiliary \"be\"\n", - " HVS forms of \"have\"\n", - " _SP whitespace\n", - " $( other sentence-internal punctuation mark\n", - " $, comma\n", - " $. 
sentence-final punctuation mark\n", - " ADJA adjective, attributive\n", - " ADJD adjective, adverbial or predicative\n", - " APPO postposition\n", - " APPR preposition; circumposition left\n", - " APPRART preposition with article\n", - " APZR circumposition right\n", - " ART definite or indefinite article\n", - " CARD cardinal number\n", - " FM foreign language material\n", - " ITJ interjection\n", - " KOKOM comparative conjunction\n", - " KON coordinate conjunction\n", - " KOUI subordinate conjunction with \"zu\" and infinitive\n", - " KOUS subordinate conjunction with sentence\n", - " NE proper noun\n", - " NNE proper noun\n", - " PAV pronominal adverb\n", - " PROAV pronominal adverb\n", - " PDAT attributive demonstrative pronoun\n", - " PDS substituting demonstrative pronoun\n", - " PIAT attributive indefinite pronoun without determiner\n", - " PIDAT attributive indefinite pronoun with determiner\n", - " PIS substituting indefinite pronoun\n", - " PPER non-reflexive personal pronoun\n", - " PPOSAT attributive possessive pronoun\n", - " PPOSS substituting possessive pronoun\n", - " PRELAT attributive relative pronoun\n", - " PRELS substituting relative pronoun\n", - " PRF reflexive personal pronoun\n", - " PTKA particle with adjective or adverb\n", - " PTKANT answer particle\n", - " PTKNEG negative particle\n", - " PTKVZ separable verbal particle\n", - " PTKZU \"zu\" before infinitive\n", - " PWAT attributive interrogative pronoun\n", - " PWAV adverbial interrogative or relative pronoun\n", - " PWS substituting interrogative pronoun\n", - " TRUNC word remnant\n", - " VAFIN finite verb, auxiliary\n", - " VAIMP imperative, auxiliary\n", - " VAINF infinitive, auxiliary\n", - " VAPP perfect participle, auxiliary\n", - " VMFIN finite verb, modal\n", - " VMINF infinitive, modal\n", - " VMPP perfect participle, modal\n", - " VVFIN finite verb, full\n", - " VVIMP imperative, full\n", - " VVINF infinitive, full\n", - " VVIZU infinitive with \"zu\", full\n", - " VVPP perfect participle, full\n", - " XY non-word containing non-letter\n", - " AD adverb\n", - " AS aspect marker\n", - " BA 把 in ba-construction\n", - " CS subordinating conjunction\n", - " DEC 的 in a relative clause\n", - " DEG associative 的\n", - " DER 得 in V-de const. 
and V-de-R\n", - " DEV 地 before VP\n", - " ETC for words 等, 等等\n", - " IJ interjection\n", - " LB 被 in long bei-const\n", - " LC localizer\n", - " M measure word\n", - " MSP other particle\n", - " NR proper noun\n", - " NT temporal noun\n", - " OD ordinal number\n", - " ON onomatopoeia\n", - " P preposition excluding 把 and 被\n", - " PN pronoun\n", - " PU punctuation\n", - " SB 被 in short bei-const\n", - " VA predicative adjective\n", - " VC 是 (copula)\n", - " VE 有 as the main verb\n", - " VV other verb\n", - " NP noun phrase\n", - " PP prepositional phrase\n", - " VP verb phrase\n", - " ADVP adverb phrase\n", - " ADJP adjective phrase\n", - " SBAR subordinating conjunction\n", - " PRT particle\n", - " PNP prepositional noun phrase\n", - " acl clausal modifier of noun (adjectival clause)\n", - " acomp adjectival complement\n", - " advcl adverbial clause modifier\n", - " advmod adverbial modifier\n", - " agent agent\n", - " amod adjectival modifier\n", - " appos appositional modifier\n", - " attr attribute\n", - " aux auxiliary\n", - " auxpass auxiliary (passive)\n", - " case case marking\n", - " cc coordinating conjunction\n", - " ccomp clausal complement\n", - " clf classifier\n", - " complm complementizer\n", - " compound compound\n", - " conj conjunct\n", - " cop copula\n", - " csubj clausal subject\n", - " csubjpass clausal subject (passive)\n", - " dative dative\n", - " dep unclassified dependent\n", - " det determiner\n", - " discourse discourse element\n", - " dislocated dislocated elements\n", - " dobj direct object\n", - " expl expletive\n", - " fixed fixed multiword expression\n", - " flat flat multiword expression\n", - " goeswith goes with\n", - " hmod modifier in hyphenation\n", - " hyph hyphen\n", - " infmod infinitival modifier\n", - " intj interjection\n", - " iobj indirect object\n", - " list list\n", - " mark marker\n", - " meta meta modifier\n", - " neg negation modifier\n", - " nmod modifier of nominal\n", - " nn noun compound modifier\n", - " npadvmod noun phrase as adverbial modifier\n", - " nsubj nominal subject\n", - " nsubjpass nominal subject (passive)\n", - " nounmod modifier of nominal\n", - " npmod noun phrase as adverbial modifier\n", - " num number modifier\n", - " number number compound modifier\n", - " nummod numeric modifier\n", - " oprd object predicate\n", - " obj object\n", - " obl oblique nominal\n", - " orphan orphan\n", - " parataxis parataxis\n", - " partmod participal modifier\n", - " pcomp complement of preposition\n", - " pobj object of preposition\n", - " poss possession modifier\n", - " possessive possessive modifier\n", - " preconj pre-correlative conjunction\n", - " prep prepositional modifier\n", - " prt particle\n", - " punct punctuation\n", - " quantmod modifier of quantifier\n", - " rcmod relative clause modifier\n", - " relcl relative clause modifier\n", - " reparandum overridden disfluency\n", - " root root\n", - " ROOT root\n", - " vocative vocative\n", - " xcomp open clausal complement\n", - " ac adpositional case marker\n", - " adc adjective component\n", - " ag genitive attribute\n", - " ams measure argument of adjective\n", - " app apposition\n", - " avc adverbial phrase component\n", - " cd coordinating conjunction\n", - " cj conjunct\n", - " cm comparative conjunction\n", - " cp complementizer\n", - " cvc collocational verb construction\n", - " da dative\n", - " dh discourse-level head\n", - " dm discourse marker\n", - " ep expletive es\n", - " hd head\n", - " ju junctor\n", - " mnr postnominal modifier\n", - " mo modifier\n", - " 
ng negation\n", - " nk noun kernel element\n", - " nmc numerical component\n", - " oa accusative object\n", - " oc clausal object\n", - " og genitive object\n", - " op prepositional object\n", - " par parenthetical element\n", - " pd predicate\n", - " pg phrasal genitive\n", - " ph placeholder\n", - " pm morphological particle\n", - " pnc proper noun component\n", - " rc relative clause\n", - " re repeated element\n", - " rs reported speech\n", - " sb subject\n", - " sbp passivized subject (PP)\n", - " sp subject or predicate\n", - " svp separable verb prefix\n", - " uc unit component\n", - " vo vocative\n", - " PERSON People, including fictional\n", - " NORP Nationalities or religious or political groups\n", - " FACILITY Buildings, airports, highways, bridges, etc.\n", - " FAC Buildings, airports, highways, bridges, etc.\n", - " ORG Companies, agencies, institutions, etc.\n", - " GPE Countries, cities, states\n", - " LOC Non-GPE locations, mountain ranges, bodies of water\n", - " PRODUCT Objects, vehicles, foods, etc. (not services)\n", - " EVENT Named hurricanes, battles, wars, sports events, etc.\n", - " WORK_OF_ART Titles of books, songs, etc.\n", - " LAW Named documents made into laws.\n", - " LANGUAGE Any named language\n", - " DATE Absolute or relative dates or periods\n", - " TIME Times smaller than a day\n", - " PERCENT Percentage, including \"%\"\n", - " MONEY Monetary values, including unit\n", - " QUANTITY Measurements, as of weight or distance\n", - " ORDINAL \"first\", \"second\", etc.\n", - " CARDINAL Numerals that do not fall under another type\n", - " PER Named person or family.\n", - " MISC Miscellaneous entities, e.g. events, nationalities, products or works of art\n", - " EVT Festivals, cultural events, sports events, weather phenomena, wars, etc.\n", - " PROD Product, i.e. artificially produced entities including speeches, radio shows, programming languages, contracts, laws and ideas\n", - " DRV Words (and phrases?) that are dervied from a name, but not a name in themselves, e.g. 'Oslo-mannen' ('the man from Oslo')\n", - " GPE_LOC Geo-political entity, with a locative sense, e.g. 'John lives in Spain'\n", - " GPE_ORG Geo-political entity, with an organisation sense, e.g. 'Spain declined to meet with Belgium'\n" - ] - } - ], - "source": [ - "token_labelling.explain_token_labels()" + "If you are interested in all the possible labels a token can have, that spaCy is capable of assigning, then call the same function but without any argument:\n", + "```Python\n", + ">>> token_labelling.explain_token_labels()\n", + "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "### Batched token labelling\n", "Next, let us analyze a batch of sentences and have them labelled.\n", "> In this example the input sentences are not yet tokenized, so spaCy uses its internal tokenizer." 
] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -467,29 +183,29 @@ "output_type": "stream", "text": [ "Token: This\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | True | False | True | False | False | False | False | False | False | True | False | False | False | False \n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", + "False | True | False | True | False | False | False | False | False | False | False \n", "---\n", "Token: is\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | False | False | False | True | False | False | False | False | False | False | True | False | False \n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", + "False | False | False | False | False | True | False | False | False | False | False \n", "---\n", "Token: a\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", + "False | False | False | False | False | False | False | False | False | False | False \n", "---\n", "Token: sentence\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | True | False | False | False | False | False | False | False | False | False | False | False | False \n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", + "False | False | True | False | False | False | False | False | False | False | False \n", "---\n", "Token: .\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", + "False | False | False | False | False | False | False | False | False | False | False \n", "---\n", "\n", "\n", "5\n", - "[[False, True, False, True, False, False, False, False, False, False, True, False, False, False, False], [False, False, False, False, False, True, False, False, False, False, False, False, True, 
False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]]\n" + "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': False, 'Is Pronoun': True, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': True, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" ] } ], @@ -497,7 +213,7 @@ "sentences = [\n", " \"This is a sentence.\"\n", "]\n", - "labels = token_labelling.label_batch_token(sentences, tokenized=False, verbose=True)\n", + "labels = token_labelling.label_batch_sentences(sentences, tokenized=False, verbose=True)\n", "\n", "print(len(labels[0]))\n", "print(labels[0])" @@ -512,37 +228,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Token: This \n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | True | True | False | False | False | False | False | False | False | False | False | True | False | False \n", - "---\n", - "Token: is \n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | False | False | False | False | True | False | False | False | False | False | False | False | False \n", - "---\n", - "Token: a \n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | False | False | True | False | False | False | False | False | False | False | False | False | False \n", - "---\n", - "Token: sentence\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | True | False | False | False | False | False | 
False | False | False | True | False | False | False \n", - "---\n", - "Token: .\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", - "---\n", - "\n", - "\n", "5\n", - "[[False, True, True, False, False, False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, True, False, False, False, False, False, False, False, False], [False, False, False, False, True, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, True, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]]\n" + "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': True, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': True, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" ] } ], @@ -550,16 +244,147 @@ "sentences = [\n", " [\"This \", \"is \", \"a \", \"sentence\", \".\"]\n", "]\n", - "labels = token_labelling.label_batch_token(sentences, tokenized=True, verbose=False)\n", + "labelled_sentences = token_labelling.label_batch_sentences(sentences, tokenized=True, verbose=False)\n", "\n", - "print(len(labels[0]))\n", - "print(labels[0])" + "print(len(labelled_sentences[0]))\n", + "print(labelled_sentences[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2) Labelling all tokens in the dataset\n", + "\n", + "Now we want to label all the tokens that our tokenizer knows - its entire vocabulary." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The vocab size is 50257\n" + ] + } + ], + "source": [ + "# Get all the tokens of the tokenizer\n", + "from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast\n", + "\n", + "def tokenize(tokenizer: PreTrainedTokenizer, sample_txt: str) -> list[int]:\n", + " # supposedly this can be different than prepending the bos token id\n", + " return tokenizer.encode(tokenizer.bos_token + sample_txt, return_tensors=\"pt\")[0]\n", + "\n", + "# Decode a sentence\n", + "def decode(tokenizer: PreTrainedTokenizer, token_ids: list[int]) -> str:\n", + " return tokenizer.decode(token_ids, skip_special_tokens=True)\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"roneneldan/TinyStories-1M\")\n", + "vocab_size = tokenizer.vocab_size\n", + "print(\"The vocab size is:\", vocab_size)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "! \" # $ % & ' ( ) * \n", + " inv lect supp ating look man pect 8 row bu \n", + " child since ired less life develop ittle dep pass � \n", + " matter reg ext angu isc ole aut compet eed fect \n", + " (/ ….\" Compar amplification ominated regress Collider informants gazed \n" + ] + } + ], + "source": [ + "# Let's have a look at some tokens\n", + "ranges = [(0,10), (800,810), (1200,1210), (2300, 2310), (vocab_size-10, vocab_size)]\n", + "for start, end in ranges:\n", + " for i in range(start, end):\n", + " print(decode(tokenizer, i).ljust(10), end=\" \")\n", + " print()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2405771500d24b7890f87694d533486f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Labelling tokens: 0%| | 0/8 [00:00 list[dict[str, bool]]: def label_batch_sentences( - sentences: list, tokenized: bool = True, verbose: bool = False + sentences: Union[list[str], list[list[str]]], + tokenized: bool = True, + verbose: bool = False, ) -> list[list]: """ Labels tokens in a sentence batchwise. Takes the context of the token into @@ -133,7 +135,7 @@ def label_batch_sentences( A batch/list of sentences, each being a list of tokens. tokenized : bool, optional Whether the sentences are already tokenized, by default True. If the sentences - are full strings and not lists of tokens, then set to False. + are full strings and not lists of tokens, then set to False. If true then `sentences` must be list[list[str]]. verbose : bool, optional Whether to print the tokens and their labels to the console, by default False. 
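The `tokenize` helper in the notebook above hedges that splicing `tokenizer.bos_token` into the text "can be different than prepending the bos token id". A minimal sketch of that comparison, assuming the same TinyStories tokenizer (whether the two variants agree depends on how a given tokenizer splits text around special tokens; the helper above also wraps the result in a tensor, which is omitted here):

```python
# Sketch only: contrast the two ways of adding a BOS token mentioned in the
# notebook comment above. Assumes the TinyStories tokenizer; other tokenizers
# may treat special-token text inside the string differently.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roneneldan/TinyStories-1M")
sample_txt = "Once upon a time"

# Variant 1: splice the BOS token into the text and encode the whole string.
via_text = tokenizer.encode(tokenizer.bos_token + sample_txt)

# Variant 2: encode the text alone, then prepend the BOS id.
via_id = [tokenizer.bos_token_id] + tokenizer.encode(sample_txt)

# For this tokenizer the two usually match; tokenizers that merge across the
# special-token boundary (or add special tokens during encode) may differ.
print(via_text == via_id)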
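```

For the vocabulary-labelling step itself (the cell behind the "Labelling tokens" progress bar), a minimal sketch of one way to drive `label_batch_sentences` over every token id, assuming the module behaves as in the diffs in this series. The batch size is illustrative, and each decoded vocabulary entry is passed as a pre-tokenized one-token sentence:

```python
# Sketch only: label the entire tokenizer vocabulary with the functions from
# src/delphi/eval/token_labelling.py. Batch size and progress-bar text are
# illustrative; empty or whitespace-only decodes may need filtering first.
from tqdm.auto import tqdm
from transformers import AutoTokenizer

from delphi.eval import token_labelling

tokenizer = AutoTokenizer.from_pretrained("roneneldan/TinyStories-1M")
vocab_size = tokenizer.vocab_size

batch_size = 6500  # roughly 8 batches for the 50257-token vocabulary
all_token_labels: list[dict[str, bool]] = []

for start in tqdm(range(0, vocab_size, batch_size), desc="Labelling tokens"):
    ids = range(start, min(start + batch_size, vocab_size))
    # Each decoded vocabulary entry becomes a pre-tokenized one-token "sentence".
    batch = [[tokenizer.decode(i)] for i in ids]
    labelled = token_labelling.label_batch_sentences(batch, tokenized=True)
    # One sentence of one token -> one label dict per vocabulary entry.
    all_token_labels.extend(sentence_labels[0] for sentence_labels in labelled)

print(len(all_token_labels))  # expected: vocab_size
```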
From 6bf1c56e62c970e618277e368a03254c8b96958a Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Fri, 9 Feb 2024 17:54:41 +0100 Subject: [PATCH 08/29] fix errors --- requirements.txt | 2 +- src/delphi/eval/token_labelling.py | 28 +++++++--------------------- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/requirements.txt b/requirements.txt index 07a340c6..7bda2f6c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -# torch==2.1.2 +torch==2.1.2 datasets==2.16.1 transformers==4.36.2 tqdm==4.66.1 diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py index 5ec134e1..163d9bb1 100644 --- a/src/delphi/eval/token_labelling.py +++ b/src/delphi/eval/token_labelling.py @@ -4,7 +4,7 @@ Additionally, it can visualize the sentences and their poart-of-speech (POS) tags. """ -from typing import Callable, Optional, Union +from typing import Callable, Optional import spacy # pylint: disable=import-error from spacy.tokens import Doc # pylint: disable=import-error @@ -98,7 +98,7 @@ def label_single_Token(token: Token) -> dict[str, bool]: return labels -def label_sentence(tokens: Union[Doc, list[Token]]) -> list[dict[str, bool]]: +def label_sentence(tokens: Doc | list[Token]) -> list[dict[str, bool]]: """ Labels spaCy Tokens in a sentence. Takes the context of the token into account for dependency labels (e.g. subject, object, ...), IF dependency labels are turned on. @@ -121,7 +121,7 @@ def label_sentence(tokens: Union[Doc, list[Token]]) -> list[dict[str, bool]]: def label_batch_sentences( - sentences: Union[list[str], list[list[str]]], + sentences: list[str] | list[list[str]], tokenized: bool = True, verbose: bool = False, ) -> list[list]: @@ -146,7 +146,6 @@ def label_batch_sentences( corresponding token length where each entry provides the labels/categories for the token. 
Sentence -> Token -> Labels """ - assert isinstance(sentences, list) # Load english language model nlp = spacy.load("en_core_web_sm") # labelled tokens, list holding sentences holding tokens holding corresponding token labels @@ -168,18 +167,10 @@ def label_batch_sentences( labelled_tokens = list() # list holding labels for all tokens of sentence labelled_tokens = label_sentence(doc) - # go through each token in the sentence - for token, labelled_token in zip(doc, labelled_tokens): - # labelled_token = label_single_Token(token) - # labels = list() # The list holding labels of a single token - # for _, category_check in TOKEN_LABELS.items(): - # label = category_check(token) - # labels.append(label) - # add current token's to the list - # labelled_tokens.append(labelled_token) - - # print the token and its labels to console - if verbose is True: + # print the token and its labels to console + if verbose is True: + # go through each token in the sentence + for token, labelled_token in zip(doc, labelled_tokens): print(f"Token: {token}") print(" | ".join(list(TOKEN_LABELS.keys()))) printable = [ @@ -198,11 +189,6 @@ def label_batch_sentences( if __name__ == "__main__": - # result = label_tokens( - # ["Hi, my name is Joshua.".split(" "), "The highway is full of car s, Peter.".split(" ")], - # tokenized=True, - # verbose=True, - # ) result = label_batch_token( ["Hi, my name is Joshua.", "The highway is full of car s, Peter."], tokenized=False, From 70c337de880194be45c6fbc4032b5d7aaf32de83 Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Fri, 2 Feb 2024 17:38:36 +0100 Subject: [PATCH 09/29] add notebook --- notebooks/token_labelling.ipynb | 449 ++++++++++++++++++++++++++++++++ 1 file changed, 449 insertions(+) diff --git a/notebooks/token_labelling.ipynb b/notebooks/token_labelling.ipynb index 2e200922..aee206c0 100644 --- a/notebooks/token_labelling.ipynb +++ b/notebooks/token_labelling.ipynb @@ -4,6 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ +<<<<<<< HEAD "# Giving tokens a label - How to categorize tokens\n", "\n", "\n", @@ -12,10 +13,14 @@ "The second part shows how all tokens are labelled that are used for our delphi language models.3\n", "\n", "# 1) How to use the token labelling functions" +======= + "# How to label tokens" +>>>>>>> bf8ef79 (add notebook) ] }, { "cell_type": "code", +<<<<<<< HEAD "execution_count": 90, "metadata": {}, "outputs": [ @@ -28,11 +33,17 @@ ] } ], +======= + "execution_count": 2, + "metadata": {}, + "outputs": [], +>>>>>>> bf8ef79 (add notebook) "source": [ "# autoreload\n", "%load_ext autoreload\n", "%autoreload 2\n", "\n", +<<<<<<< HEAD "from pprint import pprint \n", "\n", "import spacy\n", @@ -41,6 +52,11 @@ "import delphi\n", "\n", "# from delphi.eval import token_labelling" +======= + "import spacy\n", + "\n", + "import token_labelling" +>>>>>>> bf8ef79 (add notebook) ] }, { @@ -53,7 +69,11 @@ }, { "cell_type": "code", +<<<<<<< HEAD "execution_count": 2, +======= + "execution_count": 23, +>>>>>>> bf8ef79 (add notebook) "metadata": {}, "outputs": [ { @@ -84,13 +104,18 @@ }, { "cell_type": "code", +<<<<<<< HEAD "execution_count": 8, +======= + "execution_count": 46, +>>>>>>> bf8ef79 (add notebook) "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ +<<<<<<< HEAD "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", "You can now load the package via spacy.load('en_core_web_sm')\n", "{'Capitalized': True,\n", @@ -104,14 +129,22 @@ " 'Is Pronoun': 
True,\n", " 'Is Verb': False,\n", " 'Starts with space': False}\n" +======= + "[False, True, False, True, False, False, False, False, False, False, True, False, False, False, False]\n" +>>>>>>> bf8ef79 (add notebook) ] } ], "source": [ +<<<<<<< HEAD "from delphi.eval import token_labelling\n", "\n", "label = token_labelling.label_single_token(token)\n", "pprint(label)" +======= + "label = token_labelling.label_single_token(token)\n", + "print(label)" +>>>>>>> bf8ef79 (add notebook) ] }, { @@ -124,7 +157,11 @@ }, { "cell_type": "code", +<<<<<<< HEAD "execution_count": 9, +======= + "execution_count": 42, +>>>>>>> bf8ef79 (add notebook) "metadata": {}, "outputs": [ { @@ -146,7 +183,15 @@ " 7 Is Preposition False\n", " 8 Is Conjunction False\n", " 9 Is Interjunction False\n", +<<<<<<< HEAD " 10 Is Named Entity False\n" +======= + " 10 Is Subject True\n", + " 11 Is Object False\n", + " 12 Is Root False\n", + " 13 Is auxiliary False\n", + " 14 Is Named Entity False\n" +>>>>>>> bf8ef79 (add notebook) ] } ], @@ -158,24 +203,354 @@ "cell_type": "markdown", "metadata": {}, "source": [ +<<<<<<< HEAD "If you are interested in all the possible labels a token can have, that spaCy is capable of assigning, then call the same function but without any argument:\n", "```Python\n", ">>> token_labelling.explain_token_labels()\n", "```" +======= + "If you are interested in all the possible labels a token can have, that spaCy is capable of assigning, then call the same function but without any argument." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Explanation of all 302 token labels (POS, dependency, NER, ...):\n", + " ADJ adjective\n", + " ADP adposition\n", + " ADV adverb\n", + " AUX auxiliary\n", + " CONJ conjunction\n", + " CCONJ coordinating conjunction\n", + " DET determiner\n", + " INTJ interjection\n", + " NOUN noun\n", + " NUM numeral\n", + " PART particle\n", + " PRON pronoun\n", + " PROPN proper noun\n", + " PUNCT punctuation\n", + " SCONJ subordinating conjunction\n", + " SYM symbol\n", + " VERB verb\n", + " X other\n", + " EOL end of line\n", + " SPACE space\n", + " . 
punctuation mark, sentence closer\n", + " , punctuation mark, comma\n", + " -LRB- left round bracket\n", + " -RRB- right round bracket\n", + " `` opening quotation mark\n", + " \"\" closing quotation mark\n", + " '' closing quotation mark\n", + " : punctuation mark, colon or ellipsis\n", + " $ symbol, currency\n", + " # symbol, number sign\n", + " AFX affix\n", + " CC conjunction, coordinating\n", + " CD cardinal number\n", + " DT determiner\n", + " EX existential there\n", + " FW foreign word\n", + " HYPH punctuation mark, hyphen\n", + " IN conjunction, subordinating or preposition\n", + " JJ adjective (English), other noun-modifier (Chinese)\n", + " JJR adjective, comparative\n", + " JJS adjective, superlative\n", + " LS list item marker\n", + " MD verb, modal auxiliary\n", + " NIL missing tag\n", + " NN noun, singular or mass\n", + " NNP noun, proper singular\n", + " NNPS noun, proper plural\n", + " NNS noun, plural\n", + " PDT predeterminer\n", + " POS possessive ending\n", + " PRP pronoun, personal\n", + " PRP$ pronoun, possessive\n", + " RB adverb\n", + " RBR adverb, comparative\n", + " RBS adverb, superlative\n", + " RP adverb, particle\n", + " TO infinitival \"to\"\n", + " UH interjection\n", + " VB verb, base form\n", + " VBD verb, past tense\n", + " VBG verb, gerund or present participle\n", + " VBN verb, past participle\n", + " VBP verb, non-3rd person singular present\n", + " VBZ verb, 3rd person singular present\n", + " WDT wh-determiner\n", + " WP wh-pronoun, personal\n", + " WP$ wh-pronoun, possessive\n", + " WRB wh-adverb\n", + " SP space (English), sentence-final particle (Chinese)\n", + " ADD email\n", + " NFP superfluous punctuation\n", + " GW additional word in multi-word expression\n", + " XX unknown\n", + " BES auxiliary \"be\"\n", + " HVS forms of \"have\"\n", + " _SP whitespace\n", + " $( other sentence-internal punctuation mark\n", + " $, comma\n", + " $. 
sentence-final punctuation mark\n", + " ADJA adjective, attributive\n", + " ADJD adjective, adverbial or predicative\n", + " APPO postposition\n", + " APPR preposition; circumposition left\n", + " APPRART preposition with article\n", + " APZR circumposition right\n", + " ART definite or indefinite article\n", + " CARD cardinal number\n", + " FM foreign language material\n", + " ITJ interjection\n", + " KOKOM comparative conjunction\n", + " KON coordinate conjunction\n", + " KOUI subordinate conjunction with \"zu\" and infinitive\n", + " KOUS subordinate conjunction with sentence\n", + " NE proper noun\n", + " NNE proper noun\n", + " PAV pronominal adverb\n", + " PROAV pronominal adverb\n", + " PDAT attributive demonstrative pronoun\n", + " PDS substituting demonstrative pronoun\n", + " PIAT attributive indefinite pronoun without determiner\n", + " PIDAT attributive indefinite pronoun with determiner\n", + " PIS substituting indefinite pronoun\n", + " PPER non-reflexive personal pronoun\n", + " PPOSAT attributive possessive pronoun\n", + " PPOSS substituting possessive pronoun\n", + " PRELAT attributive relative pronoun\n", + " PRELS substituting relative pronoun\n", + " PRF reflexive personal pronoun\n", + " PTKA particle with adjective or adverb\n", + " PTKANT answer particle\n", + " PTKNEG negative particle\n", + " PTKVZ separable verbal particle\n", + " PTKZU \"zu\" before infinitive\n", + " PWAT attributive interrogative pronoun\n", + " PWAV adverbial interrogative or relative pronoun\n", + " PWS substituting interrogative pronoun\n", + " TRUNC word remnant\n", + " VAFIN finite verb, auxiliary\n", + " VAIMP imperative, auxiliary\n", + " VAINF infinitive, auxiliary\n", + " VAPP perfect participle, auxiliary\n", + " VMFIN finite verb, modal\n", + " VMINF infinitive, modal\n", + " VMPP perfect participle, modal\n", + " VVFIN finite verb, full\n", + " VVIMP imperative, full\n", + " VVINF infinitive, full\n", + " VVIZU infinitive with \"zu\", full\n", + " VVPP perfect participle, full\n", + " XY non-word containing non-letter\n", + " AD adverb\n", + " AS aspect marker\n", + " BA 把 in ba-construction\n", + " CS subordinating conjunction\n", + " DEC 的 in a relative clause\n", + " DEG associative 的\n", + " DER 得 in V-de const. 
and V-de-R\n", + " DEV 地 before VP\n", + " ETC for words 等, 等等\n", + " IJ interjection\n", + " LB 被 in long bei-const\n", + " LC localizer\n", + " M measure word\n", + " MSP other particle\n", + " NR proper noun\n", + " NT temporal noun\n", + " OD ordinal number\n", + " ON onomatopoeia\n", + " P preposition excluding 把 and 被\n", + " PN pronoun\n", + " PU punctuation\n", + " SB 被 in short bei-const\n", + " VA predicative adjective\n", + " VC 是 (copula)\n", + " VE 有 as the main verb\n", + " VV other verb\n", + " NP noun phrase\n", + " PP prepositional phrase\n", + " VP verb phrase\n", + " ADVP adverb phrase\n", + " ADJP adjective phrase\n", + " SBAR subordinating conjunction\n", + " PRT particle\n", + " PNP prepositional noun phrase\n", + " acl clausal modifier of noun (adjectival clause)\n", + " acomp adjectival complement\n", + " advcl adverbial clause modifier\n", + " advmod adverbial modifier\n", + " agent agent\n", + " amod adjectival modifier\n", + " appos appositional modifier\n", + " attr attribute\n", + " aux auxiliary\n", + " auxpass auxiliary (passive)\n", + " case case marking\n", + " cc coordinating conjunction\n", + " ccomp clausal complement\n", + " clf classifier\n", + " complm complementizer\n", + " compound compound\n", + " conj conjunct\n", + " cop copula\n", + " csubj clausal subject\n", + " csubjpass clausal subject (passive)\n", + " dative dative\n", + " dep unclassified dependent\n", + " det determiner\n", + " discourse discourse element\n", + " dislocated dislocated elements\n", + " dobj direct object\n", + " expl expletive\n", + " fixed fixed multiword expression\n", + " flat flat multiword expression\n", + " goeswith goes with\n", + " hmod modifier in hyphenation\n", + " hyph hyphen\n", + " infmod infinitival modifier\n", + " intj interjection\n", + " iobj indirect object\n", + " list list\n", + " mark marker\n", + " meta meta modifier\n", + " neg negation modifier\n", + " nmod modifier of nominal\n", + " nn noun compound modifier\n", + " npadvmod noun phrase as adverbial modifier\n", + " nsubj nominal subject\n", + " nsubjpass nominal subject (passive)\n", + " nounmod modifier of nominal\n", + " npmod noun phrase as adverbial modifier\n", + " num number modifier\n", + " number number compound modifier\n", + " nummod numeric modifier\n", + " oprd object predicate\n", + " obj object\n", + " obl oblique nominal\n", + " orphan orphan\n", + " parataxis parataxis\n", + " partmod participal modifier\n", + " pcomp complement of preposition\n", + " pobj object of preposition\n", + " poss possession modifier\n", + " possessive possessive modifier\n", + " preconj pre-correlative conjunction\n", + " prep prepositional modifier\n", + " prt particle\n", + " punct punctuation\n", + " quantmod modifier of quantifier\n", + " rcmod relative clause modifier\n", + " relcl relative clause modifier\n", + " reparandum overridden disfluency\n", + " root root\n", + " ROOT root\n", + " vocative vocative\n", + " xcomp open clausal complement\n", + " ac adpositional case marker\n", + " adc adjective component\n", + " ag genitive attribute\n", + " ams measure argument of adjective\n", + " app apposition\n", + " avc adverbial phrase component\n", + " cd coordinating conjunction\n", + " cj conjunct\n", + " cm comparative conjunction\n", + " cp complementizer\n", + " cvc collocational verb construction\n", + " da dative\n", + " dh discourse-level head\n", + " dm discourse marker\n", + " ep expletive es\n", + " hd head\n", + " ju junctor\n", + " mnr postnominal modifier\n", + " mo modifier\n", + " 
ng negation\n", + " nk noun kernel element\n", + " nmc numerical component\n", + " oa accusative object\n", + " oc clausal object\n", + " og genitive object\n", + " op prepositional object\n", + " par parenthetical element\n", + " pd predicate\n", + " pg phrasal genitive\n", + " ph placeholder\n", + " pm morphological particle\n", + " pnc proper noun component\n", + " rc relative clause\n", + " re repeated element\n", + " rs reported speech\n", + " sb subject\n", + " sbp passivized subject (PP)\n", + " sp subject or predicate\n", + " svp separable verb prefix\n", + " uc unit component\n", + " vo vocative\n", + " PERSON People, including fictional\n", + " NORP Nationalities or religious or political groups\n", + " FACILITY Buildings, airports, highways, bridges, etc.\n", + " FAC Buildings, airports, highways, bridges, etc.\n", + " ORG Companies, agencies, institutions, etc.\n", + " GPE Countries, cities, states\n", + " LOC Non-GPE locations, mountain ranges, bodies of water\n", + " PRODUCT Objects, vehicles, foods, etc. (not services)\n", + " EVENT Named hurricanes, battles, wars, sports events, etc.\n", + " WORK_OF_ART Titles of books, songs, etc.\n", + " LAW Named documents made into laws.\n", + " LANGUAGE Any named language\n", + " DATE Absolute or relative dates or periods\n", + " TIME Times smaller than a day\n", + " PERCENT Percentage, including \"%\"\n", + " MONEY Monetary values, including unit\n", + " QUANTITY Measurements, as of weight or distance\n", + " ORDINAL \"first\", \"second\", etc.\n", + " CARDINAL Numerals that do not fall under another type\n", + " PER Named person or family.\n", + " MISC Miscellaneous entities, e.g. events, nationalities, products or works of art\n", + " EVT Festivals, cultural events, sports events, weather phenomena, wars, etc.\n", + " PROD Product, i.e. artificially produced entities including speeches, radio shows, programming languages, contracts, laws and ideas\n", + " DRV Words (and phrases?) that are dervied from a name, but not a name in themselves, e.g. 'Oslo-mannen' ('the man from Oslo')\n", + " GPE_LOC Geo-political entity, with a locative sense, e.g. 'John lives in Spain'\n", + " GPE_ORG Geo-political entity, with an organisation sense, e.g. 'Spain declined to meet with Belgium'\n" + ] + } + ], + "source": [ + "token_labelling.explain_token_labels()" +>>>>>>> bf8ef79 (add notebook) ] }, { "cell_type": "markdown", "metadata": {}, "source": [ +<<<<<<< HEAD "### Batched token labelling\n", +======= +>>>>>>> bf8ef79 (add notebook) "Next, let us analyze a batch of sentences and have them labelled.\n", "> In this example the input sentences are not yet tokenized, so spaCy uses its internal tokenizer." 
] }, { "cell_type": "code", +<<<<<<< HEAD "execution_count": 18, +======= + "execution_count": 55, +>>>>>>> bf8ef79 (add notebook) "metadata": {}, "outputs": [ { @@ -183,6 +558,7 @@ "output_type": "stream", "text": [ "Token: This\n", +<<<<<<< HEAD "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", "False | True | False | True | False | False | False | False | False | False | False \n", "---\n", @@ -201,11 +577,35 @@ "Token: .\n", "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", "False | False | False | False | False | False | False | False | False | False | False \n", +======= + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | True | False | True | False | False | False | False | False | False | True | False | False | False | False \n", + "---\n", + "Token: is\n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | False | False | False | True | False | False | False | False | False | False | True | False | False \n", + "---\n", + "Token: a\n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", + "---\n", + "Token: sentence\n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | True | False | False | False | False | False | False | False | False | False | False | False | False \n", + "---\n", + "Token: .\n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", +>>>>>>> bf8ef79 (add notebook) "---\n", "\n", "\n", "5\n", +<<<<<<< HEAD "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': False, 'Is Pronoun': True, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': True, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is 
Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" +======= + "[[False, True, False, True, False, False, False, False, False, False, True, False, False, False, False], [False, False, False, False, False, True, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]]\n" +>>>>>>> bf8ef79 (add notebook) ] } ], @@ -213,7 +613,11 @@ "sentences = [\n", " \"This is a sentence.\"\n", "]\n", +<<<<<<< HEAD "labels = token_labelling.label_batch_sentences(sentences, tokenized=False, verbose=True)\n", +======= + "labels = token_labelling.label_batch_token(sentences, tokenized=False, verbose=True)\n", +>>>>>>> bf8ef79 (add notebook) "\n", "print(len(labels[0]))\n", "print(labels[0])" @@ -228,15 +632,46 @@ }, { "cell_type": "code", +<<<<<<< HEAD "execution_count": 19, +======= + "execution_count": null, +>>>>>>> bf8ef79 (add notebook) "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ +<<<<<<< HEAD "5\n", "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': True, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': True, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" +======= + "Token: This \n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | True | True | False | False | False | False | False | False | False | False | False | True | False | False \n", + "---\n", + "Token: is \n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named 
Entity\n", + "False | False | False | False | False | False | True | False | False | False | False | False | False | False | False \n", + "---\n", + "Token: a \n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | False | False | True | False | False | False | False | False | False | False | False | False | False \n", + "---\n", + "Token: sentence\n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | True | False | False | False | False | False | False | False | False | True | False | False | False \n", + "---\n", + "Token: .\n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", + "False | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", + "---\n", + "\n", + "\n", + "5\n", + "[[False, True, True, False, False, False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, True, False, False, False, False, False, False, False, False], [False, False, False, False, True, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, True, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]]\n" +>>>>>>> bf8ef79 (add notebook) ] } ], @@ -244,16 +679,24 @@ "sentences = [\n", " [\"This \", \"is \", \"a \", \"sentence\", \".\"]\n", "]\n", +<<<<<<< HEAD "labelled_sentences = token_labelling.label_batch_sentences(sentences, tokenized=True, verbose=False)\n", "\n", "print(len(labelled_sentences[0]))\n", "print(labelled_sentences[0])" +======= + "labels = token_labelling.label_batch_token(sentences, tokenized=True, verbose=False)\n", + "\n", + "print(len(labels[0]))\n", + "print(labels[0])" +>>>>>>> bf8ef79 (add notebook) ] }, { "cell_type": "markdown", "metadata": {}, "source": [] +<<<<<<< HEAD }, { "cell_type": "markdown", @@ -385,6 +828,8 @@ " \n", " " ] +======= +>>>>>>> bf8ef79 (add notebook) } ], "metadata": { @@ -403,7 +848,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", +<<<<<<< HEAD "version": "3.10.13" +======= + "version": "3.8.8" +>>>>>>> bf8ef79 (add notebook) } }, "nbformat": 4, From 1fcdd3564259ed19f3402f886e53140acddf4bba Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Thu, 8 Feb 2024 21:27:27 +0100 Subject: [PATCH 10/29] test --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7bda2f6c..7d76665c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,4 @@ beartype==0.16.4 pre-commit==3.6.0 isort==5.13.2 spacy==3.7.2 -chardet==5.2.0 \ No newline at end of file +chardet==5.2.0 From bd3be77b50f0053762359d284fe9a88ac837cc5c Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Thu, 8 Feb 2024 21:28:49 +0100 Subject: [PATCH 11/29] swtich off dependency labels + add 
spacy to requirements --- src/delphi/eval/token_labelling.py | 55 +++++++++++------------------- 1 file changed, 19 insertions(+), 36 deletions(-) diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py index 163d9bb1..4a9ee0c4 100644 --- a/src/delphi/eval/token_labelling.py +++ b/src/delphi/eval/token_labelling.py @@ -4,6 +4,7 @@ Additionally, it can visualize the sentences and their poart-of-speech (POS) tags. """ +from pprint import pprint from typing import Callable, Optional import spacy # pylint: disable=import-error @@ -75,7 +76,7 @@ def explain_Token_labels(token: Optional[Token] = None) -> None: print(" ", label.ljust(10), key) -def label_single_Token(token: Token) -> dict[str, bool]: +def label_single_token(token: Token) -> dict[str, bool]: """ Labels a single token. A token, that has been analyzed by the spaCy library. @@ -98,32 +99,8 @@ def label_single_Token(token: Token) -> dict[str, bool]: return labels -def label_sentence(tokens: Doc | list[Token]) -> list[dict[str, bool]]: - """ - Labels spaCy Tokens in a sentence. Takes the context of the token into account - for dependency labels (e.g. subject, object, ...), IF dependency labels are turned on. - - Parameters - ---------- - tokens : list[Token] - A list of tokens. - - Returns - ------- - list[dict[str, bool]] - Returns a list of the tokens' labels. - """ - labelled_tokens = list() # list holding labels for all tokens of sentence - for token in tokens: - labels = label_single_Token(token) - labelled_tokens.append(labels) - return labelled_tokens - - -def label_batch_sentences( - sentences: list[str] | list[list[str]], - tokenized: bool = True, - verbose: bool = False, +def label_batch_token( + sentences: list, tokenized: bool = True, verbose: bool = False ) -> list[list]: """ Labels tokens in a sentence batchwise. Takes the context of the token into @@ -141,7 +118,7 @@ def label_batch_sentences( Returns ------- - list[list[dict[str, bool]] + list[list] Returns a list of sentences. Each sentence contains a list of its corresponding token length where each entry provides the labels/categories for the token. 
Sentence -> Token -> Labels @@ -149,7 +126,7 @@ def label_batch_sentences( # Load english language model nlp = spacy.load("en_core_web_sm") # labelled tokens, list holding sentences holding tokens holding corresponding token labels - labelled_sentences: list[list[dict[str, bool]]] = list() + labelled_sentences = list() # go through each sentence in the batch for sentence in sentences: @@ -165,16 +142,22 @@ def label_batch_sentences( doc = nlp(sentence) labelled_tokens = list() # list holding labels for all tokens of sentence - labelled_tokens = label_sentence(doc) - # print the token and its labels to console - if verbose is True: - # go through each token in the sentence - for token, labelled_token in zip(doc, labelled_tokens): - print(f"Token: {token}") + for token in doc: + labels = list() # The list holding labels of a single token + for _, category_check in TOKEN_LABELS.items(): + label = category_check(token) + labels.append(label) + # add current token's to the list + labelled_tokens.append(labels) + + # print the token and its labels to console + if verbose is True: + print(f"Token: {token.text}") print(" | ".join(list(TOKEN_LABELS.keys()))) printable = [ - str(l).ljust(len(name)) for name, l in labelled_token.items() + str(l).ljust(len(cname)) + for l, cname in zip(labels, TOKEN_LABELS.keys()) ] printable = " | ".join(printable) print(printable) From e3013db10eda78599767dbf4cf37a8def8f26174 Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Thu, 8 Feb 2024 22:14:07 +0100 Subject: [PATCH 12/29] small improvements --- src/delphi/eval/token_labelling.py | 55 +++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py index 4a9ee0c4..22dd0ca8 100644 --- a/src/delphi/eval/token_labelling.py +++ b/src/delphi/eval/token_labelling.py @@ -4,8 +4,7 @@ Additionally, it can visualize the sentences and their poart-of-speech (POS) tags. """ -from pprint import pprint -from typing import Callable, Optional +from typing import Callable, Optional, Union import spacy # pylint: disable=import-error from spacy.tokens import Doc # pylint: disable=import-error @@ -76,7 +75,7 @@ def explain_Token_labels(token: Optional[Token] = None) -> None: print(" ", label.ljust(10), key) -def label_single_token(token: Token) -> dict[str, bool]: +def label_single_Token(token: Token) -> dict[str, bool]: """ Labels a single token. A token, that has been analyzed by the spaCy library. @@ -99,7 +98,29 @@ def label_single_token(token: Token) -> dict[str, bool]: return labels -def label_batch_token( +def label_sentence(tokens: Union[Doc, list[Token]]) -> list[dict[str, bool]]: + """ + Labels spaCy Tokens in a sentence. Takes the context of the token into account + for dependency labels (e.g. subject, object, ...), IF dependency labels are turned on. + + Parameters + ---------- + tokens : list[Token] + A list of tokens. + + Returns + ------- + list[dict[str, bool]] + Returns a list of the tokens' labels. + """ + labelled_tokens = list() # list holding labels for all tokens of sentence + for token in tokens: + labels = label_single_Token(token) + labelled_tokens.append(labels) + return labelled_tokens + + +def label_batch_sentences( sentences: list, tokenized: bool = True, verbose: bool = False ) -> list[list]: """ @@ -118,7 +139,7 @@ def label_batch_token( Returns ------- - list[list] + list[list[dict[str, bool]] Returns a list of sentences. 
Each sentence contains a list of its corresponding token length where each entry provides the labels/categories for the token. Sentence -> Token -> Labels @@ -126,7 +147,7 @@ def label_batch_token( # Load english language model nlp = spacy.load("en_core_web_sm") # labelled tokens, list holding sentences holding tokens holding corresponding token labels - labelled_sentences = list() + labelled_sentences: list[list[dict[str, bool]]] = list() # go through each sentence in the batch for sentence in sentences: @@ -142,22 +163,24 @@ def label_batch_token( doc = nlp(sentence) labelled_tokens = list() # list holding labels for all tokens of sentence - - for token in doc: - labels = list() # The list holding labels of a single token - for _, category_check in TOKEN_LABELS.items(): - label = category_check(token) - labels.append(label) + labelled_tokens = label_sentence(doc) + + # go through each token in the sentence + for token, labelled_token in zip(doc, labelled_tokens): + # labelled_token = label_single_Token(token) + # labels = list() # The list holding labels of a single token + # for _, category_check in TOKEN_LABELS.items(): + # label = category_check(token) + # labels.append(label) # add current token's to the list - labelled_tokens.append(labels) + # labelled_tokens.append(labelled_token) # print the token and its labels to console if verbose is True: - print(f"Token: {token.text}") + print(f"Token: {token}") print(" | ".join(list(TOKEN_LABELS.keys()))) printable = [ - str(l).ljust(len(cname)) - for l, cname in zip(labels, TOKEN_LABELS.keys()) + str(l).ljust(len(name)) for name, l in labelled_token.items() ] printable = " | ".join(printable) print(printable) From 90f2dbbe1c4130a98cb732711b2070b46ab1991b Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Thu, 8 Feb 2024 23:40:42 +0100 Subject: [PATCH 13/29] improve notebook explanation --- notebooks/token_labelling.ipynb | 131 +++++++++++++++++++++++++++-- requirements.txt | 4 + src/delphi/eval/token_labelling.py | 4 +- 3 files changed, 132 insertions(+), 7 deletions(-) diff --git a/notebooks/token_labelling.ipynb b/notebooks/token_labelling.ipynb index aee206c0..3bb02ee1 100644 --- a/notebooks/token_labelling.ipynb +++ b/notebooks/token_labelling.ipynb @@ -5,6 +5,9 @@ "metadata": {}, "source": [ <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> a71e2a8 (improve notebook explanation) "# Giving tokens a label - How to categorize tokens\n", "\n", "\n", @@ -13,13 +16,17 @@ "The second part shows how all tokens are labelled that are used for our delphi language models.3\n", "\n", "# 1) How to use the token labelling functions" +<<<<<<< HEAD ======= "# How to label tokens" >>>>>>> bf8ef79 (add notebook) +======= +>>>>>>> a71e2a8 (improve notebook explanation) ] }, { "cell_type": "code", +<<<<<<< HEAD <<<<<<< HEAD "execution_count": 90, "metadata": {}, @@ -38,11 +45,26 @@ "metadata": {}, "outputs": [], >>>>>>> bf8ef79 (add notebook) +======= + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. 
To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], +>>>>>>> a71e2a8 (improve notebook explanation) "source": [ "# autoreload\n", "%load_ext autoreload\n", "%autoreload 2\n", "\n", +<<<<<<< HEAD <<<<<<< HEAD "from pprint import pprint \n", "\n", @@ -57,6 +79,16 @@ "\n", "import token_labelling" >>>>>>> bf8ef79 (add notebook) +======= + "from pprint import pprint \n", + "\n", + "import spacy\n", + "from tqdm.auto import tqdm\n", + "\n", + "import delphi\n", + "\n", + "# from delphi.eval import token_labelling" +>>>>>>> a71e2a8 (improve notebook explanation) ] }, { @@ -69,11 +101,15 @@ }, { "cell_type": "code", +<<<<<<< HEAD <<<<<<< HEAD "execution_count": 2, ======= "execution_count": 23, >>>>>>> bf8ef79 (add notebook) +======= + "execution_count": 2, +>>>>>>> a71e2a8 (improve notebook explanation) "metadata": {}, "outputs": [ { @@ -104,11 +140,15 @@ }, { "cell_type": "code", +<<<<<<< HEAD <<<<<<< HEAD "execution_count": 8, ======= "execution_count": 46, >>>>>>> bf8ef79 (add notebook) +======= + "execution_count": 8, +>>>>>>> a71e2a8 (improve notebook explanation) "metadata": {}, "outputs": [ { @@ -116,6 +156,9 @@ "output_type": "stream", "text": [ <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> a71e2a8 (improve notebook explanation) "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", "You can now load the package via spacy.load('en_core_web_sm')\n", "{'Capitalized': True,\n", @@ -129,13 +172,17 @@ " 'Is Pronoun': True,\n", " 'Is Verb': False,\n", " 'Starts with space': False}\n" +<<<<<<< HEAD ======= "[False, True, False, True, False, False, False, False, False, False, True, False, False, False, False]\n" >>>>>>> bf8ef79 (add notebook) +======= +>>>>>>> a71e2a8 (improve notebook explanation) ] } ], "source": [ +<<<<<<< HEAD <<<<<<< HEAD "from delphi.eval import token_labelling\n", "\n", @@ -145,6 +192,12 @@ "label = token_labelling.label_single_token(token)\n", "print(label)" >>>>>>> bf8ef79 (add notebook) +======= + "from delphi.eval import token_labelling\n", + "\n", + "label = token_labelling.label_single_token(token)\n", + "pprint(label)" +>>>>>>> a71e2a8 (improve notebook explanation) ] }, { @@ -157,11 +210,15 @@ }, { "cell_type": "code", +<<<<<<< HEAD <<<<<<< HEAD "execution_count": 9, ======= "execution_count": 42, >>>>>>> bf8ef79 (add notebook) +======= + "execution_count": 9, +>>>>>>> a71e2a8 (improve notebook explanation) "metadata": {}, "outputs": [ { @@ -183,6 +240,7 @@ " 7 Is Preposition False\n", " 8 Is Conjunction False\n", " 9 Is Interjunction False\n", +<<<<<<< HEAD <<<<<<< HEAD " 10 Is Named Entity False\n" ======= @@ -192,6 +250,9 @@ " 13 Is auxiliary False\n", " 14 Is Named Entity False\n" >>>>>>> bf8ef79 (add notebook) +======= + " 10 Is Named Entity False\n" +>>>>>>> a71e2a8 (improve notebook explanation) ] } ], @@ -204,10 +265,14 @@ "metadata": {}, "source": [ <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> a71e2a8 (improve notebook explanation) "If you are interested in all the possible labels a token can have, that spaCy is capable of assigning, then call the same function but without any argument:\n", "```Python\n", ">>> token_labelling.explain_token_labels()\n", "```" +<<<<<<< HEAD ======= "If you are interested in all the possible labels a token can have, that spaCy is capable of assigning, then call the same function but without any argument." 
] @@ -530,27 +595,37 @@ "source": [ "token_labelling.explain_token_labels()" >>>>>>> bf8ef79 (add notebook) +======= +>>>>>>> a71e2a8 (improve notebook explanation) ] }, { "cell_type": "markdown", "metadata": {}, "source": [ +<<<<<<< HEAD <<<<<<< HEAD "### Batched token labelling\n", ======= >>>>>>> bf8ef79 (add notebook) +======= + "### Batched token labelling\n", +>>>>>>> a71e2a8 (improve notebook explanation) "Next, let us analyze a batch of sentences and have them labelled.\n", "> In this example the input sentences are not yet tokenized, so spaCy uses its internal tokenizer." ] }, { "cell_type": "code", +<<<<<<< HEAD <<<<<<< HEAD "execution_count": 18, ======= "execution_count": 55, >>>>>>> bf8ef79 (add notebook) +======= + "execution_count": 18, +>>>>>>> a71e2a8 (improve notebook explanation) "metadata": {}, "outputs": [ { @@ -558,6 +633,7 @@ "output_type": "stream", "text": [ "Token: This\n", +<<<<<<< HEAD <<<<<<< HEAD "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", "False | True | False | True | False | False | False | False | False | False | False \n", @@ -580,32 +656,45 @@ ======= "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", "False | True | False | True | False | False | False | False | False | False | True | False | False | False | False \n", +======= + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", + "False | True | False | True | False | False | False | False | False | False | False \n", +>>>>>>> a71e2a8 (improve notebook explanation) "---\n", "Token: is\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | False | False | False | True | False | False | False | False | False | False | True | False | False \n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", + "False | False | False | False | False | True | False | False | False | False | False \n", "---\n", "Token: a\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", + "False | False | False | False | False | False | False | False | False | False | False \n", "---\n", "Token: sentence\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | True | False | False | False | False | False | False | False | False | False | False | False | False \n", + "Starts with space | 
Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", + "False | False | True | False | False | False | False | False | False | False | False \n", "---\n", "Token: .\n", +<<<<<<< HEAD "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", "False | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", >>>>>>> bf8ef79 (add notebook) +======= + "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", + "False | False | False | False | False | False | False | False | False | False | False \n", +>>>>>>> a71e2a8 (improve notebook explanation) "---\n", "\n", "\n", "5\n", +<<<<<<< HEAD <<<<<<< HEAD "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': False, 'Is Pronoun': True, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': True, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" ======= "[[False, True, False, True, False, False, False, False, False, False, True, False, False, False, False], [False, False, False, False, False, True, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]]\n" >>>>>>> bf8ef79 (add notebook) +======= + "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': False, 'Is Pronoun': True, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': True, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is 
Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" +>>>>>>> a71e2a8 (improve notebook explanation) ] } ], @@ -613,11 +702,15 @@ "sentences = [\n", " \"This is a sentence.\"\n", "]\n", +<<<<<<< HEAD <<<<<<< HEAD "labels = token_labelling.label_batch_sentences(sentences, tokenized=False, verbose=True)\n", ======= "labels = token_labelling.label_batch_token(sentences, tokenized=False, verbose=True)\n", >>>>>>> bf8ef79 (add notebook) +======= + "labels = token_labelling.label_batch_sentences(sentences, tokenized=False, verbose=True)\n", +>>>>>>> a71e2a8 (improve notebook explanation) "\n", "print(len(labels[0]))\n", "print(labels[0])" @@ -632,17 +725,22 @@ }, { "cell_type": "code", +<<<<<<< HEAD <<<<<<< HEAD "execution_count": 19, ======= "execution_count": null, >>>>>>> bf8ef79 (add notebook) +======= + "execution_count": 19, +>>>>>>> a71e2a8 (improve notebook explanation) "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ +<<<<<<< HEAD <<<<<<< HEAD "5\n", "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': True, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': True, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" @@ -672,6 +770,10 @@ "5\n", "[[False, True, True, False, False, False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, True, False, False, False, False, False, False, False, False], [False, False, False, False, True, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, True, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]]\n" >>>>>>> bf8ef79 (add notebook) +======= + "5\n", + "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': 
False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': True, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': True, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" +>>>>>>> a71e2a8 (improve notebook explanation) ] } ], @@ -679,6 +781,7 @@ "sentences = [\n", " [\"This \", \"is \", \"a \", \"sentence\", \".\"]\n", "]\n", +<<<<<<< HEAD <<<<<<< HEAD "labelled_sentences = token_labelling.label_batch_sentences(sentences, tokenized=True, verbose=False)\n", "\n", @@ -690,6 +793,12 @@ "print(len(labels[0]))\n", "print(labels[0])" >>>>>>> bf8ef79 (add notebook) +======= + "labelled_sentences = token_labelling.label_batch_sentences(sentences, tokenized=True, verbose=False)\n", + "\n", + "print(len(labelled_sentences[0]))\n", + "print(labelled_sentences[0])" +>>>>>>> a71e2a8 (improve notebook explanation) ] }, { @@ -697,6 +806,9 @@ "metadata": {}, "source": [] <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> a71e2a8 (improve notebook explanation) }, { "cell_type": "markdown", @@ -828,8 +940,11 @@ " \n", " " ] +<<<<<<< HEAD ======= >>>>>>> bf8ef79 (add notebook) +======= +>>>>>>> a71e2a8 (improve notebook explanation) } ], "metadata": { @@ -848,11 +963,15 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", +<<<<<<< HEAD <<<<<<< HEAD "version": "3.10.13" ======= "version": "3.8.8" >>>>>>> bf8ef79 (add notebook) +======= + "version": "3.10.13" +>>>>>>> a71e2a8 (improve notebook explanation) } }, "nbformat": 4, diff --git a/requirements.txt b/requirements.txt index 7d76665c..4e3785ee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,8 @@ beartype==0.16.4 pre-commit==3.6.0 isort==5.13.2 spacy==3.7.2 +<<<<<<< HEAD chardet==5.2.0 +======= +chardet==5.2.0 +>>>>>>> a71e2a8 (improve notebook explanation) diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py index 22dd0ca8..5fb8fa31 100644 --- a/src/delphi/eval/token_labelling.py +++ b/src/delphi/eval/token_labelling.py @@ -121,7 +121,9 @@ def label_sentence(tokens: Union[Doc, list[Token]]) -> list[dict[str, bool]]: def label_batch_sentences( - sentences: list, tokenized: bool = True, verbose: bool = False + sentences: Union[list[str], list[list[str]]], + tokenized: bool = True, + verbose: bool = False, ) -> list[list]: """ Labels tokens in a sentence batchwise. 
Takes the context of the token into
     account for dependency labels (e.g. subject, object, ...).

From 57689f42928ec2612a58890adf65e61ae60dbf17 Mon Sep 17 00:00:00 2001
From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com>
Date: Fri, 9 Feb 2024 17:54:41 +0100
Subject: [PATCH 14/29] fix errors

---
 src/delphi/eval/token_labelling.py | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py
index 5fb8fa31..163d9bb1 100644
--- a/src/delphi/eval/token_labelling.py
+++ b/src/delphi/eval/token_labelling.py
@@ -4,7 +4,7 @@
 Additionally, it can visualize the sentences and their part-of-speech (POS) tags.
 """
 
-from typing import Callable, Optional, Union
+from typing import Callable, Optional
 
 import spacy  # pylint: disable=import-error
 from spacy.tokens import Doc  # pylint: disable=import-error
@@ -98,7 +98,7 @@ def label_single_Token(token: Token) -> dict[str, bool]:
     return labels
 
 
-def label_sentence(tokens: Union[Doc, list[Token]]) -> list[dict[str, bool]]:
+def label_sentence(tokens: Doc | list[Token]) -> list[dict[str, bool]]:
     """
     Labels spaCy Tokens in a sentence. Takes the context of the token into account
     for dependency labels (e.g. subject, object, ...), IF dependency labels are turned on.
@@ -121,7 +121,7 @@ def label_sentence(tokens: Union[Doc, list[Token]]) -> list[dict[str, bool]]:
 
 
 def label_batch_sentences(
-    sentences: Union[list[str], list[list[str]]],
+    sentences: list[str] | list[list[str]],
     tokenized: bool = True,
     verbose: bool = False,
 ) -> list[list]:
@@ -167,18 +167,10 @@ def label_batch_sentences(
         labelled_tokens = list()  # list holding labels for all tokens of sentence
         labelled_tokens = label_sentence(doc)
 
-        # go through each token in the sentence
-        for token, labelled_token in zip(doc, labelled_tokens):
-            # labelled_token = label_single_Token(token)
-            # labels = list()  # The list holding labels of a single token
-            # for _, category_check in TOKEN_LABELS.items():
-            #     label = category_check(token)
-            #     labels.append(label)
-            # add current token's to the list
-            # labelled_tokens.append(labelled_token)
-
-            # print the token and its labels to console
-            if verbose is True:
+        # print the token and its labels to console
+        if verbose is True:
+            # go through each token in the sentence
+            for token, labelled_token in zip(doc, labelled_tokens):
                 print(f"Token: {token}")
                 print(" | ".join(list(TOKEN_LABELS.keys())))
                 printable = [

From 197364d0674cfa66a24beb5e866234083a820203 Mon Sep 17 00:00:00 2001
From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com>
Date: Mon, 12 Feb 2024 22:56:12 +0100
Subject: [PATCH 15/29] complete UPOS tags for token labels

---
 src/delphi/eval/token_labelling.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py
index 163d9bb1..d4208c34 100644
--- a/src/delphi/eval/token_labelling.py
+++ b/src/delphi/eval/token_labelling.py
@@ -22,14 +22,23 @@
     "Capitalized": (lambda token: token.text[0].isupper()),  # bool
     # --- POS (part-of-speech) categories ---
     # -> "POS Tag": (lambda token: token.pos_),  # 'NOUN', 'VB', ..
- "Is Noun": (lambda token: token.pos_ == "NOUN"), # redundant - "Is Pronoun": (lambda token: token.pos_ == "PRON"), # redundant "Is Adjective": (lambda token: token.pos_ == "ADJ"), # redundant - "Is Verb": (lambda token: "VB" in token.tag_), # redundant + "Is Adposition": (lambda token: token.pos_ == "ADP"), # redundant "Is Adverb": (lambda token: token.pos_ == "ADV"), # redundant - "Is Preposition": (lambda token: token.pos_ == "ADP"), # redundant - "Is Conjunction": (lambda token: token.pos_ == "CONJ"), # redundant + "Is Auxiliary": (lambda token: token.pos_ == "AUX"), # redundant + "Is Coordinating conjuction": (lambda token: token.pos_ == "CCONJ"), # redundant + "Is Determiner": (lambda token: token.pos_ == "DET"), # redundant "Is Interjunction": (lambda token: token.pos_ == "INTJ"), # redundant + "Is Noun": (lambda token: token.pos_ == "NOUN"), # redundant + "Is Numeral": (lambda token: token.pos_ == "NUM"), # redundant + "Is Particle": (lambda token: token.pos_ == "PART"), # redundant + "Is Pronoun": (lambda token: token.pos_ == "PRON"), # redundant + "Is Proper Noun": (lambda token: token.pos_ == "PROPN"), # redundant + "Is Punctuation": (lambda token: token.pos_ == "PUNCT"), # redundant + "Is Subordinating conjuction": (lambda token: token.pos_ == "SCONJ"), # redundant + "Is Symbol": (lambda token: token.pos_ == "SYM"), # redundant + "Is Verb": (lambda token: token.pos_ == "VERB"), # redundant + "Is Other": (lambda token: token.pos_ == "X"), # redundant # --- dependency categories --- # -> "Dependency": (lambda token: token.dep_), # 'nsubj', 'ROOT', 'dobj', .. # "Is Subject": (lambda token: token.dep_ == "nsubj"), @@ -44,7 +53,7 @@ } -def explain_Token_labels(token: Optional[Token] = None) -> None: +def explain_token_labels(token: Optional[Token] = None) -> None: """ Prints the explanation of a specific token's labels or of ALL possible labels (POS, dependency, NER, ...), if no token is provided. @@ -75,7 +84,7 @@ def explain_Token_labels(token: Optional[Token] = None) -> None: print(" ", label.ljust(10), key) -def label_single_Token(token: Token) -> dict[str, bool]: +def label_single_token(token: Token) -> dict[str, bool]: """ Labels a single token. A token, that has been analyzed by the spaCy library. 
From cdef0d6477bac56fc3a4d0dc986a0f214563ea92 Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Tue, 13 Feb 2024 16:39:48 +0100 Subject: [PATCH 16/29] add tests --- src/delphi/eval/token_labelling.py | 15 +--- tests/eval/test_token_labelling.py | 114 +++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+), 12 deletions(-) create mode 100644 tests/eval/test_token_labelling.py diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py index d4208c34..d45cf3e1 100644 --- a/src/delphi/eval/token_labelling.py +++ b/src/delphi/eval/token_labelling.py @@ -66,7 +66,7 @@ def explain_token_labels(token: Optional[Token] = None) -> None: """ if token is not None: # get token labels - labels = label_single_Token(token) + labels = label_single_token(token) print(" Explanation of token labels ".center(45, "-")) print("Token text:".ljust(20), token.text) print("Token dependency:".ljust(20), spacy.glossary.explain(token.dep_)) @@ -124,7 +124,7 @@ def label_sentence(tokens: Doc | list[Token]) -> list[dict[str, bool]]: """ labelled_tokens = list() # list holding labels for all tokens of sentence for token in tokens: - labels = label_single_Token(token) + labels = label_single_token(token) labelled_tokens.append(labels) return labelled_tokens @@ -133,7 +133,7 @@ def label_batch_sentences( sentences: list[str] | list[list[str]], tokenized: bool = True, verbose: bool = False, -) -> list[list]: +) -> list[list[dict[str, bool]]]: """ Labels tokens in a sentence batchwise. Takes the context of the token into account for dependency labels (e.g. subject, object, ...). @@ -195,12 +195,3 @@ def label_batch_sentences( print("\n") return labelled_sentences - - -if __name__ == "__main__": - result = label_batch_token( - ["Hi, my name is Joshua.", "The highway is full of car s, Peter."], - tokenized=False, - verbose=True, - ) - print(result) diff --git a/tests/eval/test_token_labelling.py b/tests/eval/test_token_labelling.py new file mode 100644 index 00000000..a57d0062 --- /dev/null +++ b/tests/eval/test_token_labelling.py @@ -0,0 +1,114 @@ +import pytest +import spacy +from spacy.language import Language +from spacy.tokens import Doc + +import delphi.eval.token_labelling as tl + + +@pytest.fixture +def create_dummy_doc() -> tuple[str, Doc, dict[str, bool]]: + """ + Create a dummy Doc (list of Tokens) with specific attributes for testing purposes. 
+    """
+    nlp_dummy = Language()
+
+    # Create dummy tokens with specific attributes
+    words = ["Peter", "is", "a", "person"]
+    spaces = [True, True, True, True]  # whether a space follows each token
+    pos_tags = ["PROPN", "AUX", "DET", "NOUN"]  # part-of-speech tags
+    dep_tags = ["nsubj", "ROOT", "det", "attr"]  # dependency tags
+    ner_tags = ["PERSON", "", "", ""]  # named entity tags
+
+    # Ensure the length of pos_tags, dep_tags and ner_tags matches the length of words
+    assert len(words) == len(pos_tags) == len(dep_tags) == len(ner_tags)
+
+    # Create a Doc from the dummy tokens
+    doc = Doc(nlp_dummy.vocab, words=words, spaces=spaces)
+
+    # Manually set POS, dependency and NER tags
+    for token, pos, dep, ner_tag in zip(doc, pos_tags, dep_tags, ner_tags):
+        token.pos_, token.dep_, token.ent_type_ = pos, dep, ner_tag
+
+    # Token labels for "Peter" in the dummy doc
+    PETER_TOKEN_LABEL = {
+        "Starts with space": False,
+        "Capitalized": True,
+        "Is Adjective": False,
+        "Is Adposition": False,
+        "Is Adverb": False,
+        "Is Auxiliary": False,
+        "Is Coordinating conjuction": False,
+        "Is Determiner": False,
+        "Is Interjunction": False,
+        "Is Noun": False,
+        "Is Numeral": False,
+        "Is Particle": False,
+        "Is Pronoun": False,
+        "Is Proper Noun": True,
+        "Is Punctuation": False,
+        "Is Subordinating conjuction": False,
+        "Is Symbol": False,
+        "Is Verb": False,
+        "Is Other": False,
+        "Is Named Entity": True,
+    }
+    text = " ".join(words)
+    return text, doc, PETER_TOKEN_LABEL
+
+
+def test_explain_token_labels(create_dummy_doc):
+    """
+    Test the explain_token_labels function.
+    """
+    # explain all labels
+    tl.explain_token_labels()
+    # print explanations for the first token in doc
+    text, doc, PETER_TOKEN_LABEL = create_dummy_doc
+    tl.explain_token_labels(doc[0])
+
+
+def test_label_single_token(create_dummy_doc):
+    """
+    Test the label_single_token function.
+    """
+    # create a dummy token
+    text, doc, PETER_TOKEN_LABEL = create_dummy_doc
+    token = doc[0]
+    # label the token
+    labels = tl.label_single_token(token)
+    # check if the labels are correct
+    assert labels == PETER_TOKEN_LABEL
+
+
+def test_label_sentence(create_dummy_doc):
+    """
+    Test the label_sentence function.
+    """
+    text, doc, PETER_TOKEN_LABEL = create_dummy_doc
+    # label the sentence
+    labels = tl.label_sentence(doc)
+    # assert the first token is labelled correctly
+    assert labels[0] == PETER_TOKEN_LABEL
+    # iterate through tokens in doc
+    for token, label in zip(doc, labels):
+        assert label == tl.label_single_token(token)
+
+
+def test_label_batch_sentences(create_dummy_doc):
+    """
+    Test the label_batch_sentences function.
+    """
+    # create a batch of sentences
+    text, doc, PETER_TOKEN_LABEL = create_dummy_doc
+    text = text.split(" ")
+    batch = [text, text, text]
+    # label the batch
+    labels = tl.label_batch_sentences(batch, tokenized=True)
+    # assert the first token is labelled correctly
+    assert labels[0][0] == PETER_TOKEN_LABEL
+    assert labels[1][0] == PETER_TOKEN_LABEL
+    assert labels[2][0] == PETER_TOKEN_LABEL
+    # iterate through tokens in doc
+    for token, label in zip(doc, labels[0]):
+        assert label == tl.label_single_token(token)
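The tests above pin the expected label schema to a hand-tagged dummy `Doc`. For a quick end-to-end check outside pytest, a sketch along these lines exercises the same API against the real pipeline (it assumes `en_core_web_sm` is installed, so the printed labels depend on that model's tagging):

```Python
from delphi.eval import token_labelling as tl

# Label one untokenized sentence; spaCy tokenizes it internally.
labels = tl.label_batch_sentences(["Peter is a person."], tokenized=False)

# label_batch_sentences returns one list of {category: bool} dicts per sentence;
# print only the categories that are True for each token.
for token_labels in labels[0]:
    print({name: value for name, value in token_labels.items() if value})
```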
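+
+
+# Illustrative usage of the decode helper above (a sketch; the example id and
+# token string are taken from the delphi-suite/delphi-llama2-100k tokenizer and
+# will differ for other models):
+#   decode(tokenizer, 800)         # a single token id, e.g. "te"
+#   decode(tokenizer, [800, 801])  # a list of ids decoded into one string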
+
+
+def main():
+    print("\n", " LABEL ALL TOKENS ".center(50, "="), "\n")
+    # Access command-line arguments
+    args = sys.argv[1:]
+    # Directory to save the results
+    SAVE_DIR = Path("src/delphi/eval/")
+
+    # Check if arguments are provided
+    if len(args) == 0:
+        print("No arguments provided.")
+        return
+
+    if len(args) > 1:
+        print("Too many arguments provided.")
+        return
+
+    # Process arguments
+    model_name = args[0]
+
+    print(f"You chose the model: {model_name}\n")
+    print(
+        f"The language model will be loaded from Huggingface and its tokenizer used to do two things:\n\t1) Create a list of all tokens in the tokenizer's vocabulary.\n\t2) Label each token with its part of speech, dependency, and named entity recognition tags.\nThe respective results will be saved to files located at: '{SAVE_DIR}'\n"
+    )
+
+    # ================ (1) =================
+    print("(1) Create a list of all tokens in the tokenizer's vocabulary ...")
+
+    # Load the tokenizer from Huggingface
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    vocab_size = tokenizer.vocab_size
+    print("Loaded the tokenizer.\nThe vocab size is:", vocab_size)
+
+    # Create a list of all tokens in the tokenizer's vocabulary
+    tokens_str = ""  # will hold all tokens and their ids
+    for i in range(tokenizer.vocab_size):
+        tokens_str += f"{i},{decode(tokenizer, i)}\n"
+
+    # Save the list of all tokens to a file
+    filename = "all_tokens_" + model_name.replace("/", "-") + ".txt"
+    filepath = SAVE_DIR / filename
+    with open(filepath, "w", encoding="utf-8") as f:
+        f.write(tokens_str)
+
+    print(f"Saved the list of all tokens to:\n\t{filepath}\n")
+
+    # ================ (2) =================
+    print("(2) Label each token ...")
+
+    # let's label each token
+    labelled_token_ids_dict: dict[int, dict[str, bool]] = {}  # token_id: labels
+    max_token_id = tokenizer.vocab_size  # stop at which token id, vocab size
+    batch_size = 500
+    # we iterate (batchwise) over all token_ids, labelling individually takes too much time
+    for start in tqdm(range(0, max_token_id, batch_size), desc="Labelling tokens"):
+        # create a batch of token_ids
+        end = min(start + batch_size, max_token_id)
+        token_ids = list(range(start, end))
+        # decode each token_id into its token string; together they form a 'sentence'
+        tokens = [decode(tokenizer, token_id) for token_id in token_ids]
+        # put the sentence into a list, to make it a batch of sentences
+        sentences = [tokens]
+        # label the batch of sentences
+        labels = token_labelling.label_batch_sentences(
+            sentences, tokenized=True, verbose=False
+        )
+        # create a dict with the token_ids and their labels
+        labelled_sentence_dict = dict(zip(token_ids, labels[0]))
+        # update the labelled_token_ids_dict with the new dict
+        labelled_token_ids_dict.update(labelled_sentence_dict)
+
+    # Save the labelled tokens to a file
+    filename = "labelled_token_ids_dict_" + model_name.replace("/", "-") + ".pkl"
+    filepath = SAVE_DIR / filename
+    with open(filepath, "wb") as f:
+        pickle.dump(labelled_token_ids_dict, f)
+
+    print(f"Saved the labelled tokens to:\n\t{filepath}\n")
+
+    # sanity check that the pickled and the original dict are the same
+    print("Sanity check ...", end="")
+    # load pickle
+    with open(filepath, "rb") as f:
+        pickled = pickle.load(f)
+    # compare
+    assert labelled_token_ids_dict == pickled
+    print(" completed.")
+
+    print(" END ".center(50, "="))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py
index d45cf3e1..1ce88e4d 100644
--- a/src/delphi/eval/token_labelling.py
+++ b/src/delphi/eval/token_labelling.py
@@ -21,24 +21,25 @@
     "Starts with space": (lambda token: token.text.startswith(" ")),  # bool
     "Capitalized": (lambda token: token.text[0].isupper()),  # bool
     # --- POS (part-of-speech) categories ---
+    # They include the Universal POS tags (https://universaldependencies.org/u/pos/)
     # -> "POS Tag": (lambda token: token.pos_),  # 'NOUN', 'VB', ..
- "Is Adjective": (lambda token: token.pos_ == "ADJ"), # redundant - "Is Adposition": (lambda token: token.pos_ == "ADP"), # redundant - "Is Adverb": (lambda token: token.pos_ == "ADV"), # redundant - "Is Auxiliary": (lambda token: token.pos_ == "AUX"), # redundant - "Is Coordinating conjuction": (lambda token: token.pos_ == "CCONJ"), # redundant - "Is Determiner": (lambda token: token.pos_ == "DET"), # redundant - "Is Interjunction": (lambda token: token.pos_ == "INTJ"), # redundant - "Is Noun": (lambda token: token.pos_ == "NOUN"), # redundant - "Is Numeral": (lambda token: token.pos_ == "NUM"), # redundant - "Is Particle": (lambda token: token.pos_ == "PART"), # redundant - "Is Pronoun": (lambda token: token.pos_ == "PRON"), # redundant - "Is Proper Noun": (lambda token: token.pos_ == "PROPN"), # redundant - "Is Punctuation": (lambda token: token.pos_ == "PUNCT"), # redundant - "Is Subordinating conjuction": (lambda token: token.pos_ == "SCONJ"), # redundant - "Is Symbol": (lambda token: token.pos_ == "SYM"), # redundant - "Is Verb": (lambda token: token.pos_ == "VERB"), # redundant - "Is Other": (lambda token: token.pos_ == "X"), # redundant + "Is Adjective": (lambda token: token.pos_ == "ADJ"), + "Is Adposition": (lambda token: token.pos_ == "ADP"), + "Is Adverb": (lambda token: token.pos_ == "ADV"), + "Is Auxiliary": (lambda token: token.pos_ == "AUX"), + "Is Coordinating conjuction": (lambda token: token.pos_ == "CCONJ"), + "Is Determiner": (lambda token: token.pos_ == "DET"), + "Is Interjunction": (lambda token: token.pos_ == "INTJ"), + "Is Noun": (lambda token: token.pos_ == "NOUN"), + "Is Numeral": (lambda token: token.pos_ == "NUM"), + "Is Particle": (lambda token: token.pos_ == "PART"), + "Is Pronoun": (lambda token: token.pos_ == "PRON"), + "Is Proper Noun": (lambda token: token.pos_ == "PROPN"), + "Is Punctuation": (lambda token: token.pos_ == "PUNCT"), + "Is Subordinating conjuction": (lambda token: token.pos_ == "SCONJ"), + "Is Symbol": (lambda token: token.pos_ == "SYM"), + "Is Verb": (lambda token: token.pos_ == "VERB"), + "Is Other": (lambda token: token.pos_ == "X"), # --- dependency categories --- # -> "Dependency": (lambda token: token.dep_), # 'nsubj', 'ROOT', 'dobj', .. # "Is Subject": (lambda token: token.dep_ == "nsubj"), @@ -46,7 +47,7 @@ # "Is Root": ( # lambda token: token.dep_ == "ROOT" # ), # root of the sentence (often a verb) - # "Is auxiliary": (lambda token: token.dep_ == "aux"), # redundant + # "Is auxiliary": (lambda token: token.dep_ == "aux"), # --- Named entity recognition (NER) categories --- # "Named Entity Type": (lambda token: token.ent_type_), # '', 'PERSON', 'ORG', 'GPE', .. 
"Is Named Entity": (lambda token: token.ent_type_ != ""), From 48f7f6a8d7ae23090f3239362f558e337ebc216d Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Tue, 13 Feb 2024 21:37:24 +0100 Subject: [PATCH 19/29] add the files containing token information/labels --- notebooks/token_labelling.ipynb | 167 ++++++++++++++---- ...tokens_delphi-suite-delphi-llama2-100k.txt | Bin 0 -> 45121 bytes ...s_dict_delphi-suite-delphi-llama2-100k.pkl | Bin 0 -> 274517 bytes 3 files changed, 128 insertions(+), 39 deletions(-) create mode 100644 src/delphi/eval/all_tokens_delphi-suite-delphi-llama2-100k.txt create mode 100644 src/delphi/eval/labelled_token_ids_dict_delphi-suite-delphi-llama2-100k.pkl diff --git a/notebooks/token_labelling.ipynb b/notebooks/token_labelling.ipynb index 3bb02ee1..a447a8d7 100644 --- a/notebooks/token_labelling.ipynb +++ b/notebooks/token_labelling.ipynb @@ -13,6 +13,7 @@ "\n", "The first part of this Notebook contains elements that explain how to label tokens and how the functions work.\n", "\n", +<<<<<<< HEAD "The second part shows how all tokens are labelled that are used for our delphi language models.3\n", "\n", "# 1) How to use the token labelling functions" @@ -22,13 +23,20 @@ >>>>>>> bf8ef79 (add notebook) ======= >>>>>>> a71e2a8 (improve notebook explanation) +======= + "The second part shows how all tokens are labelled that are used for our delphi language models.3\n" +>>>>>>> e0ed3b4 (add the files containing token information/labels) ] }, { "cell_type": "code", <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "execution_count": 90, +======= + "execution_count": 23, +>>>>>>> e0ed3b4 (add the files containing token information/labels) "metadata": {}, "outputs": [ { @@ -73,6 +81,7 @@ "\n", "import delphi\n", "\n", +<<<<<<< HEAD "# from delphi.eval import token_labelling" ======= "import spacy\n", @@ -89,6 +98,17 @@ "\n", "# from delphi.eval import token_labelling" >>>>>>> a71e2a8 (improve notebook explanation) +======= + "from delphi.eval import token_labelling" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# 1) How to use the token labelling functions" +>>>>>>> e0ed3b4 (add the files containing token information/labels) ] }, { @@ -102,6 +122,7 @@ { "cell_type": "code", <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "execution_count": 2, ======= @@ -110,13 +131,19 @@ ======= "execution_count": 2, >>>>>>> a71e2a8 (improve notebook explanation) +======= + "execution_count": 15, +>>>>>>> e0ed3b4 (add the files containing token information/labels) "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "This\n" + "Peter \t PROPN \t nsubj \t PERSON\n", + "is \t AUX \t ROOT \t \n", + "a \t DET \t det \t \n", + "person \t NOUN \t attr \t \n" ] } ], @@ -125,10 +152,11 @@ "nlp = spacy.load(\"en_core_web_sm\")\n", "\n", "# Create a Doc object from a given text\n", - "doc = nlp(\"This is a dummy sentence for testing.\")\n", + "doc = nlp(\"Peter is a person\")\n", "\n", "token = doc[0]\n", - "print(token)" + "for tok in doc:\n", + " print(tok,\"\\t\", tok.pos_, \"\\t\", tok.dep_, \"\\t\", tok.ent_type_)" ] }, { @@ -141,6 +169,7 @@ { "cell_type": "code", <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "execution_count": 8, ======= @@ -149,6 +178,9 @@ ======= "execution_count": 8, >>>>>>> a71e2a8 (improve notebook explanation) +======= + "execution_count": 5, +>>>>>>> e0ed3b4 (add the files containing token information/labels) "metadata": {}, "outputs": [ { @@ -157,19 +189,31 @@ "text": [ <<<<<<< 
HEAD <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> a71e2a8 (improve notebook explanation) "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", "You can now load the package via spacy.load('en_core_web_sm')\n", +======= +>>>>>>> e0ed3b4 (add the files containing token information/labels) "{'Capitalized': True,\n", " 'Is Adjective': False,\n", + " 'Is Adposition': False,\n", " 'Is Adverb': False,\n", - " 'Is Conjunction': False,\n", + " 'Is Auxiliary': False,\n", + " 'Is Coordinating conjuction': False,\n", + " 'Is Determiner': False,\n", " 'Is Interjunction': False,\n", - " 'Is Named Entity': False,\n", + " 'Is Named Entity': True,\n", " 'Is Noun': False,\n", - " 'Is Preposition': False,\n", - " 'Is Pronoun': True,\n", + " 'Is Numeral': False,\n", + " 'Is Other': False,\n", + " 'Is Particle': False,\n", + " 'Is Pronoun': False,\n", + " 'Is Proper Noun': True,\n", + " 'Is Punctuation': False,\n", + " 'Is Subordinating conjuction': False,\n", + " 'Is Symbol': False,\n", " 'Is Verb': False,\n", " 'Starts with space': False}\n" <<<<<<< HEAD @@ -211,6 +255,7 @@ { "cell_type": "code", <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "execution_count": 9, ======= @@ -219,6 +264,9 @@ ======= "execution_count": 9, >>>>>>> a71e2a8 (improve notebook explanation) +======= + "execution_count": 6, +>>>>>>> e0ed3b4 (add the files containing token information/labels) "metadata": {}, "outputs": [ { @@ -226,12 +274,13 @@ "output_type": "stream", "text": [ "-------- Explanation of token labels --------\n", - "Token text: This\n", + "Token text: Peter\n", "Token dependency: nominal subject\n", - "Token POS: pronoun\n", + "Token POS: proper noun\n", "---------------- Token labels ---------------\n", " 0 Starts with space False\n", " 1 Capitalized True\n", +<<<<<<< HEAD " 2 Is Noun False\n", " 3 Is Pronoun True\n", " 4 Is Adjective False\n", @@ -253,6 +302,26 @@ ======= " 10 Is Named Entity False\n" >>>>>>> a71e2a8 (improve notebook explanation) +======= + " 2 Is Adjective False\n", + " 3 Is Adposition False\n", + " 4 Is Adverb False\n", + " 5 Is Auxiliary False\n", + " 6 Is Coordinating conjuction False\n", + " 7 Is Determiner False\n", + " 8 Is Interjunction False\n", + " 9 Is Noun False\n", + " 10 Is Numeral False\n", + " 11 Is Particle False\n", + " 12 Is Pronoun False\n", + " 13 Is Proper Noun True\n", + " 14 Is Punctuation False\n", + " 15 Is Subordinating conjuction False\n", + " 16 Is Symbol False\n", + " 17 Is Verb False\n", + " 18 Is Other False\n", + " 19 Is Named Entity True\n" +>>>>>>> e0ed3b4 (add the files containing token information/labels) ] } ], @@ -612,12 +681,13 @@ "### Batched token labelling\n", >>>>>>> a71e2a8 (improve notebook explanation) "Next, let us analyze a batch of sentences and have them labelled.\n", - "> In this example the input sentences are not yet tokenized, so spaCy uses its internal tokenizer." + "> In the example below the input sentences are not yet tokenized, so spaCy uses its internal tokenizer." 
] }, { "cell_type": "code", <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "execution_count": 18, ======= @@ -626,31 +696,41 @@ ======= "execution_count": 18, >>>>>>> a71e2a8 (improve notebook explanation) +======= + "execution_count": 9, +>>>>>>> e0ed3b4 (add the files containing token information/labels) "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ +<<<<<<< HEAD "Token: This\n", <<<<<<< HEAD <<<<<<< HEAD "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", "False | True | False | True | False | False | False | False | False | False | False \n", +======= + "Token: Peter\n", + "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", + "False | True | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | True \n", +>>>>>>> e0ed3b4 (add the files containing token information/labels) "---\n", "Token: is\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", - "False | False | False | False | False | True | False | False | False | False | False \n", + "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", + "False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", "---\n", "Token: a\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", - "False | False | False | False | False | False | False | False | False | False | False \n", + "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", + "False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False \n", "---\n", - "Token: sentence\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", - "False | False | True | False | False | False | False | False | False | False | False \n", + "Token: person\n", + "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", + "False | False | False | False | False | False | False | False | 
False | True | False | False | False | False | False | False | False | False | False | False \n", "---\n", "Token: .\n", +<<<<<<< HEAD "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", "False | False | False | False | False | False | False | False | False | False | False \n", ======= @@ -682,11 +762,16 @@ "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", "False | False | False | False | False | False | False | False | False | False | False \n", >>>>>>> a71e2a8 (improve notebook explanation) +======= + "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", + "False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False \n", +>>>>>>> e0ed3b4 (add the files containing token information/labels) "---\n", "\n", "\n", "5\n", <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': False, 'Is Pronoun': True, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': True, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" ======= @@ -695,12 +780,15 @@ ======= "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': False, 'Is Pronoun': True, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': True, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': 
False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" >>>>>>> a71e2a8 (improve notebook explanation) +======= + "[{'Starts with space': False, 'Capitalized': True, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': False, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': True, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': True}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': True, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': False, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': True, 'Is Interjunction': False, 'Is Noun': False, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': True, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': False, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': True, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}]\n" +>>>>>>> e0ed3b4 (add the files containing token information/labels) ] } ], "source": [ "sentences = [\n", - " \"This is a sentence.\"\n", + " \"Peter is a person.\"\n", "]\n", <<<<<<< HEAD <<<<<<< HEAD @@ -821,14 +909,14 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The vocab size is 50257\n" + "The vocab size is: 4096\n" ] } ], @@ -844,51 +932,52 @@ "def decode(tokenizer: PreTrainedTokenizer, token_ids: list[int]) -> str:\n", " return tokenizer.decode(token_ids, skip_special_tokens=True)\n", "\n", - "tokenizer = 
AutoTokenizer.from_pretrained(\"roneneldan/TinyStories-1M\")\n",
+    "model = \"delphi-suite/delphi-llama2-100k\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model)\n",
     "vocab_size = tokenizer.vocab_size\n",
     "print(\"The vocab size is:\", vocab_size)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": 48,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-       "! \" # $ % & ' ( ) * \n",
-       " inv lect supp ating look man pect 8 row bu \n",
-       " child since ired less life develop ittle dep pass � \n",
-       " matter reg ext angu isc ole aut compet eed fect \n",
-       " (/ ….\" Compar amplification ominated regress Collider informants gazed \n"
+       "  20: \u0011 21: \u0012 22: \u0013 23: \u0014 24: \u0015 25: \u0016 26: \u0017 27: \u0018 28: \u0019 29: \u001a \n",
+       " 800: te 801: happened 802: flow 803: food 804: list 805: just 806: Her 807: animals 808: hig 809: didn \n",
+       "1200: ice 1201: ount 1202: worked 1203: okay 1204: irt 1205: making 1206: dress 1207: enjoy 1208: advent 1209: bright \n",
+       "2300: lift 2301: ign 2302: ba 2303: line 2304: Doggy 2305: clouds 2306: dogs 2307: yard 2308: wolf 2309: spray \n",
+       "4086: 1 4087: 0 4088: 2 4089: 5 4090: 4 4091: 9 4092: 8 4093: 6 4094: 7 4095: $ \n"
      ]
     }
    ],
    "source": [
     "# Let's have a look at some tokens\n",
-    "ranges = [(0,10), (800,810), (1200,1210), (2300, 2310), (vocab_size-10, vocab_size)]\n",
+    "ranges = [(20,30), (800,810), (1200,1210), (2300, 2310), (vocab_size-10, vocab_size)]\n",
     "for start, end in ranges:\n",
     "    for i in range(start, end):\n",
-    "        print(decode(tokenizer, i).ljust(10), end=\" \")\n",
-    "    print()\n"
+    "        print(f\"{i:4}:\",decode(tokenizer, i).ljust(10), end=\" \")\n",
+    "    print()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 101,
+   "execution_count": 49,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "2405771500d24b7890f87694d533486f",
+       "model_id": "ea9ff9bbe9364a3ea1ab9acc11abe338",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
        "Labelling tokens:   0%|          | 0/8 [00:00<?, ?it/s]"

[Remainder of the notebook diff and the GIT binary patches for src/delphi/eval/all_tokens_delphi-suite-delphi-llama2-100k.txt (Bin 0 -> 45121 bytes) and src/delphi/eval/labelled_token_ids_dict_delphi-suite-delphi-llama2-100k.pkl (Bin 0 -> 274517 bytes) omitted.]
z_Z>`|y>W9h#KZBBs6$6dtnpR9)6@(XN_wieudu89UB4ub>l8`@cBwchjW|qC)P=km zJ!&-#pVgWj>~|#EDG?NTXHj20OBsA5_52VaF#?n5@zLxXOq$lEh&#gL=XD`6D!18W zHSiLCVgc9lCi|PaKxEBUxu7?PG@(4#Qg9WzRs~PYR-%7?$lOs=epDAkUM%F&-i!$5 zF`zOAkTFH33pvQ4+4?~y8=KBZGS)NsIS!-49louq(k0`a>h{Kw4KHQ(6~%2I; zW|lTX?|j3O?Hq}tI05)c>`~T?W#J0SCPGPR-P9BXc9^;%Aj?T8$M>zPLTy-#gp11K z&qtMW9Im>Aip;CQYz*Y&-JrI&hv(Xd*^rFo31K#tpAxKrA9{UO=m(eYuSA}ujM{5MYwLH>Ino(j=^`f2)xW9qgbh=8&4uRqJpzxgv?_Uf zZv2iZyrv|oY8hTjKDqTbow8jfmU00+Q1qo-OSL?aymjRQ;J9rPTfX6~;87Pql+5EA zR*;}#e$_%rCG!qjl#8Qi{<1UA7ILe9`on?|Q(S%iNDWAtH z2y;b0pSHlEPwT0om!gbvDjPC`>b$}D$dH}hwoUDF;Y=}2y}@;*u_^N>3hsjZn zpvtZ4>N!5u0n3pSD(kDX-hZ)d_L_eaKq`TM8l57y)>E$)xcCXn0?ePkVo_@$9DaiF z_!pfy3zBc^EU-ULdCXQQQ{-;R&f(>`ROab$j+2^9!MTcfTUi{krC(Zh6bXTY0okp$X^a#tE>lqJT)Tj0vQzC_SS*`BAqhhoLQ1waU| zjJ@i6RKn@Ou$|qeA}7`NkJV=(yT|(=8lJ0lX)s+8q}02@atPe>-z4|}ime=sOOEUC z75BR8cw34Tb4Ly=yU5~U^9DLW?!t9Kuz6`&j<;($9rgG42hV^|hVL>@u}@ieJlY99t&3& zBIlAM8U??~e2#f;1=k}d^r@eCbynA^vWwNJujUM+QW{)#Q=Lx+y^6dxgk*y#&#!4Y zVpw@TLbNJEu$Yci2djcZS!1cB&gzV=3LZk*-b7e}00^a=3moFdI*yw%@2J;6k*t zD!8%c<%y zB1Ju$4h@mf{oh}^?|z*3lga##00|0Qw45=>v^+$1tlXDNmV>rN2P*c_qC~`rCyFsC zqpEDbUI-#P-zWZd+445Nk%h}o56@K&kKsS3N5EqKLppp}`gWF*p~&0PpIci&VkpW| z!LZnOV^@I3_;+ThPzJazlqB=(f;L)boM7Bt6k@#(UjxeI7f;(iGh)a&!?C7NYFKElKIzjITZExi@Iiie}Az>74aJfS66n?^IE67e&Yd* zE9wiGM3&pgIix%i6{zB8a5Vey&9jaK!Hz?0$#$sVCt}RQ;hf!S78US2Mq03T;q&Gb zD&qIw&7`-g;wPi~{;Q(_t&gkaSNRxfU*MJT2+yx7g)#mX&QR1(o-DzD3!t0D9{(3Q zGm@v>kJ9Ea7k8&o_PaKXlcjR)>Sj+4Nr`Az zLoWdxMRL^qhs|0dfv+}o%XujCKOcA1h7i~rm?~$Ji>H{?**Sx#pfYA^{NIHy75E>w zWA}l*2n}_d`ty%sntqe7fGorL3&^aJEUuFNbqnN``jfo& z?ob(tzs6rKlKvImT>Gi-epTbq3%HEthyln7>6MUlfpk|Vul9w}i2J-ITy}64Athe; zx`yl2`x_XQ=~44f{FojPpRC9MkIcoD+p9xlOyYe$pXSbzQ-u1eatK6IT?+nPGKKpo z`1fCER|WNWWKJ{oCQAODAeKy@ihtVb{k0YTlPuNlM)v<$k+zGan{gX2XWV@v4Xo>- z>VLGog%2YS3#`NPF>Tm{t_|9|UoV8HRCDn0$6a4fzX7G18ZB78N_r1R*%(i`Wi63u+ zhi0}v4+gpM&XqB7E`al?GNkO<>JHV#H*_ih`cExy55*~N35mI`IzzQ}rOMjZ@sQgZ z4=K~XSi8Zwz_(c+i7zfl^?Xa{&HuFGG#q}Sv$faPKcgKX+gu!ibN%~PWpLa#6}vbg zU`zHdXA=qe?=Pp$2XMi=6v^-9l`nD&MgUOFsIfBwc%oE`$+frTUc;7pKeG-JJx$z^uK%NdSac)|C2x0#b#x>WOi~9ckoeQuzXK7u|(>X-| zmKt{Pb^d#rkat2r#L81=17Mr-urL`+8L!6q0I!bu7fT#szuq_B(Fp-$WokwMzbS)? 
z$;pg>iY%QHfLC9)!bN+9(Am3^L~~r$qCuuh6}iA8-!LSs7*&s{qvqmu?9SL{goAt` z%8Zx0bnG)jR!dk6S1ysX3$dq)#Dg;exJF%sup{QqnE-9Z2#ci1+6il2haKImsBjf_ zro~s7?ohaDAxXWxr4)JBTyD+~q0^ag}jq6JQCCMc&8CtYdg0W@G4K5+ByIS0kjyNsgz_6*+ z<(z<@`ENWZ(X%1gNaFeC;9FkGmD}OIni9?r=Ba#HrvxyrmlCdTm0+zef@LQFFlkT< zQT+<8hOi#chvi{@EHk9TSLMB7vb4a>|dw5ZcmHe zT)7>+SbHJ}%L~`!S4MX5^Bp1eQVQDD zRsF9~EL2NET`T~W>nTT17lF47Y268LA*ZcZg}KT@aSiv?cfpHlfBt6~ZIz?d3Y-z} zWqd-(KT&TYFV}ESezP4I7oR9Pl&11OgYgRgb(H0~Eat9kJI?`iIhiQ?@j4p%T+E#T z*Hj0N*4AlVIRxr(kg#0{!el^7JFfI!_r1<=7a>SxeR^-I1?E!TRS8Gc z{|542D$dq=?4>Z*e)E(8t&;w?#De1g6{>-p!MMsERcrf(&vTzuzxjv1wQ_#03vD%D z*u~vfZ@<^bu-5jWApmzv;}VWg|4$aE^>s*#EM)B40BfaO{T|*rB%E{(0IO)HYGEy= zx{a5GwhX@VB#>53rAwv2Si7>CgUPWhJj>W5z7Os(fp;1W;@UIFA1Z>l-hA(U;Vz<; zTHJv1@V+oJ+=oF;r308(FTpoW)~I*oD|PU|+8+7I1w=?a(|r8EP{Tt65|ruY-;x4i z-99ro>>#o-MutK8f+uv5l_~lV?FIQpm)C!4HjXh>%ts6iMINI9=bQ++xqf53YktIh zAXf8=IAj`(*3tRi{%h&X!8bcHZTAAtT%qj^{rn zXUl)o%ogX#Ee!N3+pF%=Y$x4Mvz>NdW;=5dCdbXY&$C^0zmaXVBT>r28aW z%xt!^?yGG1M+n(6B|6*Qf9uBi%kFox9sIw2jJUt({xDn2TDFsJn(egvy=-UQ?`J#j v{wUi;ca?45y~uXiO|l)jxmz$j%l4`pvYm9xY^U8K+gaCVJMVt$!w>!+%KLai literal 0 HcmV?d00001 diff --git a/src/delphi/eval/labelled_token_ids_dict_delphi-suite-delphi-llama2-100k.pkl b/src/delphi/eval/labelled_token_ids_dict_delphi-suite-delphi-llama2-100k.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a429c5e9a0a8031609f799b771918ab60055e2ac GIT binary patch literal 274517 zcmbu|b-Y#8);DmvyHr3$1;j$Y7Ew|~IDjG;So8rY=@3Qo9K`@ZX}dANq7*x@yStMx z5#yaY-#b6+*?W%h8*BaUAMfY6_qosa{mn7QjJ5VY+i7JMmMl@S#J~TVx}aH?fB&Dk zHG7WjJ9_MxMiYjNEo?MqRNwvs7ZlH}+@bHNA!GXv8!~y|0QYaJbRN^F-GHG3`;Q$m z{@?#~)&KtesF7oaj2$v^g!>OF{`Vh@A2_<-|Nh(KCJq@kr0?iS?%&sU|H}>|M~)sa zWJKSwLq-g4)PLlNq2rwO-~Z{qR>y&32aX;-WW>PH?!R8c{Rf>#xc^}2xDo&RFO_%y zS+|knMl2|rTj}5baoq5Mqx%kX|K-Z=-}P|$hx8xzZ?J#=mpw+09P#gewRmnd|L;Z( z9PR&)|NnpTFFVd9`S1Vx2JXMpb6h_u-M<6tIca#mk^laG_HT;bevB2||JZ$O;eUav zyZ^ac-{AuXH0m^B?2xf;$m6<{bfqmURa{uQxUfudVcFusa>a$^iwi3h7gj7TtW;cB zxu~#8abeZs!fM5Z)r$*j6c^SkF055tSi88ePH|yfCzPUv(zH;97Ru5>Ia(-B3l(Ug zA}v&+h03&0g%+yPLN!{bP75_?p(ZWVqJ`SDP=^-k(n392*ohW)riEQ-p*}4%poNCC zuq!R>Mhm;sLL*w(gBJFrg}rE@F)cKqg{HKyH!U=yg?(sYUs~9Y7WSuw18AW+EgVP- zEoh-7EwrMAgJ|JkS~!Fj4yA>|XyI^LID!_Aq=nYBa1<@Hp@pMq;TT$IOAGC2p*<~h zpoNaK(1{j~rG?{Y;dol;ObaK_LKj--N(%+F(2W+l(?SngIFS}kqJ^Hca563QqJ`eH za0)G)N(-ma!s)cohZfGDg)?d4ELu357S5rCzO>Mf7W&h|09qJG3xjB3FfA0)!Vp>* zN(<-G!Z2DGP75PwVI(b#qJ{HlVKgm_p@p%uFpd_+)4~K=m`DqgXkjufOreGIX<;fY zTtEvK(!w-axQG^})4~i|m`Mw>Xkj)j%%O!MS}3N4i)rB!T9``w6Ka6R@1`0v~V9S+)oP+(87bX zu!a`a(!x4gSWgQZXyGARc$gM8(!wLO@F*=jMhlPA!V|RcBrQBe3s2L+Gqmt5Eo`EN z=V;-1T6lpLUZjPWXyIjAc!d^TrG?jM;dNSggBISTg|}$oZCZGT7T%?W_h{jLTKIq# zKBR?@XyId8_=FZdrG?LE;d5H}f)>7{g|BGgYg+h*7QUs0?`YwBTKIt$ex!xXw6KL1 zw$j2jTKI_;ex`+AXyI2{_>C5Rr-kjb@CPmIpoKqa;V)YFn-=~-Lb3eY3q}9__Ci64 zlK&A!^3OLY#D72EP*Bnn$}d19iK56aKopdM1o{z)g3>%uhOb3gB}LJHzlM>lMeNrw z3d->*$}1^szsXThflpBprbyNz+834h6qWfDRUiSovno$ig9I#Rb)KjJ30RAoJW?YQ5U9wlUffFa7K3G2ePx0!npD6q7cSit3FRO;A_z^q$tYxlSl=-@+o!; zDYE`BQ^D?hibgO6Z16qoL{a<~rV9476FEOpRrLRVq^e*qz7~yP3b<06D1>pDHB|`X zGTWOcnn5D;g*2)A*aPjMihq6MF#C7+@dpW+}s z#ld`vLqdwoN712y$b60*#uJAtgmL3LLLvOd7q%Q}zc|tp%t~zyQ^1va6eQq4+CT!P zIGV4;F?@=)3SsPvb_!wai}sL!iJoBk;v^|W_|aS`MW{tjg)sKT$qHfI_2L1xu1*7bQwAjzH6FFCua5)&w*J4aa zA)RKhFUAH!I*LL$rCN-G1WYlWuf>FrB6BS!212R@9LOY|m<)+9Bhn!-g(uGEiK(7o z7W)PE6jFCyXeXqVI*lL5MKA?yt?4{50}`RNq;k%*6H<51@&wZtvqK7Lv4??3Ddq$s z^I|XJ2T}}Egn>via@JFo#J0PcJICiYKmyL~E65a%xQ0)0EhO}Ex)j$bgz-`LdWA4P 
z>fYc9raNy;l_I%cga@me6v9}Gn-#+NM(GxwxYZL(Id2Ooq8Fv2!pg}tk}PLEA@q|3n_kkA|4?H4O~;!aO6<-9ATh?djv57H~RI}n-6d5@it9%)wbwOE}wg=;PU zvBJFyVO**ADTHy?x}UGb1AK}H?S!<<*4T;Y?bj`{wUE$Psym9-@x*$b*x-rGi`@<6 zp^zfF*x}ykh=&6qEp}MWjr>3!;Zr=yr+5se2fGJ=tUJOLG{ld-2OFZ#1 zUyE1x6tD8cYdrBfBtmORyVe^#@g`5am5RuD67bixw>`n^7w^CnaE-hRi7+G5Vt>z0 zNEhh$`C5EnOi?8MQ@IO1gaj<-N012Rlp6eFp7_KQOly4#Q@~n$#;5o^q<}|}RL(E> z6kmoE(m4Vr^(#9eEwiufgj9=fc;Z{0_zn`VwZ4Z0oRJ@R;zyp?oQg=EBjE+zmQ+Oa zRr*#>F#E+eKE+Qw@iR~S!V|wj0yg+>kbn*TJ0xI=?T~;e{(uBbv4bc6Ybnh&q5XN$rRS4sU8|C;~ln*H~Uz92Y zBL9|StVKnIFup#n#Mh!SOaU9b3QttcAu@baUoD5o@KJqro~YpoW~J7IDWsJeKHR7U z2{@^>`GM5oQ`F^&dOWccPwdPSyFen@7tyP0eMrE*XaEVAq9IT0>Ir6j?G`9#`3Sv$lN%%mO)Q@|NHOd*Ub^>BqSuGAwS5hhhS1dfCRtVL@`z!XREL>o^q zeQ|V1k@+F>m_Xz|WEv;6twh-O&UO-E|5L{H3Sk^b2cGEY31(6|@hOglL}+kn?>x>< zNGtVtp6JXIC-6iUp6JRG1$IK3)NXddO=|RV&>a$R`|aTgW{sQ(Q-lVWy7MHS=*bf& z^R?&|Qe-{^dIuu^5HMa2PEiQs_IoN{i_>5V*cYesL?3=2XTTJ2nVqQ+#z{R(A&irH zHcynFdN^1R4I}>czAI;P$7(cF-RedeK8mkp`6lbR%j=r zLtqF`4CRS)ApsZrFi3=QO0^iy6C-$HBu|X;1hdS}gDK#QjD`f9kuf|mmM6yX#CT6I zGcqBh$b8G0$fuYTQsiHh{Kf|-buv#(;feEkVrnWPdK6vY38pVD%$y>8Uu&8|7!SmY z6vEgS(;)%3oEe^A$~luyF-swg<($nAWR8-;ctt5v2;)GC6~dU}VxG8!C+70RrI3Jo z=VhK?8vJrT#T7g;k0<8y#Fc@_e5Ni4MCJo=Ax~Vz6Ib)ZBA!_638ul9q)HJzmX|^T z9!1L_0T=sno>;*X*YL!(JaL^Tn7+7PNnt$AZcqs0E$2o^z}C7660o&y=80Q8!OY05 zQVRP^soNyN{!;38p14CHjIFg&A&jkcCnR8N-31BQT6Y@><$F2zKmrbAl_!|)T%DgH z?UyI+%_nlcJOKxCA0)zzNUsU+j}k?N6{2e-=Rov7(e*%-C{7S?AP;(inUOUxMW}`J z>D^jLz=5n&2;*X3uMozkq76Lp5KlbJ6B`x6Sk6Zj!dT8ndEzmic-#}r8hL_G@uWf+ zXXGh`FwV%+kVr1}=r^&S@dQ)OXJHDsUu@!u=Xm0Io_GNga7JE)1T5!Ekbo&(=80E$ z;#Hn_%@a(6zpkV(F7`JR!noMqghZHB>6Y^rBw%a34T+FK`V8?Mo_N<2%%r{-Qe=Ct z<$l@feLlqpAw~YHF=K0es1U~1`iLLM$1p{*FOnDN@Da-=kbvF!DI{Qu&r%W5Gxc+x z_`(xRgMSHAz;b@Y6JPVhH;|AfHGG}rTTd|M{LY?2+J3*c6VkKU4+>%2FMd=A<9@N3 zC$>N$IjPZQwlx)z+;T!+Z1V)u7eB!iaGCwg6Tk2S`8A}-d?5Z7i2T#c(pv6J{hcJ- zufJ`FL})E(rTzg4IH@}z5mHDOr9VBvbmw2GQY6<%IF|ob2;-LXk3#sqfVC)5TK@eD zV~Uark)I-2i?Cmmf&}b~(w<-@wG2#=EN5~cVN%QTL^(SlEwl2FfaR>f6BQwmENAp6 zs$?gmT2#&_(!NTsl27EkN{0if$`jR65y|ch%dC1TB03{AJi#opnjuBz1F=>hG9QSw zNx~h7b$Ft#Czx{9<5TR!6Fc+7E_^NO!xV6(Hn0=YmebHqNY5j?LIUpKyFnt(VVZvfvHkNuP7~25z#A3OMW1&Ql&@^Bs>Wmq!7ma;$Vd^ zK3E+B33!ei>Ir7C9|lu|T}#?>4u=FB$Po%*oRK3H!nlLC_5@STqhJbH&Nh&MM6#UGy|WWf9Lp2O@x<|-VEUqSsua=gJOL80 zwYoq8PHI=4DBy{1kbt%54vA0;>Gs@^InUrzoCygyBWFP(lv7&lXWI#BM$WMlQi{G3VgFNu zei9M=8g=w!@9znwF9z@_2IdeMeg}F`4w3&m(4jA+NgWIc*qw!tfGLJR0;U+s6X!xg zABg*Gd6*|Mx0WM@^C?Csgyr$hr5LFYmd86sjDmz-PWQg^JV?O4810G7GvZQ=fhm#$ ziQaz4+6igz9H$VLNp%AmuMn0=b;Jaon5YoOflN{e<3J|!#1v03t#v-1VyZ$I%XvXQ zk@iX8!hAw{5|A2vnw^kNvx|6Qx+j=Poe@%G`&8F0vzdXAKGh9dj#SQBJTaRm=GX~0 zsnNZ&NFj_3UaSzt{o-PtxWp4oYt25Cg-isXz$%Xt$dU^#E*iCZ9%97yza z(5;?e%6S`~;&z_6gCEFBKE<7ofPHZnPu$HD_jrQoi&Zd1vOAN@EPQ!lH6&m;?^Oun z{o+1_FkTw(=ZOb+;z6ER1Bo!H(zDsxd?M`w_qu#Sdf=9BeCv5)15Z5U31*o+%%|80 z3AoH2fkfyFX+|F9iN`#_l=E?z0@mUQo_LZU$WwfZr+MNTg)nZv&nkp*``siF_K)74 zlL-4qZ_h&lw$=+g@uDY~HS!Wn0T=trkbt##1ro^=?tZbbWQmd`{{2^RVa?*gT1ADm z-Q$t}aj94BmUOJWW+$W_@O4ixQ}YI&;!Qgt)#5EXA=Tn-NWcbp#}iCB-wi1;zs-Ix z5YpRhIFR=tk?i#7bKVC$@gXGOKtAG$k0B9iA$8{`kbndE6cQnY)Zm{fgz;qjTp^4v zKEL3JFL~lCPY^40T(g3&`5fOs0(Qo?kbs@>9Z!7k38J6{KkzAjgaj<-W=Mo`O3Qr< zBw&iIkO(QH<-Uz4e&UIrApvXgizk@Y`jt=d8zjPvNPY1;Pi&V6`*Xz~o?yzkBc#Z` zq!>?21e}qoo?te>idEyA3IFct?dxB~3qf(_v zPHH$u+CTy>_M>^?7?N;*de#;a`j+E}c94LzXb%ZJh5I{@4xV88q9aTJ2hvF)jEntP zg)p9G$MMARo?yz^SxI56#R&>wtVNeVWWF494MhIsz&MZsg)k1J8zkTw>CO{9c;ZB! 
zILQ-CclHb^q`uJ0={CNT1CiesImfc}E~l5BaF6}bqo_Aui&K;o#s)uCA&d=v8c&?g z6MZ0|_k}x(&ftkNdEzWcz|-t(PcX~u96m)~p6JID{dr=5Czx^$gehQm4uS;ii@}h9 zDGGUF2u}?41T!P&rb-dr!G}Qt&d6{`z!@3A6C*vrlyelH;yg&`<#Z?eXhnLnwH z7?X;~e`a^YSe_W?iOe(NQjF(QOyG%$JTZwUCVPS@=M+B0`H+AOJ{1zl29I{<1(1L# zF7yOb&S@|O9LPmHF`XaC4449zb0$f+tLrRC=;d_8Y_b-Pn8T+i;)!COxR|fSB_W0M zoT~SQE9cxmNYAN|xRf8rWg&$$5ZGFm2SOSMB(C6zd3-JA!xV5vu2cx)Gy4LCFg~*{ zu1G~B z2jV|^bHp`}(3hDbu7yOhocXnI#C1GzJtW{jZs3UPN34KysIdA8QJ93B&-=JHWLuB{{-JLvf7f;-6C!|Te$4*Eu^jARw_Qh(RxEB(k z!KE)G-scHsvEL6519IWAqwLG!T6U>aPhbfYMk$)_^U26j*;G{kT z37FzxNWc^uJ;9Xo5tsrFdAw}kg%%=j8|Bz|iFP>Hi z<9_iBPdpn4SPSVi+Z2fWTKIuTeG$GX^;{|<{{rpyi{~K$Tk8d$c#$Vw@&wc1FT)hc z8Htwj6-dBxzN!$$HS(H57}v<_Jn@Dnm>GE!rhqf@7Eip*6YucEyPjam`JR%(*jn!^ zgt4_gfJAaqqqm$7Apw`!N1kBH`7ulZ2l7cOBD&>#suM+pm6a6!M|VEciQ@lDL{ogu z6JJ0APU@GQU?%k|m?BxuXf3|xiEkhQYw;~ld7fzRVo)CnWTNIN~ozz!ZP;#6OTo4kX&(CCZwA|3Z4m zENLgCSEi*h6XDOaN_&E7@G_}VJb=z-`awJJ|k$OPcY?d&!_015XQZ;qe58rPPfcD@x-x^2$L%9TF2Q5 z>GpekDk9oiogo2R>ja+Y0*O#gsTN)Bgft@skbt%5#uMFnq6bO1WBEi+WZo|vaS}|S zPpTt&^2EtJ(TlG|?^G$0D>dA;PEiQs-g&A*828T8AQ2{2T4tw10xq*YJaGmj!a$@g z=S)vzUL)=(IxD2e_RQ|q$k~CAp4s(*IN}_h=t~l=7X5glKTizs1T(1vLyByZ>S{5F zPcb;8$krDwMIlcN;fbL>G(PcY@21XIA)nrtMB3ah#9Oy0HJ z1GoPk-xNr|Nj;yh#Z;IgS&QUA!WY{wfCL=Kg*-705@8_HO1%gYa3Iqm5mHFI)(l9% z6f=2ZmM55HHk(f|2NI#2(xevg11YwrkS6tFp16c3=JLd)o?!aovXCO%vFsiuE)Rrs zEW;07#_QTLg)m;%mV1Kf&J{2PoYZR|0VnlZp16)DuJ;5}&KqC~shr`)cOxWVId9^L zn|b0Ep13s-(rKpG!kuQf1tRllb~{hp0f}T^L_f(~=?SJg?}RDza=JH4cR?ZyL^@ON zh6GG;4^OOuL>Nf^LuOH7T~~|Mo?vF=UYG*b;y#5i9!2*ngz+eP01~jA5AwtsPcSpG z7N&qRvQ8n4GqPSG{EUQhO3Q46L>Rxo?Do!wJdwGaj(Au~5xxT*1`>S|aKuKQc!Vb& z<%!2g!fiQ^dm{6UIN}MI0yg-Q3Sn&Urxe1tc}84{H&dlZuGH{g^_HEG_KUakiM0F0JNZP;egRwST}XtD zPrACkrx3;+{C$PUxuQrZK1fC6x0YL}AM(UUkkDturTCa9KH-T^dEzrqWL~MR7N5fu zQg?=LN__zdeMa1V@ufl-8~iJU$Zzoc?sO@>=8123;#*H-o>VuG?_dfzBj5AH4?OWB zUyIFriY<@`YeYJVwn75V$Tpt%i6?&c1k;_r@F{*(2;+Y7n?e}(i{B-}eqGxx5%%la zAD&=lWJjtL$?ID9tmw~FM0ET83leaR{LK^p1S0wp+j5OBQ4an71sq68PcV~Oice7* z5@F+uz7BFnQ5i_UT9oC9ay(ILn7-HprbzZhbW-=U6Vg-B zUUovdUo?gUEN2s*XbK5f&b>XsOlq@`B6@YT{2b~&fyn&*i+y=wKc3hh5^zQi@B}j> z&G{4u@ zoqg;?&h{H#+@7Hj#z{R>A@V0RdIvuX5^z$_hD0c*v{KLU1k)FN`4s(lqCZaz;E92r zV9GfNrht7hm?sJ$5n4;SYYnjzQg;sJiF0{k7*7oM1k)EI_!J{~ViZrD2MO5VqahKx zQ);a-kbo)1LIS232ML&BJS0L2X{Anp1WYjz5+Q{&Ba=MAtdYqu1suo}NQ8k%dyI)fCIUbCl+{uStASi6j$-Y)e2#3@I?w?Z1Ba9fPJyV6U>Y( zO_d_KUxaUgEQ18>&gGB@wU8G33OkW=riOuB0|_{gYdyit$aOG9s71~|!sXz4NWg*I z0EuLZ=uvbdUyGag6gS%mskLs25=CkLxb#-O7Pm!H_&;$q?pn7igmKrpgC|x>g#Ekm zcS=O~Zai%8yA;A$&bt-D_}29vNWg)t;)&Hfajz$sjqg5~BDv)xkD{>g-ESwP#r}Yu zkhb3kGZWz`TH^_(oNH61i0X8VkVJ(2BH3EeT0HCtW=1wDDU4gr zBNAb+#iJ5ouf=0L@i-)4UpxVcFsagFe=?s)`*`_KBCz!r?8>WDh`VLRL3yCnPZbtIU>Au489#6av30R8{c;Z8z_=qPy=7~={ z!F1=RFooV1?vG19g9Mz^&w1htp7;_H$yy|zio(OhSDs*INq@9qi@ufV$^hIe)ilV}*#qvMeri_Is{vQ!NN6PXl!as1U|os}fIC=4(*}rU+{!`c&lZ7gZqvS86qdFwRJIg)q)Y4NowwRg+Iq3ld>Q zq+P4FoscHA4o}qOiF%%3W@INm#m+fIhF_}LC5OoWrJ69Q(u~ySi3U8;kSBKKiQRZ& zcb;ej3Apj?;R&X}_k<~6YwZOIIFQDWfGL_l0;Xun6MK7tnUQ88g>?Icwb&;R`L|!= zfw-?i7!Sn#6vCKde}yonIKUIkj5Oy{9LN(bc%mgFU~9GFiGw`B%*erfibEg)_lrX% z!v2ZXVSFtP4=FP57e@plf4?v;vm+sqY^~_CS?g3p^da*oo@nC@`Qh_+T|o;ZOgy6{9-o+wZV<1MF~LKwH-?mW?hCr;#vlRUw!)SfU!SY|nE zBzz2caw;OaMtY?pk_Tc)(K{6pO>s&pBAVjVR75nzX*_W{PxSEw(^_ZnDbD1Hv-p9W z&8IkrC;IY4Kc48%69ZBa(PcIe67YU8h$jX^BDr@)7keQj;6R4(#88DW9?Rz{gz;D& z#uLLm!R%Ti_!J{a!hQX16eNmPDhOPMCNijVhowWedsloC&ux_ zc%GQR6B8i;_lrq9F_|Z(@WlC$NUo7+gHQDY)8H4t6#9&~HF6HCz$TMETqW%2Kw?qWPSsEg+dr_eDf5-c;lPz38tJ^!W7BH zo_`3q?py!~*jfvD;wp)-e?D@xM1;TSfwfrV31&tXD=CbvwL~F|t+iAkj475WgfYc( zNWd9c;R$9&uHjQ$%M;f@0xq-bArZz>2 
zxS1zz;fY&$;xwhz%xAYEKh9WYw;YP;(18GNqvDQUgU|FXkuKmf|ntZY_8~Xe+3e- zpszwAq>$F?Ymk5`UWY_TAuY5wJi#pYH<9Dv5@9@;-OK0?QW42z7Cw>ukgvr@N(y6ZeXJ11+tw#M@hMMymP2ItTdB`;i2T2m zf(`x!PkgBm#!3B3A&isyHBWrw31+c>3sb<2?>nCOUMGqQD<@MV7khYW|3N1#|76-9 z?SyouZnhKdKuiuKd=|I`68ai(d*@b9FnzHNrch?2sIabkv-J}s^np0yXGp*lzd!<} z_!Sa9MUnhB`oDRCnUUW^3h9G9IFRju$oxUx9|~dIJ9j9A@vikJBw#K6@&q#@f5Q~X z?u@<=`UetlASEiuKZRkOk&>QZYEeo_VVu;`JW+-x%JQ`+$EPUI6BT%(B2QG}iOP0D zI+m*_gmH~jRS3(C&mGIvJdt^sIifmDp>H{ksKFC8CBpcG=u*^@i137{Z+woZ%@cJX z0n1sJC+hLUPCT)*Co*@ZyRPj*rf@`ko@f9Gy|rA5h6-WaFLqT3%YNZf?B)q(Ms|lO zk_{ex9%*DJr0sVPNWdA{6B5aRMDJRAd4eftV?IR_NWfY&g+#Iz$w>{L>+TH+n4%d^ z?BfY$M)u`X?8g)P^TYvsEtNrt@soNK?2s|V4gSx z60n?y+6id~Kg>=@@3jtx1T5zfo?!NiBVh_y&enE9D(6vlLMmq)NWgL)?Fpuw$4DuR zPXg|i(^ewFlK`BNc0AFZCpvh7DQ8DMMJI(YUR{q>2;(y(g~1& z%d87ebcIB6Akj0mKqBlbwVOoPS88{KFm`7Tg)qKpJW(R-wKz#4?6v606DRXTFP`Y_ z31;Ivg->xRPn^aRr}IP~o;ZUf+}pJ?dEzXdIGZQV;fcPU$lTzroc&UzNFGJu>n!~t z0gs{q3Sr!S2P%Yd`yB)cIFP}f$UGzN-dPA!z%z9SB;Y`XLIS2ZmnVkt#BiP%!4o5S zViZrD#}lJT!fiQYcw#J1j3WuRQpbCOS?m*F3b;}yDui+GoTL!O2dl|EF~t*1InO6k zxbB?F6BqErg=8(Z|%YfV=OV{6TTL^z71_gXU{0higVd?M{XN_BQVk@g>@ zI!7UlGg71w#u+K*iHkkKEcQ$I6muZ~TkFysBEz?jFUuh^eEaxvPcY@Yf=@9I5}};Z zpIyx72XbXdA$=iHPvQ2?1%Z&hkO+x|kbqmxRglnAxGT!lo?!Z75uai)Pb}eyr983B z6HGamr%I9k!0je=1tef=UBeUC^2BvKalI#)8My(bNG`MJjNAwbI3qVfLP`<7JaKa> zBKkJ|7M{43CvJlTEa&YCVZ12ap%BK4(n_AVlPB)-1hYo&=2P6m6RUV)H6-9NyVp)g z&m;Fqg#Ck_`z6BuLC*u8VEW=gKE)cISPKcbU#x>f=nH95*FyrP*pNeHc$j!7hsf|S z@vtYDzSsy;gmOwx0*^ofmh(}bc#J0=_XJbUCtwO#&L?@|DV}(mC!XPnXFb8p$fi^& zqL;?!AORcvd7gNIuf>Zn1?6M~>i45WsvoBQKp~6|Rv#*a@xkgNp7_`k%#3^j zQ^1YyQ%JzY{uxhvt`Npa{X!v(llmo3d=)Z%iexRKFK+*U1gymliLk$v`coq8FQxwCiN8I;%*a0>MYhin z-KR7qD)PU7p%BJ1wG>a3=7}<%U}mH&pQ4;XSavPP!Psk=e~rr3=qc2@|? zQRMCyjTFLi6ggrKPcRL>C!b<3NQ6n17JFkzz)5Yw6HPtAlyh%BMKegia_$2OSk8U< zTI|QC*dG$G76;e~Y2$0I5XQxRph6fIdkdau=?SL6Tk$ClQV3%?4^{|Y&d?XqIdTXj z;2Js96HGY|<5L{Y6G!mGkvT+$3v}xoBEtpxD4uBJ31(7{hAEQW8GXDv1`@D4+bV={ znYB|0KB>QzUn-=wk296Q@7| zPU@+UfCD*=Cr*b197rEeFq3))Op&ZbbeWx*iiq~bSv+wzPn_ckrks6KrHGcZA0%L3 z^oK;4ROxas01`08Ku<8`9K@#>%oBw?F+?GZ=g3fnFrFjlLL&5qbmJSQ5XONFR|w-k zMnEDtkm!~(QXz~38Kn@$ft<$^qj_SCCzzEw7N$tfNOI!~U!E8T3Ap8qheR?(v@a%* zg!?>oq9>SgPU2Hc=7}kg&}YQe;(SQxGvbJ;p2%EIM_dq6Wc%`jBQ6Yt^at3m7Ss5F zT*RlC&J#0uVkRV#eUaQR!moVIN=4)^b~mZBCBps%qd5{`|AJAGCo(s<``oOUPjN9M z^agh+E`daHQuAlT5pyNNJ|mY(gndRX^8_;^m-8vEfCQY8c|0+nuf>%xMY1pQ7rX0= z1qxxjt}Rpu%XQ7|7gzDb)jY8X63KE#m)T-Sz}8yA6H9qwnL-%*V!1;2zJTRi!4ua& z0?x>_o?v#Z>-ZGc^TZ912s0wRzqrv(NS~+PXMw-S}>S1ni4jdEz#OFfOy( z6~eg8?(hUNBP&CSY)=C2D7rHc*`5R(aTibA&DY`{C53TPS1E+$Om($btq{f(_bP<3 zweEuiTxRz}BJ5hymh*rom=}hH&d9S0VVsdoJnGan~;D5d5b6BhD358(F5@vo_Loh-s6e)J;C(F2QUSk z)DL;$BcAwJA&lq9CkkOaM?Qr_=uYYO`&kZ=;j6=+=MWjbI{bwvnAZ9-q>yeop%&63 z%~yet?iYcO%K0@UV1s`HiI74X$hSQ4ohO(X`Cdw4|9s>Ji3p#Mz;gZwiBL{yQa9TP z=?QU5Dk6H0Z1n^)BimpKSk9jy0n7O_Bw&hP_*(obrLcGBZxUhe&fgWnc#dpW2;({O z2T$yPMCc1?*ZR{FOoRW$r}!HZusi>O1gu4gO7ibt_(}CdQDL>5EyumfDK4z*URIWb z1RO{yPcSo5I;4Qdvb6n{2}JT(PW97mW%+@Wi>65a`Wu|o@{oWtQb8e%`$a{CFzy$X zAQ5UIO={(QBJDpBrb<4M_MZq-RUwS!tfmmga#rVw8lGTQYE3>xEl7mEkPd;`JW+=y z>heTAPcSpGQ>qlvw8i~hTV9{>q><7=)E#(^B@38p(+C@GAGKuexz1&L%|ME8q>c;aABFy%Z1 zrht8MC{G;56Nl#z8D1YBkwav7eS9QOw1xz1t)o1_^hKLgDWb>n(L8YsBw%;8g+#KP z(Sfvs1WeH$5->#vg)nZv9Tmd3cXr~5V<8c0Aw9DnXD6h`yW@GHGf$kr6I~zyyR$1v zxG(t^@I*IAguaj_wYw*njjsnx0n2$JB*H+X20zJ8TB)NU0S7V$5+Q}OM#csL zek2wKiS8FudEx?v zFm8MoDunTVF^wlK@&wbJ(_xC_j6@$MX7I#Jo|wfGvpvC-b55!h(FQNF6Vf42ED`pL z(!~;CzbIXjib&QXJnGI>2;)k(>sl`8ecJUbD6m~hAPd`Rf0 znIo>`i3L2dFiP0J^5q6{Rg_3xQNoPml+%C0?TD*Ak-0A%u_#rF{2km8i+N%RPb?(~ 
z_l3k|kkGr+5z9T1c}5(uBBaQ6`*p-MfyjLOy%rM5?#%BCm*P6U7T2dr5j_NMuoKe8 zcOxWVU);nKH+zC>Yuy7=z*?+Q2uo|ZEoZet7_V#h^2B|fV9I$vpW*?9FqZQ{g)o+L z4Nt7~1XIp+Fh%k-i!S!{kbn)ofhQhP#4nmueMqHovSGTR6VSc^wI!A$C-Fh!_E z^aAY$@|Z#x*T~}vVO%3m@Whjl2(^&<;webLNqrg;A%)ah&v=6Ai)Ud9IFL;|@f=S) z&l4~3#EX!Clll@*yv!4?@WiV;@tU2GR_g0HM27D)zL7&@_)g=Sfyn%v`c@z^Kc~LU z6YucEyPjZnt@mJxbVgJ6C zJBpe{3F-S<`PVf^?Cl9=Mw+Eck$j{H58V4GgmF^$RS4sx?#C1R^TYw3U}mH_OaZ&| zKs%B1NoJVT7O9A6cednf(F&%3hrmHRaWEv{j2yz(;!t}E>8^DcBw#rYheW7_l;Q|T zz!XRFL~BnlEA=QoMH_`M?wv;~gmLdYhOb3iK1Dl;uzz>0y+qi*yVijxI(mX>txl;@ zM33cTApw`!aSCDV&f^ur*qxm{!Ibj^m?F7rMa$VG6%joUyYfT;(rG!n`W7oY42ronrK6q#Rv^yX8X!lyVj5Sd@ZoTd=QUF&p( zFz#A?c;XC5gzl8q$eECUYve3Qz!YcO3F&#{9EC7WYF|$<%dDT0!uLg}g|uJvhXkCF z0g!+x2J*xpNWg&%_5?E{g?x%33Sn%mp$cK#FV5wOVV+>hIUJ^dtu=xtMk<7HMn)-w zaYoMLiP4^5W@HRZ0cT_^PmH69g(XUqEb;HZ3dYk`!30Ray=NjMU|lEi#AHt}4KjsK zalS$r*V|NuFs`=?c;Z4&Fy)*EQzZK$`KTEFj_)E!!0w#R6Ek>XCQr=biP=0c2NGdY zrJHn-osjOZ#hHoFS{FkCw$>#)F&7e{oKh_=RS4sjbD2UIx17s);tEeNTh2U~LMmtI zi}`j!YVa!|0b6SUPb`E)vKG~q~5|4w?ZO0snL7qZFWLB z1a7wz(hIITAOXv{k|*xuiMu?(tkk<<3TaZq7cuU!6Vjxvf&`qA)qE}Pg(;G?h@R~C z@x=W+@c>UeC=uq5y2mvuSR*0f!)`dOYk6WFB*L^x{kh&wNSn?EJ0Ycb$WBNPH4iI< z@u6m;LKq)v9`OXTZ#@cAgh`cVpMXRdh}0KPLIS3E3KAiORL-X*!n|V? zJR=cdzkn&8^#s#en_vo9&gXdIc}Rp>NPY1FB;Y_^ghWUo^~Fn`U}ogyoD>=UxaXA| zBEui|yb1|ei`RJKb)I-bA&iH>n+o9%0a%N-c;anHzy^N@63KxiA8Nw=_gzmg4gMZX z0SEHFLKr9Y1BEc2>>u*A_=r#OF;9HL6QAXTH zNEhgDQW4Q-#BV*p>=)mK6q#Qje;)|=0$D2O4|YN-=Z~p~XgN3Y#1=@v?RTpum`U9R zQ^2F>CxtK`0zWH+@nru660n@V^2BeRU}ofZm;%no_Ebc)!T*2+Eawi1uj|d8>+vaef&^^vofX1(UE4(=jL+=#0|6fFn!SkrhqH8DI}7$h+dTT zh6GH}j3@SiL>P#)*!P759LRo<2q~ofVt<}Ez!OYsHBXfyxkke4;{zc9%h`e_TJp7M z6;i-+M4Hrt0+IO~IhZF7;fX^%!L-(4sZvB+>+n=Wbk{n9CywNa);w{PCzu&&lPX1Y zMvhhpsc2 z%cnSwCyrMLV|R8|2xE7i0Ey(~Ai2!KHNHzKBD(Q)<%t5G=*AP>QxVB>hAT>sR7A9# zC-THekboDZo}S3OQr-KDllc_A6vDX7dMku+nVrHDr+R`Z=V>s7wAe#;o^B_kqo|La zh@K;^FV5hJGa&&R{47W$%Nbp%XY<54JkggY`gwxs&i<)VMEA}CkbsjqP$7)_#UO?7 z`vvTa!8}n230R9Eo?s?*C`QtBlw$=qa zaUoAk$>NWx%ou#|0Uqwd8tAe`{FW%F!sgeo?zC<6?}?$3Sq3pe1$O9 z;!2)apb*A^Ec663BUdRYd@aHnaSz-%4etLrWml&nqK~?Zcw(_c7++$$mAXVCj4!br zu~Z?9le$bH@@FJk&gGDRTh0nkFs*eBOp$D@=s>RJiR*acdLyBHdEy37Fy*|_n8N?# zdZOjLi6?I6iCcK$R-U*m6%pO|Zcjx-XXFlrFzy#CJ;Airol1)QEhk!wyC4C#oV$7A z9!P{ON4oK?f&?7MYDk0>(o5`ndE!1#Fnw{qlET=X4=9B3uJs@!U^&pj6t>IOc=Ly!nFA}zCrApvW#ktZJEiAO!b%*bOfMJQ*^9Xx#W_PCvpX5VFa_+3mlVQyn!T(L z#?$N-NWi{$l_y@~iPt^BOzIn{Qbg~aZ$bk0#aocjXT*JL>}{TS2NG}~??OT!h|6ON9Ls z?ivzd|Af0HPt@Xx+MZxqs}4*7_lvp;VcdS}DTHzR-H9i5=80W+qP{1XNo~NVXvh=0 z^2BaDu{%#R;)y*x!A$C&Aw}kIrSBDp%->3H3<>E#3?FVZQ3&IH(NrOf7wEk`!OTcA zm;&yd`zVC57W*oMaqrxZC-&!w13baZNOPDX*`3jg(t$kDf+t$?L@Q4)!*YlW|3I$co?se$1fOCgB*LUh3eGW4c{dSOB?Bnf(G>{39(047DVxlLQ2A>pCNH4KriphbHUScQ9nLLWZ zQ_&PVA(iudNWg(ig+y{7(Tmaro?vF=LO#VbNWfZL#1qqbVumM}a?a#a%z{Md3#mJ2 zLjumo9G)nGL>P#)@fCZ5nURZO3RsIvcw#OjLM^08y;LEL4-=Q=6KPMg%kznx(+rmL z3P`|O%u@(sE#@nP@%_b>Jh1>0u$&7$!K{(1_!L(|B1~%Z!OGn)7TF1DjV!hk(w4IX z5^zSA^29PvFq68RPqBh0uHlJmdEz=qNZlFUUtA9f*cUf=f|=ACVG3y=;r4qIB;Y`9 zhD1mqt<+l}p{H|+d z-3nnm+3!&Z<15oukO-|Mm2)*D^jf&qx)%~5MYO>kaUW0I?+KVI8z@^MMPhLY_t>7N_|8kj4vzQ#`mZvGMCd4k4Y(_ z?>m#_44+Iqo{GpXr%Uk!Pdv#JPw~Xlp2$2SZtr|1Rf^<{ga&^$6_Gz9F2yFEc#bEY z=ZP11;zds|lloGs6v;^q7wDHE0S|#!6vBARc~v2dx185J!IblLKE)f5fT!7;kbpDt z79^4>l6?^t``eI!Dc`lseT6VK_y-DMZ14{~!IbkOKE=m8@d;0S z3W;Q2M317+c;a(Nz}EVLC%()fGJIj+s~jT37Y4qD1e}p?Ji#>hx2aM@H@@#60Vnl) zNWfbBz!N|6#O72)w47T!!A$DbR4Jm1eH%~w1PRy|Kl8*dJn^e1m>KzvPw_iXZ0CtT zcw&bqm~#FZQb=DUf&0Z@fsnpP0=x5XejxwA6v=@k_lxj6Qlgst`xnNGQc0dD#S^7@ zq6|-zg+y{jqI+jKo+!@~6(9i{yrL(Vm0Ag=NR~6%;NiilG9=(Ysz4%{B08y6ApuiV zYp3RA!tIZPpp 
zGjg~>_!&voB6*sHCxIg%0S9uVCzx`!hAH4cjf5XM?`;)!D+5gJ_T&f{{33{L{b=Mc#!0b@BkD}=F}C-6iU zp6KccraKGx6y12DyFwTzwTD6&C-p>6Fy%Z6rU+|9+J1XxCc=Suaw;PFUaJ>R^yZ0E zJi*M!sUbz?XS34+k@?x|bUPvSMIVJQu8}i5!Ibk%C52yRQiF%?JPQ)=esMM=k}0AM zehwsHioQJ24-(0NByYdrT~2>bFq1kURf=di2ighgaxe%Ia7G400+zGT6HGaWkSW|Z zZihlbuZ1JdB?C(#rx?W(=kdg7iLk%89U~F>FK(q5 zbYpp998Zk*1k)E2Ql&_4zu`2S2nl_qx@9&A63JaFx<)4R#1v03NdU)Qdd9%*b>pMRc)88+-;N;H1vviCH``ntE4bKvtJ5{Pzz~BF0&KTfp|Gziz{FXxXk83 zBGf_}$b6o-QX!13wIH8JyG9n~6Ve)y`r;~yurxZV>?Id4dnB6&p#uRv~u1Z=IFAQ4(i+AnUF2>X6< z3s2lC5&2t=G?3dQ!ak7OB_cnCbQIme6DxV*PM)}nC+_wHv&`;+Dc~Aer4YuGeYHXu zZ+!Ri#C?zmi(Tr@`#r(T$OBRe`!ah_BEm9*lez{Huoi0}0aL7l1Wd6W5-`OEo_L5S z9`*#&S{s!V#+CYrLKs)-qdf5#Pdu&=##%g~5c#izl2?@Q!2Kj7V0S*{38pWehAEQE zEIE*H*LnsLa3Ig}#3o222NHd-dd?F}IiKfKykIA!>)MMFVLwM+l8A7Qz;eFK6R$u5 zw$`gW@fuIO4vA1sX;R;?6VessO-R6UzU2vKjl9jLcn1=poKh{`<%#zs!oJMjmk9eZ z`@l{}t@U9(k@oG{NBM;Gc1@a*k3GS()+hE9QY}8U6K){+2cr9&{WD1D%ghmi)qKJ3PTm>Yp%0SZ30({1;FB%@hCdM2YI?_b*^AO7cW0J0Wd3 zr4_>1;AK3)v{qRqh4B?gxl}~-C@OC!q;giU6VWx|u4@%Nk-40Xs035!8=oU8D}=Ex zswhPMq(*0?Do<47iRwI2!xNb&)!i>@@+oTB38}$r+lgp{yMff;zNj z%gohcXN52>vt1OzxXkMFL<2~`S~P?NJX3egAu{|0;BGlYhQ9#ZT_KD&zD7K;2T$zj z31*G##iwYj5XLpqL?Mhb(i9TO?u@>C*jpit18JrZ#)0g^6Z?9C>5Khf3RuqlApy6X z10a!15xt@`hXhPL5=3uI{p>`vwOoVuw-eE;pCbl% zB6B$%F%YJJ%WM!&42DElsX4U>x5q+Az<~_$MCNk3N5!FhigOjhc*_~45XJ*>I8Th= ziIF@piYLzV1k)FzVTxpfN4K0Y5@GxzhFk1oCBpbE3P+5CgucuiF&+}I784+$r*JEE zq9>S1odi=P2NHeeoopwf=ZG7~lpG?%9sK+pB6$Zl?iW)%!Ibj?KE;KQNX|%fnN8yd za#5-j$-W3Lb*4iCHuwxkz*@}YiCK_<1DVYeb07h0Q3MG%kYZ0TYvf`jh4FH5i9#3` z`&^#5lqW9ZiOV4o`a;@%udox+6XHBbz}A}Y38uBKgegKTa_-<^*IHmFq;f9IAu^n1 zSLG1-r&)3!(L>;BPcY?N1XI9CT?`30kR|*;mhvf<@x*eTSiuw5c!KDQam@;@O_d{g zJ@5~^?h1WfDk8td9dW&#kk;xAc0$_vZsduZc;aTBxWyAhXB6BDQ@|bLHlDbhC+^^h zl{|4LB*N|~HTYeSfHQJ8Bti=5RJ}(bjCa0O3SqqSt>%e)dE!2vxZe}ZN_`-t$o3(N zduo3$5YmS%u(j4eBH3E`hk#45mM7Nn#Ck};T5Ny>ti?k-@i0$p4)qPUw7%$K2Ln05XM@3s1UvuQg?=r$v;X(M7#52PcVJ)NvaglW%j9^koL~c zAOY9N=RENRPkadp*cV?x0@mVdp7@3*zU7JUJi#>h_k4;UAQ2{2YVaRsa#*5P5kbqmxKXyW>#kl7G4<0=_wEzGB literal 0 HcmV?d00001 From 210a3da34ccdda1b3ddf59e9da6208fe2e44b342 Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Wed, 14 Feb 2024 18:02:25 +0100 Subject: [PATCH 20/29] small enhancements suggested for PR --- scripts/label_all_tokens.py | 4 ++-- src/delphi/eval/token_labelling.py | 14 +++----------- tests/eval/test_token_labelling.py | 18 +++++++++--------- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/scripts/label_all_tokens.py b/scripts/label_all_tokens.py index 7cd79b5d..bd66c24a 100644 --- a/scripts/label_all_tokens.py +++ b/scripts/label_all_tokens.py @@ -60,7 +60,7 @@ def main(): tokens_str += f"{i},{decode(tokenizer, i)}\n" # Save the list of all tokens to a file - filename = "all_tokens_" + model_name.replace("/", "-") + ".txt" + filename = "all_tokens_list.txt" filepath = SAVE_DIR / filename with open(filepath, "w", encoding="utf-8") as f: f.write(tokens_str) @@ -93,7 +93,7 @@ def main(): labelled_token_ids_dict.update(labelled_sentence_dict) # Save the labelled tokens to a file - filename = "labelled_token_ids_dict_" + model_name.replace("/", "-") + ".pkl" + filename = "labelled_token_ids_dict_.pkl" filepath = SAVE_DIR / filename with open(filepath, "wb") as f: pickle.dump(labelled_token_ids_dict, f) diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py index 1ce88e4d..9f68d4ed 100644 --- a/src/delphi/eval/token_labelling.py +++ b/src/delphi/eval/token_labelling.py @@ -1,14 +1,7 @@ -""" -This script creates labels 
diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py
index 1ce88e4d..9f68d4ed 100644
--- a/src/delphi/eval/token_labelling.py
+++ b/src/delphi/eval/token_labelling.py
@@ -1,14 +1,7 @@
-"""
-This script creates labels for tokens in a sentence.
-It takes the context of the token into account.
-Additionally, it can visualize the sentences and their poart-of-speech (POS) tags.
-"""
-
 from typing import Callable, Optional
 
-import spacy  # pylint: disable=import-error
-from spacy.tokens import Doc  # pylint: disable=import-error
-from spacy.tokens import Token
+import spacy
+from spacy.tokens import Doc, Token
 
 # make sure the english language model capabilities are installed by the equivalent of:
 # python -m spacy download en_core_web_sm
@@ -101,8 +94,7 @@ def label_single_token(token: Token) -> dict[str, bool]:
         Returns a dictionary with the token's labels as keys and their
         corresponding boolean values.
     """
-    assert isinstance(token, Token)
-    labels = dict()  # The list holding labels of a single token
+    labels = dict()  # The dict holding labels of a single token
     for label_name, category_check in TOKEN_LABELS.items():
         labels[label_name] = category_check(token)
     return labels
diff --git a/tests/eval/test_token_labelling.py b/tests/eval/test_token_labelling.py
index a57d0062..a727ddc0 100644
--- a/tests/eval/test_token_labelling.py
+++ b/tests/eval/test_token_labelling.py
@@ -7,7 +7,7 @@
 
 
 @pytest.fixture
-def create_dummy_doc() -> tuple[str, Doc, dict[str, bool]]:
+def dummy_doc() -> tuple[str, Doc, dict[str, bool]]:
     """
     Create a dummy Doc (list of Tokens) with specific attributes for testing purposes.
     """
@@ -57,23 +57,23 @@ def create_dummy_doc() -> tuple[str, Doc, dict[str, bool]]:
     return text, doc, PETER_TOKEN_LABEL
 
 
-def test_explain_token_labels(create_dummy_doc):
+def test_explain_token_labels(dummy_doc):
     """
     Test the explain_token_labels function.
     """
     # explain all labels
     tl.explain_token_labels()
     # print explanations for the first token in doc
-    text, doc, PETER_TOKEN_LABEL = create_dummy_doc
+    text, doc, PETER_TOKEN_LABEL = dummy_doc
     tl.explain_token_labels(doc[0])
 
 
-def test_label_single_token(create_dummy_doc):
+def test_label_single_token(dummy_doc):
     """
     Test the label_single_token function.
     """
     # create a dummy token
-    text, doc, PETER_TOKEN_LABEL = create_dummy_doc
+    text, doc, PETER_TOKEN_LABEL = dummy_doc
     token = doc[0]
     # label the token
     labels = tl.label_single_token(token)
     # compare the two
     assert labels == PETER_TOKEN_LABEL
 
 
-def test_label_sentence(create_dummy_doc):
+def test_label_sentence(dummy_doc):
     """
     Test the label_sentence function.
     """
-    text, doc, PETER_TOKEN_LABEL = create_dummy_doc
+    text, doc, PETER_TOKEN_LABEL = dummy_doc
     # label the sentence
     labels = tl.label_sentence(doc)
     # assert the first token is labeled correctly
     assert labels[0] == PETER_TOKEN_LABEL
     # iterate over all tokens and check the labels
     for token, label in zip(doc, labels):
         assert label == tl.label_single_token(token)
 
 
-def test_label_batch_sentences(create_dummy_doc):
+def test_label_batch_sentences(dummy_doc):
     """
     Test the label_batch_sentences function.
     """
     # create a batch of sentences
-    text, doc, PETER_TOKEN_LABEL = create_dummy_doc
+    text, doc, PETER_TOKEN_LABEL = dummy_doc
     text = text.split(" ")
     batch = [text, text, text]
     # label the batch

From 6a4a42df36b542405e374f47f70ce8435bab9ffe Mon Sep 17 00:00:00 2001
From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com>
Date: Wed, 14 Feb 2024 18:37:47 +0100
Subject: [PATCH 21/29] rebasing

---
 requirements.txt            | 10 +---------
 scripts/label_all_tokens.py |  4 ++--
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 4e6c5358..5fdc84c0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,14 +11,6 @@ beartype==0.16.4
 pre-commit==3.6.0
 isort==5.13.2
 spacy==3.7.2
-<<<<<<< HEAD
-<<<<<<< HEAD
-chardet==5.2.0
-=======
-chardet==5.2.0
->>>>>>> a71e2a8 (improve notebook explanation)
-=======
 chardet==5.2.0
 sentencepiece==0.1.99
-protobuf==4.25.2
->>>>>>> d50e206 (update requirements for delphi tokenizer)
+protobuf==4.25.2
\ No newline at end of file
diff --git a/scripts/label_all_tokens.py b/scripts/label_all_tokens.py
index bd66c24a..eaa2ea41 100644
--- a/scripts/label_all_tokens.py
+++ b/scripts/label_all_tokens.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 
 from tqdm.auto import tqdm
-from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
 
 from delphi.eval import token_labelling
 
@@ -82,7 +82,7 @@ def main():
         # decode the token_ids to get a list of tokens, a 'sentence'
         tokens = decode(tokenizer, token_ids)  # list of tokens == sentence
         # put the sentence into a list, to make it a batch of sentences
-        sentences = [tokens]
+        sentences = [tokens]  # CHECK AGAIN
         # label the batch of sentences
         labels = token_labelling.label_batch_sentences(
             sentences, tokenized=True, verbose=False
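With `AutoTokenizer` imported, the loop in `label_all_tokens.py` boils down to the sketch below. The model name and token ids are illustrative placeholders, and `tokenizer.decode` stands in for the script's own `decode` helper; only the `label_batch_sentences` call is verbatim from the diff:

```python
from transformers import AutoTokenizer

from delphi.eval import token_labelling

# Placeholder model name; any tokenizer with the same interface would do.
tokenizer = AutoTokenizer.from_pretrained("delphi-suite/delphi-llama2-100k")

token_ids = [5, 42, 100]  # made-up ids standing in for one stored "sentence"
# decode each id to its token string, giving a list of tokens == sentence
tokens = [tokenizer.decode(i) for i in token_ids]
# put the sentence into a list, to make it a batch of sentences
sentences = [tokens]

labels = token_labelling.label_batch_sentences(
    sentences, tokenized=True, verbose=False
)
print(labels[0])  # the label dicts for each token of the first sentence
```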
From fcf4ba658b3600bd844734bd2db0ccb361d694fa Mon Sep 17 00:00:00 2001
From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com>
Date: Thu, 15 Feb 2024 15:31:12 +0100
Subject: [PATCH 22/29] improve optional downloading of spacy language model

---
 ...tokens_delphi-suite-delphi-llama2-100k.txt | Bin 45121 -> 0 bytes
 ...s_dict_delphi-suite-delphi-llama2-100k.pkl | Bin 274517 -> 0 bytes
 src/delphi/eval/token_labelling.py            | 19 ++++++++++++------
 3 files changed, 13 insertions(+), 6 deletions(-)
 delete mode 100644 src/delphi/eval/all_tokens_delphi-suite-delphi-llama2-100k.txt
 delete mode 100644 src/delphi/eval/labelled_token_ids_dict_delphi-suite-delphi-llama2-100k.pkl

diff --git a/src/delphi/eval/all_tokens_delphi-suite-delphi-llama2-100k.txt b/src/delphi/eval/all_tokens_delphi-suite-delphi-llama2-100k.txt
deleted file mode 100644
index 438dddae5a703d7c5bd4efc3b840708916eb9dc7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 45121
[... base85 binary patch data omitted ...]

diff --git a/src/delphi/eval/labelled_token_ids_dict_delphi-suite-delphi-llama2-100k.pkl b/src/delphi/eval/labelled_token_ids_dict_delphi-suite-delphi-llama2-100k.pkl
deleted file mode 100644
index a429c5e9a0a8031609f799b771918ab60055e2ac..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 274517
[... base85 binary patch data omitted ...]
z_l3k|kkGr+5z9T1c}5(uBBaQ6`*p-MfyjLOy%rM5?#%BCm*P6U7T2dr5j_NMuoKe8 zcOxWVU);nKH+zC>Yuy7=z*?+Q2uo|ZEoZet7_V#h^2B|fV9I$vpW*?9FqZQ{g)o+L z4Nt7~1XIp+Fh%k-i!S!{kbn)ofhQhP#4nmueMqHovSGTR6VSc^wI!A$C-Fh!_E z^aAY$@|Z#x*T~}vVO%3m@Whjl2(^&<;webLNqrg;A%)ah&v=6Ai)Ud9IFL;|@f=S) z&l4~3#EX!Clll@*yv!4?@WiV;@tU2GR_g0HM27D)zL7&@_)g=Sfyn%v`c@z^Kc~LU z6YucEyPjZnt@mJxbVgJ6C zJBpe{3F-S<`PVf^?Cl9=Mw+Eck$j{H58V4GgmF^$RS4sx?#C1R^TYw3U}mH_OaZ&| zKs%B1NoJVT7O9A6cednf(F&%3hrmHRaWEv{j2yz(;!t}E>8^DcBw#rYheW7_l;Q|T zz!XRFL~BnlEA=QoMH_`M?wv;~gmLdYhOb3iK1Dl;uzz>0y+qi*yVijxI(mX>txl;@ zM33cTApw`!aSCDV&f^ur*qxm{!Ibj^m?F7rMa$VG6%joUyYfT;(rG!n`W7oY42ronrK6q#Rv^yX8X!lyVj5Sd@ZoTd=QUF&p( zFz#A?c;XC5gzl8q$eECUYve3Qz!YcO3F&#{9EC7WYF|$<%dDT0!uLg}g|uJvhXkCF z0g!+x2J*xpNWg&%_5?E{g?x%33Sn%mp$cK#FV5wOVV+>hIUJ^dtu=xtMk<7HMn)-w zaYoMLiP4^5W@HRZ0cT_^PmH69g(XUqEb;HZ3dYk`!30Ray=NjMU|lEi#AHt}4KjsK zalS$r*V|NuFs`=?c;Z4&Fy)*EQzZK$`KTEFj_)E!!0w#R6Ek>XCQr=biP=0c2NGdY zrJHn-osjOZ#hHoFS{FkCw$>#)F&7e{oKh_=RS4sjbD2UIx17s);tEeNTh2U~LMmtI zi}`j!YVa!|0b6SUPb`E)vKG~q~5|4w?ZO0snL7qZFWLB z1a7wz(hIITAOXv{k|*xuiMu?(tkk<<3TaZq7cuU!6Vjxvf&`qA)qE}Pg(;G?h@R~C z@x=W+@c>UeC=uq5y2mvuSR*0f!)`dOYk6WFB*L^x{kh&wNSn?EJ0Ycb$WBNPH4iI< z@u6m;LKq)v9`OXTZ#@cAgh`cVpMXRdh}0KPLIS3E3KAiORL-X*!n|V? zJR=cdzkn&8^#s#en_vo9&gXdIc}Rp>NPY1FB;Y_^ghWUo^~Fn`U}ogyoD>=UxaXA| zBEui|yb1|ei`RJKb)I-bA&iH>n+o9%0a%N-c;anHzy^N@63KxiA8Nw=_gzmg4gMZX z0SEHFLKr9Y1BEc2>>u*A_=r#OF;9HL6QAXTH zNEhgDQW4Q-#BV*p>=)mK6q#Qje;)|=0$D2O4|YN-=Z~p~XgN3Y#1=@v?RTpum`U9R zQ^2F>CxtK`0zWH+@nru660n@V^2BeRU}ofZm;%no_Ebc)!T*2+Eawi1uj|d8>+vaef&^^vofX1(UE4(=jL+=#0|6fFn!SkrhqH8DI}7$h+dTT zh6GH}j3@SiL>P#)*!P759LRo<2q~ofVt<}Ez!OYsHBXfyxkke4;{zc9%h`e_TJp7M z6;i-+M4Hrt0+IO~IhZF7;fX^%!L-(4sZvB+>+n=Wbk{n9CywNa);w{PCzu&&lPX1Y zMvhhpsc2 z%cnSwCyrMLV|R8|2xE7i0Ey(~Ai2!KHNHzKBD(Q)<%t5G=*AP>QxVB>hAT>sR7A9# zC-THekboDZo}S3OQr-KDllc_A6vDX7dMku+nVrHDr+R`Z=V>s7wAe#;o^B_kqo|La zh@K;^FV5hJGa&&R{47W$%Nbp%XY<54JkggY`gwxs&i<)VMEA}CkbsjqP$7)_#UO?7 z`vvTa!8}n230R9Eo?s?*C`QtBlw$=qa zaUoAk$>NWx%ou#|0Uqwd8tAe`{FW%F!sgeo?zC<6?}?$3Sq3pe1$O9 z;!2)apb*A^Ec663BUdRYd@aHnaSz-%4etLrWml&nqK~?Zcw(_c7++$$mAXVCj4!br zu~Z?9le$bH@@FJk&gGDRTh0nkFs*eBOp$D@=s>RJiR*acdLyBHdEy37Fy*|_n8N?# zdZOjLi6?I6iCcK$R-U*m6%pO|Zcjx-XXFlrFzy#CJ;Airol1)QEhk!wyC4C#oV$7A z9!P{ON4oK?f&?7MYDk0>(o5`ndE!1#Fnw{qlET=X4=9B3uJs@!U^&pj6t>IOc=Ly!nFA}zCrApvW#ktZJEiAO!b%*bOfMJQ*^9Xx#W_PCvpX5VFa_+3mlVQyn!T(L z#?$N-NWi{$l_y@~iPt^BOzIn{Qbg~aZ$bk0#aocjXT*JL>}{TS2NG}~??OT!h|6ON9Ls z?ivzd|Af0HPt@Xx+MZxqs}4*7_lvp;VcdS}DTHzR-H9i5=80W+qP{1XNo~NVXvh=0 z^2BaDu{%#R;)y*x!A$C&Aw}kIrSBDp%->3H3<>E#3?FVZQ3&IH(NrOf7wEk`!OTcA zm;&yd`zVC57W*oMaqrxZC-&!w13baZNOPDX*`3jg(t$kDf+t$?L@Q4)!*YlW|3I$co?se$1fOCgB*LUh3eGW4c{dSOB?Bnf(G>{39(047DVxlLQ2A>pCNH4KriphbHUScQ9nLLWZ zQ_&PVA(iudNWg(ig+y{7(Tmaro?vF=LO#VbNWfZL#1qqbVumM}a?a#a%z{Md3#mJ2 zLjumo9G)nGL>P#)@fCZ5nURZO3RsIvcw#OjLM^08y;LEL4-=Q=6KPMg%kznx(+rmL z3P`|O%u@(sE#@nP@%_b>Jh1>0u$&7$!K{(1_!L(|B1~%Z!OGn)7TF1DjV!hk(w4IX z5^zSA^29PvFq68RPqBh0uHlJmdEz=qNZlFUUtA9f*cUf=f|=ACVG3y=;r4qIB;Y`9 zhD1mqt<+l}p{H|+d z-3nnm+3!&Z<15oukO-|Mm2)*D^jf&qx)%~5MYO>kaUW0I?+KVI8z@^MMPhLY_t>7N_|8kj4vzQ#`mZvGMCd4k4Y(_ z?>m#_44+Iqo{GpXr%Uk!Pdv#JPw~Xlp2$2SZtr|1Rf^<{ga&^$6_Gz9F2yFEc#bEY z=ZP11;zds|lloGs6v;^q7wDHE0S|#!6vBARc~v2dx185J!IblLKE)f5fT!7;kbpDt z79^4>l6?^t``eI!Dc`lseT6VK_y-DMZ14{~!IbkOKE=m8@d;0S z3W;Q2M317+c;a(Nz}EVLC%()fGJIj+s~jT37Y4qD1e}p?Ji#>hx2aM@H@@#60Vnl) zNWfbBz!N|6#O72)w47T!!A$DbR4Jm1eH%~w1PRy|Kl8*dJn^e1m>KzvPw_iXZ0CtT zcw&bqm~#FZQb=DUf&0Z@fsnpP0=x5XejxwA6v=@k_lxj6Qlgst`xnNGQc0dD#S^7@ zq6|-zg+y{jqI+jKo+!@~6(9i{yrL(Vm0Ag=NR~6%;NiilG9=(Ysz4%{B08y6ApuiV zYp3RA!tIZPpp 
zGjg~>_!&voB6*sHCxIg%0S9uVCzx`!hAH4cjf5XM?`;)!D+5gJ_T&f{{33{L{b=Mc#!0b@BkD}=F}C-6iU zp6KccraKGx6y12DyFwTzwTD6&C-p>6Fy%Z6rU+|9+J1XxCc=Suaw;PFUaJ>R^yZ0E zJi*M!sUbz?XS34+k@?x|bUPvSMIVJQu8}i5!Ibk%C52yRQiF%?JPQ)=esMM=k}0AM zehwsHioQJ24-(0NByYdrT~2>bFq1kURf=di2ighgaxe%Ia7G400+zGT6HGaWkSW|Z zZihlbuZ1JdB?C(#rx?W(=kdg7iLk%89U~F>FK(q5 zbYpp998Zk*1k)E2Ql&_4zu`2S2nl_qx@9&A63JaFx<)4R#1v03NdU)Qdd9%*b>pMRc)88+-;N;H1vviCH``ntE4bKvtJ5{Pzz~BF0&KTfp|Gziz{FXxXk83 zBGf_}$b6o-QX!13wIH8JyG9n~6Ve)y`r;~yurxZV>?Id4dnB6&p#uRv~u1Z=IFAQ4(i+AnUF2>X6< z3s2lC5&2t=G?3dQ!ak7OB_cnCbQIme6DxV*PM)}nC+_wHv&`;+Dc~Aer4YuGeYHXu zZ+!Ri#C?zmi(Tr@`#r(T$OBRe`!ah_BEm9*lez{Huoi0}0aL7l1Wd6W5-`OEo_L5S z9`*#&S{s!V#+CYrLKs)-qdf5#Pdu&=##%g~5c#izl2?@Q!2Kj7V0S*{38pWehAEQE zEIE*H*LnsLa3Ig}#3o222NHd-dd?F}IiKfKykIA!>)MMFVLwM+l8A7Qz;eFK6R$u5 zw$`gW@fuIO4vA1sX;R;?6VessO-R6UzU2vKjl9jLcn1=poKh{`<%#zs!oJMjmk9eZ z`@l{}t@U9(k@oG{NBM;Gc1@a*k3GS()+hE9QY}8U6K){+2cr9&{WD1D%ghmi)qKJ3PTm>Yp%0SZ30({1;FB%@hCdM2YI?_b*^AO7cW0J0Wd3 zr4_>1;AK3)v{qRqh4B?gxl}~-C@OC!q;giU6VWx|u4@%Nk-40Xs035!8=oU8D}=Ex zswhPMq(*0?Do<47iRwI2!xNb&)!i>@@+oTB38}$r+lgp{yMff;zNj z%gohcXN52>vt1OzxXkMFL<2~`S~P?NJX3egAu{|0;BGlYhQ9#ZT_KD&zD7K;2T$zj z31*G##iwYj5XLpqL?Mhb(i9TO?u@>C*jpit18JrZ#)0g^6Z?9C>5Khf3RuqlApy6X z10a!15xt@`hXhPL5=3uI{p>`vwOoVuw-eE;pCbl% zB6B$%F%YJJ%WM!&42DElsX4U>x5q+Az<~_$MCNk3N5!FhigOjhc*_~45XJ*>I8Th= ziIF@piYLzV1k)FzVTxpfN4K0Y5@GxzhFk1oCBpbE3P+5CgucuiF&+}I784+$r*JEE zq9>S1odi=P2NHeeoopwf=ZG7~lpG?%9sK+pB6$Zl?iW)%!Ibj?KE;KQNX|%fnN8yd za#5-j$-W3Lb*4iCHuwxkz*@}YiCK_<1DVYeb07h0Q3MG%kYZ0TYvf`jh4FH5i9#3` z`&^#5lqW9ZiOV4o`a;@%udox+6XHBbz}A}Y38uBKgegKTa_-<^*IHmFq;f9IAu^n1 zSLG1-r&)3!(L>;BPcY?N1XI9CT?`30kR|*;mhvf<@x*eTSiuw5c!KDQam@;@O_d{g zJ@5~^?h1WfDk8td9dW&#kk;xAc0$_vZsduZc;aTBxWyAhXB6BDQ@|bLHlDbhC+^^h zl{|4LB*N|~HTYeSfHQJ8Bti=5RJ}(bjCa0O3SqqSt>%e)dE!2vxZe}ZN_`-t$o3(N zduo3$5YmS%u(j4eBH3E`hk#45mM7Nn#Ck};T5Ny>ti?k-@i0$p4)qPUw7%$K2Ln05XM@3s1UvuQg?=r$v;X(M7#52PcVJ)NvaglW%j9^koL~c zAOY9N=RENRPkadp*cV?x0@mVdp7@3*zU7JUJi#>h_k4;UAQ2{2YVaRsa#*5P5kbqmxKXyW>#kl7G4<0=_wEzGB diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py index 9f68d4ed..4e866e66 100644 --- a/src/delphi/eval/token_labelling.py +++ b/src/delphi/eval/token_labelling.py @@ -2,11 +2,15 @@ import spacy from spacy.tokens import Doc, Token +from spacy.util import is_package # make sure the english language model capabilities are installed by the equivalent of: # python -m spacy download en_core_web_sm # Should be run once, initially. Download only starts if not already installed. -spacy.cli.download("en_core_web_sm", False, False, "-q") +SPACY_MODEL = "en_core_web_trf" +NLP = None # global var to hold the language model +if not is_package(SPACY_MODEL): + spacy.cli.download("en_core_web_trf", False, False) TOKEN_LABELS: dict[str, Callable] = { @@ -148,8 +152,11 @@ def label_batch_sentences( corresponding token length where each entry provides the labels/categories for the token. 

From f234ec597b7cf6f705219493069f397cd45baafc Mon Sep 17 00:00:00 2001
From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com>
Date: Thu, 15 Feb 2024 15:58:19 +0100
Subject: [PATCH 23/29] bugfix: handle tokens empty string ''

---
 src/delphi/eval/token_labelling.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py
index 4e866e66..55cecc02 100644
--- a/src/delphi/eval/token_labelling.py
+++ b/src/delphi/eval/token_labelling.py
@@ -82,14 +82,14 @@ def explain_token_labels(token: Optional[Token] = None) -> None:
         print(" ", label.ljust(10), key)
 
 
-def label_single_token(token: Token) -> dict[str, bool]:
+def label_single_token(token: Token | None) -> dict[str, bool]:
     """
     Labels a single token. A token that has been analyzed by the spaCy library.
 
     Parameters
     ----------
-    token : Token
+    token : Token | None
         The token to be labelled.
 
     Returns
     -------
@@ -99,6 +99,13 @@ def label_single_token(token: Token) -> dict[str, bool]:
     dict[str, bool]
         Returns a dictionary with the token's labels as keys and their
         corresponding boolean values.
     """
     labels = dict()  # The dict holding labels of a single token
+    # if token is None, then it is an '' empty string token or similar
+    if token is None:
+        for label_name, category_check in TOKEN_LABELS.items():
+            labels[label_name] = False
+        labels["Is Other"] = True
+        return labels
+    # all other cases / normal tokens
     for label_name, category_check in TOKEN_LABELS.items():
         labels[label_name] = category_check(token)
     return labels
@@ -120,6 +127,12 @@ def label_sentence(tokens: Doc | list[Token]) -> list[dict[str, bool]]:
     Returns
     -------
     list[dict[str, bool]]
         Returns a list of the tokens' labels.
     """
     labelled_tokens = list()  # list holding labels for all tokens of sentence
+    # if the list is empty it is because token is '' empty string or similar
+    if len(tokens) == 0:
+        labels = label_single_token(None)
+        labelled_tokens.append(labels)
+        return labelled_tokens
+    # in all other cases
     for token in tokens:
         labels = label_single_token(token)
         labelled_tokens.append(labels)
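The net effect of the two guards added above can be sketched as follows (illustrative only, assuming the patched module is importable as delphi.eval.token_labelling; this snippet is not part of the patch):

    from delphi.eval import token_labelling

    # An empty token string produces zero spaCy tokens; with this fix the
    # sentence still yields exactly one label dict, with every label False
    # except "Is Other".
    labels = token_labelling.label_sentence([])
    assert labels[0]["Is Other"] is True
    assert all(v is False for k, v in labels[0].items() if k != "Is Other")

This matters because some entries in a tokenizer vocabulary decode to the empty string, and the labelling loop in the next patch feeds every vocabulary entry through this path.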
""" labelled_tokens = list() # list holding labels for all tokens of sentence + # if the list is empty it is because token is '' empty string or similar + if len(tokens) == 0: + labels = label_single_token(None) + labelled_tokens.append(labels) + return labelled_tokens + # in all other cases for token in tokens: labels = label_single_token(token) labelled_tokens.append(labels) From 4047be40814cf742737f87bada0f024e9ab382ec Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Thu, 15 Feb 2024 15:59:31 +0100 Subject: [PATCH 24/29] add argparse for label_all_tokens.py script --- scripts/label_all_tokens.py | 46 ++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/scripts/label_all_tokens.py b/scripts/label_all_tokens.py index eaa2ea41..e054fa07 100644 --- a/scripts/label_all_tokens.py +++ b/scripts/label_all_tokens.py @@ -1,5 +1,5 @@ +import argparse import pickle -import sys from pathlib import Path from tqdm.auto import tqdm @@ -17,30 +17,29 @@ def tokenize( # Decode a sentence def decode( - tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, token_ids: list[int] + tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, token_ids: int | list[int] ) -> str: return tokenizer.decode(token_ids, skip_special_tokens=True) def main(): - print("\n", " LABEL ALL TOKENS ".center(50, "="), "\n") + # Setup argparse + parser = argparse.ArgumentParser(description="Tokenization and labeling utility.") + parser.add_argument( + "--model_name", + type=str, + help="Name of the model to use for tokenization and labeling.", + default="delphi-suite/delphi-llama2-100k", + required=False, + ) + args = parser.parse_args() + # Access command-line arguments - args = sys.argv[1:] # Directory to save the results SAVE_DIR = Path("src/delphi/eval/") + model_name = args.model_name - # Check if arguments are provided - if len(args) == 0: - print("No arguments provided.") - return - - if len(args) > 1: - print("Too many arguments provided.") - return - - # Process arguments - model_name = args[0] - + print("\n", " LABEL ALL TOKENS ".center(50, "="), "\n") print(f"You chose the model: {model_name}\n") print( f"The language model will be loaded from Huggingface and its tokenizer used to do two things:\n\t1) Create a list of all tokens in the tokenizer's vocabulary.\n\t2) Label each token with its part of speech, dependency, and named entity recognition tags.\nThe respective results will be saved to files located at: '{SAVE_DIR}'\n" @@ -73,27 +72,22 @@ def main(): # let's label each token labelled_token_ids_dict: dict[int, dict[str, bool]] = {} # token_id: labels max_token_id = tokenizer.vocab_size # stop at which token id, vocab size - batch_size = 500 # we iterate (batchwise) over all token_ids, individually takes too much time - for start in tqdm(range(0, max_token_id, batch_size), desc="Labelling tokens"): - # create a batch of token_ids - end = min(start + batch_size, max_token_id) - token_ids = list(range(start, end)) + for token_id in tqdm(range(0, max_token_id), desc="Labelling tokens"): # decode the token_ids to get a list of tokens, a 'sentence' - tokens = decode(tokenizer, token_ids) # list of tokens == sentence + tokens = decode(tokenizer, token_id) # list of tokens == sentence # put the sentence into a list, to make it a batch of sentences - sentences = [tokens] # CHECK AGAIN + sentences = [tokens] # label the batch of sentences labels = token_labelling.label_batch_sentences( sentences, tokenized=True, verbose=False 

From 3c4a1a40090c9260a062f67ebc79c373bf864f51 Mon Sep 17 00:00:00 2001
From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com>
Date: Thu, 15 Feb 2024 16:14:17 +0100
Subject: [PATCH 25/29] add tokenized dicts

---
 scripts/label_all_tokens.py                 |   2 +-
 src/delphi/eval/all_tokens_list.txt         | Bin 0 -> 45121 bytes
 src/delphi/eval/labelled_token_ids_dict.pkl | Bin 0 -> 274517 bytes
 3 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 src/delphi/eval/all_tokens_list.txt
 create mode 100644 src/delphi/eval/labelled_token_ids_dict.pkl

diff --git a/scripts/label_all_tokens.py b/scripts/label_all_tokens.py
index e054fa07..01bf4cf1 100644
--- a/scripts/label_all_tokens.py
+++ b/scripts/label_all_tokens.py
@@ -72,7 +72,7 @@ def main():
     # let's label each token
     labelled_token_ids_dict: dict[int, dict[str, bool]] = {}  # token_id: labels
     max_token_id = tokenizer.vocab_size  # stop at which token id, vocab size
-    # we iterate (batchwise) over all token_ids, individually takes too much time
+    # we iterate over all token_ids individually
     for token_id in tqdm(range(0, max_token_id), desc="Labelling tokens"):
         # decode the token_ids to get a list of tokens, a 'sentence'
         tokens = decode(tokenizer, token_id)  # list of tokens == sentence
diff --git a/src/delphi/eval/all_tokens_list.txt b/src/delphi/eval/all_tokens_list.txt
new file mode 100644
index 0000000000000000000000000000000000000000..438dddae5a703d7c5bd4efc3b840708916eb9dc7
GIT binary patch
literal 45121
[... base85 binary patch data omitted ...]

literal 0
HcmV?d00001

diff --git a/src/delphi/eval/labelled_token_ids_dict.pkl b/src/delphi/eval/labelled_token_ids_dict.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..5fe96a3904c3ef3c78e4e34cfa453e812f68eb99
GIT binary patch
literal 274517
[... base85 binary patch data omitted ...]

literal 0
HcmV?d00001

From 87e18b375e0a54201d7c57ea4eeaec538843516d Mon Sep 17 00:00:00 2001
From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com>
Date: Thu, 15 Feb 2024 16:25:26 +0100
Subject: [PATCH 26/29] update notebook

---
 notebooks/token_labelling.ipynb | 777 +++-----------------------------
 1 file changed, 72 insertions(+), 705 deletions(-)

diff --git a/notebooks/token_labelling.ipynb b/notebooks/token_labelling.ipynb
index a447a8d7..45423d8c 100644
--- a/notebooks/token_labelling.ipynb
+++ b/notebooks/token_labelling.ipynb
@@ -4,57 +4,17 @@
     "cell_type": "markdown",
     "metadata": {},
     "source": [
zocPL#Z`OpJ4>!JB6Lvn__`!*vocP5Fx5)hc7n}&ri4dF!DTKQGh7v+uenWF23@5^J zA{-~eb0PvKB61=UCn9qq3MZlpq0W(LLa1{jIwxXqA|@wdaUwP+;&37^C*pA;J|_}z zA|WRdaUwA%l5ip^Cz5d@xe)5Al|l%0)k?{URGdi7i8P!@%ZYTHNY9B3oXE(DOq|Hf zi7cGR%86{8$j*rzoXE+ET%5?wi9DRh%ZYqKs273!LZ}yk0z#-;QBVkVD++OC4@S(N^_zNC(3f794E?iq5>x>a>AVxl{itE6ID1-l@rxC zQJoVtI8l=mwK!3m6LmOImlO4jgnZiX{BuL!&k@b(i0*X6a5`c-9kHCg?)8m?@dLmH zoM_02Mx1CYgnB`2B7}NDY|4pdoMRG!pVuDoEXN5;hY%3iIJQb#fj0J z7{iINoEXQ6@tl~ziHV$;#EHqAn8JyvoR}try8KQTLS24maAGDWW^rORC+2WsE+^)3 zVm>DpaAF}R7I9)RCzfzxDJPb3VmT*PaAGAVRtcf5ovVdV*UmMZSj&lZoLJ9^4V>7> ziA_SN<6^TA>bTg#iLIR2#)<8m*ujaNoY=*Q-JICNiM^cICxkjj_6woTkprAK$caOo zILwJ7La1lts1WKIImU_OoH)UWlbkrkiPM}o!-=z;ILC?eoVdV=i=4Q`iOWK$ll_Vi z>SVvliEEs=&WRhGxXFoIoVd-2JDj*HggQF!389Y8`Ku8^i8n&1JMvZtbw}PA3AqaB>Sev}g;00ogAnSDeB{I@PJHIX z7fyUN62?jWW=+_+!tvdjuycju2Pb}V;uj~}qVV@$a3VM-LU1A^Cqi)|G$+DvA}lAu zaUwh?B5)!iCn9koGAE*NA}S}MaUwb=VsIiRCt`6THYeh6A}%N5aUwn^5^y3RClYZY zu@LIrZxSKYyWgaoNXCieoJhfml$=P#iPW4(!-=$qQnJ z)b%1OC$e!OJ125*A}1$uaUwS-@)!wu><{=_=ifhg$?wnCI`bL{i4vSB$%#@zsAr@!C&~z+p474d#DtVt-TwS?Izp(e za=ba^IZ=TV6*=L~iAtQP%!w+Ts49dyS*i)4j?U_wsKJSvoTw#)dN^wfp&rgUoT$r* zdYq`wi3Xf#$caXrXv~QwLa3v&sSxVuY{rS^oM<70dMrGIP>)4RPPF2LgA<;dXw8W> zoM_95cAW6yM0-wj;6z7Gcypo?Cww^J%ZbjM=wc+~Aj8$Al&(g?c#zSJ6Wuw{gA+YD z(aT5}XQa20FwRIHPW0tOKTh=L!~jkVMF)&Yr@VqFt%`Ft2JTgOB~y*2|FFx&WRn?gq@D; z2Web$7XxY*B$1J;CpT*!>LdKdn||4IbDcXh}<;VLc; z8wq1v9O1-KP8{RJaZa2t62`bVX(Ws@a!LsG{^GO{uJ#wkj+`+P#*Um7!c|B9ZiSyC z>fbpM@FOSA83|)6&Kn77g{w5XU`^PmS{H>-&&VYqTn(pjEG`S7Zp9TL)UCKGM3Akp zGo06iP`BbbCvI@!rVv4ng`F+REh8ZT?rJr=ErfbF?+D?lBmZi}-^m_u5&5nVu3GUo z5oEo%XC#a{a-S0qIPp*jSHo%S$Ri4+Nm zXE43w#491xaq*fHZ#eN*2#ewLb3}4FA_sop_TOLl`MSTeLpc8;0?){MA=Jb9ffFBv qP>;nYJB0OEe6~Ya&&U@}eC5P9PJHLY4^I5##4l^Y#&*p&^Zx)TH9C<1 literal 0 HcmV?d00001 From 87e18b375e0a54201d7c57ea4eeaec538843516d Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Thu, 15 Feb 2024 16:25:26 +0100 Subject: [PATCH 26/29] update notebook --- notebooks/token_labelling.ipynb | 777 +++----------------------------- 1 file changed, 72 insertions(+), 705 deletions(-) diff --git a/notebooks/token_labelling.ipynb b/notebooks/token_labelling.ipynb index a447a8d7..45423d8c 100644 --- a/notebooks/token_labelling.ipynb +++ b/notebooks/token_labelling.ipynb @@ -4,57 +4,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> a71e2a8 (improve notebook explanation) "# Giving tokens a label - How to categorize tokens\n", "\n", "\n", "The first part of this Notebook contains elements that explain how to label tokens and how the functions work.\n", "\n", -<<<<<<< HEAD - "The second part shows how all tokens are labelled that are used for our delphi language models.3\n", - "\n", - "# 1) How to use the token labelling functions" -<<<<<<< HEAD -======= - "# How to label tokens" ->>>>>>> bf8ef79 (add notebook) -======= ->>>>>>> a71e2a8 (improve notebook explanation) -======= "The second part shows how all tokens are labelled that are used for our delphi language models.3\n" ->>>>>>> e0ed3b4 (add the files containing token information/labels) ] }, { "cell_type": "code", -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD - "execution_count": 90, -======= "execution_count": 23, ->>>>>>> e0ed3b4 (add the files containing token information/labels) - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], -======= - "execution_count": 2, - "metadata": {}, - "outputs": [], ->>>>>>> bf8ef79 (add notebook) -======= - "execution_count": 90, "metadata": {}, "outputs": [ { @@ -66,14 +26,11 @@ ] } ], ->>>>>>> a71e2a8 (improve notebook explanation) "source": [ "# autoreload\n", "%load_ext autoreload\n", "%autoreload 2\n", "\n", -<<<<<<< HEAD -<<<<<<< HEAD "from pprint import pprint \n", "\n", "import spacy\n", @@ -81,24 +38,6 @@ "\n", "import delphi\n", "\n", -<<<<<<< HEAD - "# from delphi.eval import token_labelling" -======= - "import spacy\n", - "\n", - "import token_labelling" ->>>>>>> bf8ef79 (add notebook) -======= - "from pprint import pprint \n", - "\n", - "import spacy\n", - "from tqdm.auto import tqdm\n", - "\n", - "import delphi\n", - "\n", - "# from delphi.eval import token_labelling" ->>>>>>> a71e2a8 (improve notebook explanation) -======= "from delphi.eval import token_labelling" ] }, @@ -108,7 +47,6 @@ "source": [ "\n", "# 1) How to use the token labelling functions" ->>>>>>> e0ed3b4 (add the files containing token information/labels) ] }, { @@ -121,19 +59,7 @@ }, { "cell_type": "code", -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD - "execution_count": 2, -======= - "execution_count": 23, ->>>>>>> bf8ef79 (add notebook) -======= - "execution_count": 2, ->>>>>>> a71e2a8 (improve notebook explanation) -======= "execution_count": 15, ->>>>>>> e0ed3b4 (add the files containing token information/labels) "metadata": {}, "outputs": [ { @@ -168,34 +94,13 @@ }, { "cell_type": "code", -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD - "execution_count": 8, -======= - "execution_count": 46, ->>>>>>> bf8ef79 (add notebook) -======= - "execution_count": 8, ->>>>>>> a71e2a8 (improve notebook explanation) -======= "execution_count": 5, ->>>>>>> e0ed3b4 (add the files containing token information/labels) "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> a71e2a8 (improve notebook explanation) - "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", - "You can now load the package via spacy.load('en_core_web_sm')\n", -======= ->>>>>>> e0ed3b4 (add the files containing token information/labels) "{'Capitalized': True,\n", " 'Is Adjective': False,\n", " 'Is Adposition': False,\n", @@ -216,32 +121,14 @@ " 'Is Symbol': False,\n", " 'Is Verb': False,\n", " 'Starts with space': False}\n" -<<<<<<< HEAD -======= - "[False, True, False, True, False, False, False, False, False, False, True, False, False, False, False]\n" ->>>>>>> bf8ef79 (add notebook) -======= ->>>>>>> a71e2a8 (improve notebook explanation) ] } ], "source": [ -<<<<<<< HEAD -<<<<<<< HEAD "from delphi.eval import token_labelling\n", "\n", "label = token_labelling.label_single_token(token)\n", "pprint(label)" -======= - "label = token_labelling.label_single_token(token)\n", - "print(label)" ->>>>>>> bf8ef79 (add notebook) -======= - "from delphi.eval import token_labelling\n", - "\n", - "label = token_labelling.label_single_token(token)\n", - "pprint(label)" ->>>>>>> a71e2a8 (improve notebook explanation) ] }, { @@ -254,19 +141,7 @@ }, { "cell_type": "code", -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD - "execution_count": 9, -======= - "execution_count": 42, ->>>>>>> bf8ef79 (add notebook) -======= - "execution_count": 9, ->>>>>>> a71e2a8 (improve notebook explanation) -======= "execution_count": 6, ->>>>>>> e0ed3b4 (add the files containing token information/labels) "metadata": {}, 
"outputs": [ { @@ -280,29 +155,6 @@ "---------------- Token labels ---------------\n", " 0 Starts with space False\n", " 1 Capitalized True\n", -<<<<<<< HEAD - " 2 Is Noun False\n", - " 3 Is Pronoun True\n", - " 4 Is Adjective False\n", - " 5 Is Verb False\n", - " 6 Is Adverb False\n", - " 7 Is Preposition False\n", - " 8 Is Conjunction False\n", - " 9 Is Interjunction False\n", -<<<<<<< HEAD -<<<<<<< HEAD - " 10 Is Named Entity False\n" -======= - " 10 Is Subject True\n", - " 11 Is Object False\n", - " 12 Is Root False\n", - " 13 Is auxiliary False\n", - " 14 Is Named Entity False\n" ->>>>>>> bf8ef79 (add notebook) -======= - " 10 Is Named Entity False\n" ->>>>>>> a71e2a8 (improve notebook explanation) -======= " 2 Is Adjective False\n", " 3 Is Adposition False\n", " 4 Is Adverb False\n", @@ -321,7 +173,6 @@ " 17 Is Verb False\n", " 18 Is Other False\n", " 19 Is Named Entity True\n" ->>>>>>> e0ed3b4 (add the files containing token information/labels) ] } ], @@ -333,389 +184,33 @@ "cell_type": "markdown", "metadata": {}, "source": [ -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> a71e2a8 (improve notebook explanation) "If you are interested in all the possible labels a token can have, that spaCy is capable of assigning, then call the same function but without any argument:\n", "```Python\n", ">>> token_labelling.explain_token_labels()\n", "```" -<<<<<<< HEAD -======= - "If you are interested in all the possible labels a token can have, that spaCy is capable of assigning, then call the same function but without any argument." - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Explanation of all 302 token labels (POS, dependency, NER, ...):\n", - " ADJ adjective\n", - " ADP adposition\n", - " ADV adverb\n", - " AUX auxiliary\n", - " CONJ conjunction\n", - " CCONJ coordinating conjunction\n", - " DET determiner\n", - " INTJ interjection\n", - " NOUN noun\n", - " NUM numeral\n", - " PART particle\n", - " PRON pronoun\n", - " PROPN proper noun\n", - " PUNCT punctuation\n", - " SCONJ subordinating conjunction\n", - " SYM symbol\n", - " VERB verb\n", - " X other\n", - " EOL end of line\n", - " SPACE space\n", - " . 
punctuation mark, sentence closer\n", - " , punctuation mark, comma\n", - " -LRB- left round bracket\n", - " -RRB- right round bracket\n", - " `` opening quotation mark\n", - " \"\" closing quotation mark\n", - " '' closing quotation mark\n", - " : punctuation mark, colon or ellipsis\n", - " $ symbol, currency\n", - " # symbol, number sign\n", - " AFX affix\n", - " CC conjunction, coordinating\n", - " CD cardinal number\n", - " DT determiner\n", - " EX existential there\n", - " FW foreign word\n", - " HYPH punctuation mark, hyphen\n", - " IN conjunction, subordinating or preposition\n", - " JJ adjective (English), other noun-modifier (Chinese)\n", - " JJR adjective, comparative\n", - " JJS adjective, superlative\n", - " LS list item marker\n", - " MD verb, modal auxiliary\n", - " NIL missing tag\n", - " NN noun, singular or mass\n", - " NNP noun, proper singular\n", - " NNPS noun, proper plural\n", - " NNS noun, plural\n", - " PDT predeterminer\n", - " POS possessive ending\n", - " PRP pronoun, personal\n", - " PRP$ pronoun, possessive\n", - " RB adverb\n", - " RBR adverb, comparative\n", - " RBS adverb, superlative\n", - " RP adverb, particle\n", - " TO infinitival \"to\"\n", - " UH interjection\n", - " VB verb, base form\n", - " VBD verb, past tense\n", - " VBG verb, gerund or present participle\n", - " VBN verb, past participle\n", - " VBP verb, non-3rd person singular present\n", - " VBZ verb, 3rd person singular present\n", - " WDT wh-determiner\n", - " WP wh-pronoun, personal\n", - " WP$ wh-pronoun, possessive\n", - " WRB wh-adverb\n", - " SP space (English), sentence-final particle (Chinese)\n", - " ADD email\n", - " NFP superfluous punctuation\n", - " GW additional word in multi-word expression\n", - " XX unknown\n", - " BES auxiliary \"be\"\n", - " HVS forms of \"have\"\n", - " _SP whitespace\n", - " $( other sentence-internal punctuation mark\n", - " $, comma\n", - " $. 
sentence-final punctuation mark\n", - " ADJA adjective, attributive\n", - " ADJD adjective, adverbial or predicative\n", - " APPO postposition\n", - " APPR preposition; circumposition left\n", - " APPRART preposition with article\n", - " APZR circumposition right\n", - " ART definite or indefinite article\n", - " CARD cardinal number\n", - " FM foreign language material\n", - " ITJ interjection\n", - " KOKOM comparative conjunction\n", - " KON coordinate conjunction\n", - " KOUI subordinate conjunction with \"zu\" and infinitive\n", - " KOUS subordinate conjunction with sentence\n", - " NE proper noun\n", - " NNE proper noun\n", - " PAV pronominal adverb\n", - " PROAV pronominal adverb\n", - " PDAT attributive demonstrative pronoun\n", - " PDS substituting demonstrative pronoun\n", - " PIAT attributive indefinite pronoun without determiner\n", - " PIDAT attributive indefinite pronoun with determiner\n", - " PIS substituting indefinite pronoun\n", - " PPER non-reflexive personal pronoun\n", - " PPOSAT attributive possessive pronoun\n", - " PPOSS substituting possessive pronoun\n", - " PRELAT attributive relative pronoun\n", - " PRELS substituting relative pronoun\n", - " PRF reflexive personal pronoun\n", - " PTKA particle with adjective or adverb\n", - " PTKANT answer particle\n", - " PTKNEG negative particle\n", - " PTKVZ separable verbal particle\n", - " PTKZU \"zu\" before infinitive\n", - " PWAT attributive interrogative pronoun\n", - " PWAV adverbial interrogative or relative pronoun\n", - " PWS substituting interrogative pronoun\n", - " TRUNC word remnant\n", - " VAFIN finite verb, auxiliary\n", - " VAIMP imperative, auxiliary\n", - " VAINF infinitive, auxiliary\n", - " VAPP perfect participle, auxiliary\n", - " VMFIN finite verb, modal\n", - " VMINF infinitive, modal\n", - " VMPP perfect participle, modal\n", - " VVFIN finite verb, full\n", - " VVIMP imperative, full\n", - " VVINF infinitive, full\n", - " VVIZU infinitive with \"zu\", full\n", - " VVPP perfect participle, full\n", - " XY non-word containing non-letter\n", - " AD adverb\n", - " AS aspect marker\n", - " BA 把 in ba-construction\n", - " CS subordinating conjunction\n", - " DEC 的 in a relative clause\n", - " DEG associative 的\n", - " DER 得 in V-de const. 
and V-de-R\n", - " DEV 地 before VP\n", - " ETC for words 等, 等等\n", - " IJ interjection\n", - " LB 被 in long bei-const\n", - " LC localizer\n", - " M measure word\n", - " MSP other particle\n", - " NR proper noun\n", - " NT temporal noun\n", - " OD ordinal number\n", - " ON onomatopoeia\n", - " P preposition excluding 把 and 被\n", - " PN pronoun\n", - " PU punctuation\n", - " SB 被 in short bei-const\n", - " VA predicative adjective\n", - " VC 是 (copula)\n", - " VE 有 as the main verb\n", - " VV other verb\n", - " NP noun phrase\n", - " PP prepositional phrase\n", - " VP verb phrase\n", - " ADVP adverb phrase\n", - " ADJP adjective phrase\n", - " SBAR subordinating conjunction\n", - " PRT particle\n", - " PNP prepositional noun phrase\n", - " acl clausal modifier of noun (adjectival clause)\n", - " acomp adjectival complement\n", - " advcl adverbial clause modifier\n", - " advmod adverbial modifier\n", - " agent agent\n", - " amod adjectival modifier\n", - " appos appositional modifier\n", - " attr attribute\n", - " aux auxiliary\n", - " auxpass auxiliary (passive)\n", - " case case marking\n", - " cc coordinating conjunction\n", - " ccomp clausal complement\n", - " clf classifier\n", - " complm complementizer\n", - " compound compound\n", - " conj conjunct\n", - " cop copula\n", - " csubj clausal subject\n", - " csubjpass clausal subject (passive)\n", - " dative dative\n", - " dep unclassified dependent\n", - " det determiner\n", - " discourse discourse element\n", - " dislocated dislocated elements\n", - " dobj direct object\n", - " expl expletive\n", - " fixed fixed multiword expression\n", - " flat flat multiword expression\n", - " goeswith goes with\n", - " hmod modifier in hyphenation\n", - " hyph hyphen\n", - " infmod infinitival modifier\n", - " intj interjection\n", - " iobj indirect object\n", - " list list\n", - " mark marker\n", - " meta meta modifier\n", - " neg negation modifier\n", - " nmod modifier of nominal\n", - " nn noun compound modifier\n", - " npadvmod noun phrase as adverbial modifier\n", - " nsubj nominal subject\n", - " nsubjpass nominal subject (passive)\n", - " nounmod modifier of nominal\n", - " npmod noun phrase as adverbial modifier\n", - " num number modifier\n", - " number number compound modifier\n", - " nummod numeric modifier\n", - " oprd object predicate\n", - " obj object\n", - " obl oblique nominal\n", - " orphan orphan\n", - " parataxis parataxis\n", - " partmod participal modifier\n", - " pcomp complement of preposition\n", - " pobj object of preposition\n", - " poss possession modifier\n", - " possessive possessive modifier\n", - " preconj pre-correlative conjunction\n", - " prep prepositional modifier\n", - " prt particle\n", - " punct punctuation\n", - " quantmod modifier of quantifier\n", - " rcmod relative clause modifier\n", - " relcl relative clause modifier\n", - " reparandum overridden disfluency\n", - " root root\n", - " ROOT root\n", - " vocative vocative\n", - " xcomp open clausal complement\n", - " ac adpositional case marker\n", - " adc adjective component\n", - " ag genitive attribute\n", - " ams measure argument of adjective\n", - " app apposition\n", - " avc adverbial phrase component\n", - " cd coordinating conjunction\n", - " cj conjunct\n", - " cm comparative conjunction\n", - " cp complementizer\n", - " cvc collocational verb construction\n", - " da dative\n", - " dh discourse-level head\n", - " dm discourse marker\n", - " ep expletive es\n", - " hd head\n", - " ju junctor\n", - " mnr postnominal modifier\n", - " mo modifier\n", - " 
ng negation\n", - " nk noun kernel element\n", - " nmc numerical component\n", - " oa accusative object\n", - " oc clausal object\n", - " og genitive object\n", - " op prepositional object\n", - " par parenthetical element\n", - " pd predicate\n", - " pg phrasal genitive\n", - " ph placeholder\n", - " pm morphological particle\n", - " pnc proper noun component\n", - " rc relative clause\n", - " re repeated element\n", - " rs reported speech\n", - " sb subject\n", - " sbp passivized subject (PP)\n", - " sp subject or predicate\n", - " svp separable verb prefix\n", - " uc unit component\n", - " vo vocative\n", - " PERSON People, including fictional\n", - " NORP Nationalities or religious or political groups\n", - " FACILITY Buildings, airports, highways, bridges, etc.\n", - " FAC Buildings, airports, highways, bridges, etc.\n", - " ORG Companies, agencies, institutions, etc.\n", - " GPE Countries, cities, states\n", - " LOC Non-GPE locations, mountain ranges, bodies of water\n", - " PRODUCT Objects, vehicles, foods, etc. (not services)\n", - " EVENT Named hurricanes, battles, wars, sports events, etc.\n", - " WORK_OF_ART Titles of books, songs, etc.\n", - " LAW Named documents made into laws.\n", - " LANGUAGE Any named language\n", - " DATE Absolute or relative dates or periods\n", - " TIME Times smaller than a day\n", - " PERCENT Percentage, including \"%\"\n", - " MONEY Monetary values, including unit\n", - " QUANTITY Measurements, as of weight or distance\n", - " ORDINAL \"first\", \"second\", etc.\n", - " CARDINAL Numerals that do not fall under another type\n", - " PER Named person or family.\n", - " MISC Miscellaneous entities, e.g. events, nationalities, products or works of art\n", - " EVT Festivals, cultural events, sports events, weather phenomena, wars, etc.\n", - " PROD Product, i.e. artificially produced entities including speeches, radio shows, programming languages, contracts, laws and ideas\n", - " DRV Words (and phrases?) that are dervied from a name, but not a name in themselves, e.g. 'Oslo-mannen' ('the man from Oslo')\n", - " GPE_LOC Geo-political entity, with a locative sense, e.g. 'John lives in Spain'\n", - " GPE_ORG Geo-political entity, with an organisation sense, e.g. 'Spain declined to meet with Belgium'\n" - ] - } - ], - "source": [ - "token_labelling.explain_token_labels()" ->>>>>>> bf8ef79 (add notebook) -======= ->>>>>>> a71e2a8 (improve notebook explanation) ] }, { "cell_type": "markdown", "metadata": {}, "source": [ -<<<<<<< HEAD -<<<<<<< HEAD - "### Batched token labelling\n", -======= ->>>>>>> bf8ef79 (add notebook) -======= "### Batched token labelling\n", ->>>>>>> a71e2a8 (improve notebook explanation) "Next, let us analyze a batch of sentences and have them labelled.\n", "> In the example below the input sentences are not yet tokenized, so spaCy uses its internal tokenizer." 
] }, { "cell_type": "code", -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD - "execution_count": 18, -======= - "execution_count": 55, ->>>>>>> bf8ef79 (add notebook) -======= - "execution_count": 18, ->>>>>>> a71e2a8 (improve notebook explanation) -======= "execution_count": 9, ->>>>>>> e0ed3b4 (add the files containing token information/labels) "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ -<<<<<<< HEAD - "Token: This\n", -<<<<<<< HEAD -<<<<<<< HEAD - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", - "False | True | False | True | False | False | False | False | False | False | False \n", -======= "Token: Peter\n", "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", "False | True | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | True \n", ->>>>>>> e0ed3b4 (add the files containing token information/labels) "---\n", "Token: is\n", "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", @@ -730,59 +225,13 @@ "False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False \n", "---\n", "Token: .\n", -<<<<<<< HEAD - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", - "False | False | False | False | False | False | False | False | False | False | False \n", -======= - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | True | False | True | False | False | False | False | False | False | True | False | False | False | False \n", -======= - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", - "False | True | False | True | False | False | False | False | False | False | False \n", ->>>>>>> a71e2a8 (improve notebook explanation) - "---\n", - "Token: is\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", - "False | False | False | False | False | True | False | False | False | False | False \n", - "---\n", - "Token: a\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", - "False | False | False | False | False | False | False | False | False | False | False \n", - "---\n", - "Token: sentence\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is 
Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", - "False | False | True | False | False | False | False | False | False | False | False \n", - "---\n", - "Token: .\n", -<<<<<<< HEAD - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", ->>>>>>> bf8ef79 (add notebook) -======= - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Named Entity\n", - "False | False | False | False | False | False | False | False | False | False | False \n", ->>>>>>> a71e2a8 (improve notebook explanation) -======= "Starts with space | Capitalized | Is Adjective | Is Adposition | Is Adverb | Is Auxiliary | Is Coordinating conjuction | Is Determiner | Is Interjunction | Is Noun | Is Numeral | Is Particle | Is Pronoun | Is Proper Noun | Is Punctuation | Is Subordinating conjuction | Is Symbol | Is Verb | Is Other | Is Named Entity\n", "False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False \n", ->>>>>>> e0ed3b4 (add the files containing token information/labels) "---\n", "\n", "\n", "5\n", -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD - "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': False, 'Is Pronoun': True, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': True, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" -======= - "[[False, True, False, True, False, False, False, False, False, False, True, False, False, False, False], [False, False, False, False, False, True, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]]\n" ->>>>>>> bf8ef79 (add notebook) -======= - "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': False, 'Is Pronoun': True, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 
'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': True, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" ->>>>>>> a71e2a8 (improve notebook explanation) -======= "[{'Starts with space': False, 'Capitalized': True, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': False, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': True, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': True}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': True, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': False, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': True, 'Is Interjunction': False, 'Is Noun': False, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': True, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': False, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is Other': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Adjective': False, 'Is Adposition': False, 'Is Adverb': False, 'Is Auxiliary': False, 'Is Coordinating conjuction': False, 'Is Determiner': False, 'Is Interjunction': False, 'Is Noun': False, 'Is Numeral': False, 'Is Particle': False, 'Is Pronoun': False, 'Is Proper Noun': False, 'Is Punctuation': True, 'Is Subordinating conjuction': False, 'Is Symbol': False, 'Is Verb': False, 'Is 
Other': False, 'Is Named Entity': False}]\n" ->>>>>>> e0ed3b4 (add the files containing token information/labels) ] } ], @@ -790,15 +239,7 @@ "sentences = [\n", " \"Peter is a person.\"\n", "]\n", -<<<<<<< HEAD -<<<<<<< HEAD - "labels = token_labelling.label_batch_sentences(sentences, tokenized=False, verbose=True)\n", -======= - "labels = token_labelling.label_batch_token(sentences, tokenized=False, verbose=True)\n", ->>>>>>> bf8ef79 (add notebook) -======= "labels = token_labelling.label_batch_sentences(sentences, tokenized=False, verbose=True)\n", ->>>>>>> a71e2a8 (improve notebook explanation) "\n", "print(len(labels[0]))\n", "print(labels[0])" @@ -813,55 +254,15 @@ }, { "cell_type": "code", -<<<<<<< HEAD -<<<<<<< HEAD "execution_count": 19, -======= - "execution_count": null, ->>>>>>> bf8ef79 (add notebook) -======= - "execution_count": 19, ->>>>>>> a71e2a8 (improve notebook explanation) "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ -<<<<<<< HEAD -<<<<<<< HEAD - "5\n", - "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': True, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': True, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" -======= - "Token: This \n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | True | True | False | False | False | False | False | False | False | False | False | True | False | False \n", - "---\n", - "Token: is \n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | False | False | False | False | True | False | False | False | False | False | False | False | False \n", - "---\n", - "Token: a \n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | False | False | True | False | False | False | False | False | False | False | False | False | False \n", - "---\n", - "Token: sentence\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | 
Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | True | False | False | False | False | False | False | False | False | True | False | False | False \n", - "---\n", - "Token: .\n", - "Starts with space | Capitalized | Is Noun | Is Pronoun | Is Adjective | Is Verb | Is Adverb | Is Preposition | Is Conjunction | Is Interjunction | Is Subject | Is Object | Is Root | Is auxiliary | Is Named Entity\n", - "False | False | False | False | False | False | False | False | False | False | False | False | False | False | False \n", - "---\n", - "\n", - "\n", - "5\n", - "[[False, True, True, False, False, False, False, False, False, False, False, False, True, False, False], [False, False, False, False, False, False, True, False, False, False, False, False, False, False, False], [False, False, False, False, True, False, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False, False, True, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]]\n" ->>>>>>> bf8ef79 (add notebook) -======= "5\n", "[{'Starts with space': False, 'Capitalized': True, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': True, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': True, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': True, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}, {'Starts with space': False, 'Capitalized': False, 'Is Noun': False, 'Is Pronoun': False, 'Is Adjective': False, 'Is Verb': False, 'Is Adverb': False, 'Is Preposition': False, 'Is Conjunction': False, 'Is Interjunction': False, 'Is Named Entity': False}]\n" ->>>>>>> a71e2a8 (improve notebook explanation) ] } ], @@ -869,34 +270,16 @@ "sentences = [\n", " [\"This \", \"is \", \"a \", \"sentence\", \".\"]\n", "]\n", -<<<<<<< HEAD -<<<<<<< HEAD - "labelled_sentences = token_labelling.label_batch_sentences(sentences, tokenized=True, verbose=False)\n", - "\n", - "print(len(labelled_sentences[0]))\n", - "print(labelled_sentences[0])" -======= - "labels = token_labelling.label_batch_token(sentences, tokenized=True, verbose=False)\n", - "\n", - "print(len(labels[0]))\n", - "print(labels[0])" ->>>>>>> bf8ef79 (add notebook) -======= "labelled_sentences = token_labelling.label_batch_sentences(sentences, tokenized=True, verbose=False)\n", "\n", "print(len(labelled_sentences[0]))\n", "print(labelled_sentences[0])" ->>>>>>> a71e2a8 (improve notebook explanation) ] }, { "cell_type": "markdown", "metadata": {}, "source": [] -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> a71e2a8 (improve notebook explanation) }, { "cell_type": "markdown", @@ -904,14 +287,28 @@ "source": [ "# 2) Labelling all 
tokens in the dataset\n",
    "\n",
-    "Now we want to label all the tokens that our tokenizer knows - its entire vocabulary."
+    "Now we want to label all the tokens that our tokenizer knows - its entire vocabulary.\n",
+    "\n",
+    "Using the script in `scripts/label_all_tokens.py` we generate the files:\n",
+    "- `src/delphi/eval/all_tokens_list.txt`\n",
+    "- `src/delphi/eval/labelled_token_ids_dict.pkl`\n",
+    "\n",
+    "Let's load the tokenizer so that we can look at the labelled tokens.\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\joshu\\anaconda3\\envs\\delphi2\\lib\\site-packages\\transformers\\utils\\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
+      "  _torch_pytree._register_pytree_node(\n"
+     ]
+    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
@@ -922,11 +319,8 @@
    ],
    "source": [
     "# Get all the tokens of the tokenizer\n",
-    "from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast\n",
+    "from transformers import AutoTokenizer, PreTrainedTokenizer\n",
     "\n",
-    "def tokenize(tokenizer: PreTrainedTokenizer, sample_txt: str) -> list[int]:\n",
-    "    # supposedly this can be different than prepending the bos token id\n",
-    "    return tokenizer.encode(tokenizer.bos_token + sample_txt, return_tensors=\"pt\")[0]\n",
    "\n",
    "# Decode a sentence\n",
    "def decode(tokenizer: PreTrainedTokenizer, token_ids: list[int]) -> str:\n",
@@ -939,101 +333,82 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 48,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      " 20: \u0011 21: \u0012 22: \u0013 23: \u0014 24: \u0015 25: \u0016 26: \u0017 27: \u0018 28: \u0019 29: \u001a \n",
-      " 800: te 801: happened 802: flow 803: food 804: list 805: just 806: Her 807: animals 808: hig 809: didn \n",
-      "1200: ice 1201: ount 1202: worked 1203: okay 1204: irt 1205: making 1206: dress 1207: enjoy 1208: advent 1209: bright \n",
-      "2300: lift 2301: ign 2302: ba 2303: line 2304: Doggy 2305: clouds 2306: dogs 2307: yard 2308: wolf 2309: spray \n",
-      "4086: 1 4087: 0 4088: 2 4089: 5 4090: 4 4091: 9 4092: 8 4093: 6 4094: 7 4095: $ \n"
-     ]
-    }
-   ],
   "source": [
-    "# Let's have a look at some tokens\n",
-    "ranges = [(20,30), (800,810), (1200,1210), (2300, 2310), (vocab_size-10, vocab_size)]\n",
-    "for start, end in ranges:\n",
-    "    for i in range(start, end):\n",
-    "        print(f\"{i:4}:\",decode(tokenizer, i).ljust(10), end=\"  \")\n",
-    "    print()"
+    "Load the pickle.\n",
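+    "\n",
+    "As a rough sketch, loading it might look like the following — the variable name and the assumed token-id → label-dict layout are illustrative, not taken from the script:\n",
+    "\n",
+    "```Python\n",
+    "import pickle\n",
+    "\n",
+    "with open(\"src/delphi/eval/labelled_token_ids_dict.pkl\", \"rb\") as f:\n",
+    "    # assumed layout: token id -> dict mapping label name -> bool\n",
+    "    labelled_token_ids_dict = pickle.load(f)\n",
+    "\n",
+    "print(len(labelled_token_ids_dict))\n",
+    "```"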
] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 5, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ea9ff9bbe9364a3ea1ab9acc11abe338", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Labelling tokens: 0%| | 0/9 [00:00>>>>>> bf8ef79 (add notebook) -======= ->>>>>>> a71e2a8 (improve notebook explanation) } ], "metadata": { @@ -1052,15 +427,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", -<<<<<<< HEAD -<<<<<<< HEAD - "version": "3.10.13" -======= - "version": "3.8.8" ->>>>>>> bf8ef79 (add notebook) -======= "version": "3.10.13" ->>>>>>> a71e2a8 (improve notebook explanation) } }, "nbformat": 4, From ef0f2e4f172abfc7c3a0f63538db3f76e8ac0a97 Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Thu, 15 Feb 2024 16:26:52 +0100 Subject: [PATCH 27/29] undo __init__ --- src/delphi/__init__.py | 2 -- src/delphi/eval/__init__.py | 1 - 2 files changed, 3 deletions(-) diff --git a/src/delphi/__init__.py b/src/delphi/__init__.py index 36c553b4..b9b115cf 100644 --- a/src/delphi/__init__.py +++ b/src/delphi/__init__.py @@ -1,5 +1,3 @@ from beartype.claw import beartype_this_package # <-- hype comes -from . import eval - beartype_this_package() # <-- hype goes diff --git a/src/delphi/eval/__init__.py b/src/delphi/eval/__init__.py index 30afc7c2..e69de29b 100644 --- a/src/delphi/eval/__init__.py +++ b/src/delphi/eval/__init__.py @@ -1 +0,0 @@ -from . import token_labelling From 5af8a6f6d22c1fdfb0962ea0aa71efbc21195276 Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Fri, 16 Feb 2024 11:52:44 +0100 Subject: [PATCH 28/29] change spacy model from "trf" to "sm" --- src/delphi/eval/token_labelling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py index 55cecc02..d2d2b45e 100644 --- a/src/delphi/eval/token_labelling.py +++ b/src/delphi/eval/token_labelling.py @@ -7,10 +7,10 @@ # make sure the english language model capabilities are installed by the equivalent of: # python -m spacy download en_core_web_sm # Should be run once, initially. Download only starts if not already installed. -SPACY_MODEL = "en_core_web_trf" +SPACY_MODEL = "en_core_web_sm" # small: "en_core_web_sm", large: "en_core_web_trf" NLP = None # global var to hold the language model if not is_package(SPACY_MODEL): - spacy.cli.download("en_core_web_trf", False, False) + spacy.cli.download(SPACY_MODEL, False, False) TOKEN_LABELS: dict[str, Callable] = { From c292da7066136e598ef7ffafde5e2b050e359fee Mon Sep 17 00:00:00 2001 From: Joshua Wendland <80349780+joshuawe@users.noreply.github.com> Date: Fri, 16 Feb 2024 11:58:25 +0100 Subject: [PATCH 29/29] bug fix --- src/delphi/eval/token_labelling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/delphi/eval/token_labelling.py b/src/delphi/eval/token_labelling.py index d2d2b45e..80673e03 100644 --- a/src/delphi/eval/token_labelling.py +++ b/src/delphi/eval/token_labelling.py @@ -165,11 +165,11 @@ def label_batch_sentences( corresponding token length where each entry provides the labels/categories for the token. 
Sentence -> Token -> Labels """ - global NLP + global NLP, SPACY_MODEL if NLP is None: # Load english language model - NLP = spacy.load("en_core_web_trf") + NLP = spacy.load(SPACY_MODEL) # labelled tokens, list holding sentences holding tokens holding corresponding token labels labelled_sentences: list[list[dict[str, bool]]] = list()
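
For reference, a minimal sketch of how the patched function is exercised end to end — assuming the series applies cleanly and `en_core_web_sm` is installed; the sample sentence and the expected label are taken from the notebook output above:

```Python
# Sketch only: exercises the lazy model loading fixed in this patch.
from delphi.eval import token_labelling

sentences = ["Peter is a person."]
# The first call loads SPACY_MODEL ("en_core_web_sm") into the module-level NLP;
# subsequent calls reuse the cached pipeline instead of reloading it.
labels = token_labelling.label_batch_sentences(sentences, tokenized=False, verbose=False)
print(labels[0][0]["Is Proper Noun"])  # True for the token "Peter"
```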