Added spaCy component for relation extraction using prompts (#14)
* feat: Added spaCy component for relation extraction using prompts
* Auto dropping of examples if prompt is too long
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* chore: clean tagged files
* chore: clean triplets
* Moved data loader to project (paper) folder
* feat: Integrated in the prompt templates
* fix: fixed tests with new triplets
* fix: tests now pass with the exception of create_prompt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* tests of prompt creation now use made-up data
* add space and fix all tests
* few tweets threads
* chore: remove unused files
* add registry for ChatGPT API prompting
* ci: updated tests
* changed to use ._.relation_triplets attribute
* update default config specification
* feat: Added scoring function for span relations
* ChatGPT template, default task desc, init inherit
* fix: changed a hashtag
* fix: Updated entry points
* fix: Type hint of dict to Dict
* docs: Added example of how to evaluate prompts
* fix: the relation extraction pipeline can now be extracted as intended
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* docs: Updated tutorial
* fix: removed unused variable
* context threads
* chatgpt tests pass
* Run evaluation of templates
* twitter contexts
* add notebook for testing speed of API calling
* optimize twitter_threads
* update threads
* add test for single call to API
* extract triplets from news, cluster, visualize
* concat contexts with end token
* tweet from concat context text
* add network creation code, argparse in umap_hdb.py
* fetching tweets
* better documentation and file naming
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* rm unrelated files, add twitter triplet extraction
* better try-except, remove unused function
* recreate empty generator after catching KeyError
* note down tweet ids
* add openai exceptions, save non-headword triplets
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* fix: Added uncommitted changes
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* ci: removed mypy from CI
* ci: removed support for 3.8

---

Co-authored-by: stinenyhus <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: SaraMøllerØstergaard#1413 <[email protected]>
Co-authored-by: thearol <[email protected]>
1 parent 570ee1f · commit 2c3d5e3 · Showing 43 changed files with 4,038 additions and 532 deletions.
@@ -0,0 +1 @@
from .load import load_gold_triplets, load_api_key  # noqa: F401
@@ -0,0 +1,36 @@
from pathlib import Path
from typing import List, Optional

import spacy
from conspiracies import docs_from_jsonl
from spacy.tokens import Doc


def load_gold_triplets(
    path: Optional[str] = None,
    nlp: Optional[spacy.Language] = None,
) -> List[Doc]:
    if nlp is None:
        nlp = spacy.blank("en")
        nlp.add_pipe("sentencizer")
    if path is None:
        path = "/data/conspiracies/gold_triplets/gold_triplets_tweet.jsonl"
    # check that the file exists
    if not Path(path).exists():
        raise FileNotFoundError(
            f"File {path} not found. You are probably not running this from "
            "Grundtvig, in which case you will have to specify the path.",
        )
    return docs_from_jsonl(path, nlp)


def load_api_key() -> str:
    path = Path("/data") / "conspiracies" / "api_key.txt"
    if not path.exists():
        raise FileNotFoundError(
            f"File {path} not found. You are probably not running this from "
            "Grundtvig, in which case you will have to specify the path.",
        )
    with open(path, "r") as f:
        key = f.read()
    return key
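For reference, a minimal sketch of calling the loader off-server, where the default path under /data/conspiracies does not exist (the module name and the local filename are hypothetical stand-ins; load_api_key takes no arguments and only works where the hardcoded server path exists):

import spacy

from data_loading import load_gold_triplets  # hypothetical module name

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

# Off Grundtvig the default path raises FileNotFoundError, so pass one explicitly.
docs = load_gold_triplets(path="gold_triplets_tweet.jsonl", nlp=nlp)
print(f"Loaded {len(docs)} gold-annotated docs")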
@@ -0,0 +1,148 @@
"""Extract examples of tweets and triplets. Examples are on ucloud in | ||
data/triplet-extraction-gpt/tagged/ and on Grundtvig in. | ||
/data/conspiracies/triplet-extraction-gpt | ||
""" | ||
|
||
import random | ||
import re | ||
from typing import List, Tuple, Dict | ||
from spacy.tokens import Doc | ||
from conspiracies.prompt_relation_extraction.data_classes import ( | ||
SpanTriplet, | ||
DocTriplets, | ||
) | ||


def has_multiple_triplets(spacy_triplets: DocTriplets) -> bool:
    return len(spacy_triplets) > 1


def has_no_triplets(spacy_triplets: DocTriplets) -> bool:
    return len(spacy_triplets) == 0


def has_one_mention(text_doc: Doc) -> bool:
    # True if the tweet contains at least one @mention
    return bool(re.search(r"@\w+", text_doc.text))


def has_multi_word_verb(spacy_triplets: DocTriplets) -> bool:
    # True if any triplet has a predicate spanning more than one token
    return any(len(triplet.predicate) > 1 for triplet in spacy_triplets)


def has_single_word_verb(spacy_triplets: DocTriplets) -> bool:
    # True if any triplet has a single-token predicate
    return any(len(triplet.predicate) == 1 for triplet in spacy_triplets)


def has_multi_word_subj(spacy_triplets: DocTriplets) -> bool:
    # True if any triplet has a subject spanning more than one token
    return any(len(triplet.subject) > 1 for triplet in spacy_triplets)


def has_multi_word_obj(spacy_triplets: DocTriplets) -> bool:
    # True if any triplet has an object spanning more than one token
    return any(len(triplet.object) > 1 for triplet in spacy_triplets)


def criteria_sampling(
    criteria_keys: Dict[str, Callable],
    n_target: int,
    input_examples: List[Doc],
) -> Tuple[List[Doc], List[Doc]]:
    """Try to sample target examples so that as many of the criteria as
    possible are fulfilled in the target tweets.

    If no example fulfills a sampled criterion, fall back to sampling
    randomly.
    """
    target_examples: List[Doc] = []
    while len(target_examples) < n_target:
        try:
            criteria = random.choice(list(criteria_keys.keys()))
            subset_function = criteria_keys[criteria]

            # Keep only the examples that fulfill the sampled criterion;
            # has_one_mention operates on the Doc itself, the rest on the
            # doc's relation triplets.
            if criteria == "has_one_mention":
                useful_examples = list(
                    filter(lambda x: subset_function(x), input_examples),
                )
            else:
                useful_examples = list(
                    filter(
                        lambda x: subset_function(x._.relation_triplets),
                        input_examples,
                    ),
                )

            target = random.choice(useful_examples)
            criteria_keys.pop(criteria)

        # random.choice raises IndexError when the criteria are exhausted or
        # when no example fulfills the sampled criterion
        except IndexError:
            target = random.choice(input_examples)

        target_examples.append(target)
        input_examples.remove(target)
    return target_examples, input_examples


def extract_examples(
    examples: List[Doc],
    n_target: int,
    cv: int = 1,
) -> Tuple[List[List[Doc]], List[List[Doc]]]:
    """Extract examples and targets for the triplet extraction task.

    Tries to pick targets so that as many of the criteria as possible are
    fulfilled in the target tweets. Can also be used to create
    cross-validation sets.

    Args:
        examples (List[Doc]): tagged examples to extract from
        n_target (int): number of target examples to extract
        cv (int, optional): number of cv-sets to make. Defaults to 1,
            which means only one set of targets and examples is extracted.

    Returns:
        Tuple[List[List[Doc]], List[List[Doc]]]: a list of target sets and
            a list of example sets, one of each per cv fold.
    """
    # Manually create a shallow copy, since spacy spans cannot be deepcopied
    example_pool = list(examples)

    target_list: List[List[Doc]] = []
    example_list: List[List[Doc]] = []
    for _ in range(cv):
        criteria_keys = {
            "has_multiple_triplets": has_multiple_triplets,
            "has_no_triplets": has_no_triplets,
            "has_one_mention": has_one_mention,
            "has_multi_word_verb": has_multi_word_verb,
            "has_single_word_verb": has_single_word_verb,
            "has_multi_word_subj": has_multi_word_subj,
            "has_multi_word_obj": has_multi_word_obj,
        }
        extracted_targets, extracted_examples = criteria_sampling(
            criteria_keys,
            n_target,
            example_pool,
        )
        # Targets from earlier folds are recycled as examples in later folds
        example_list.append(
            [ex for prev_target in target_list for ex in prev_target]
            + extracted_examples,
        )
        target_list.append(extracted_targets)

    return target_list, example_list
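Putting the two files together, a hedged sketch of building cross-validation folds of targets and few-shot examples from the gold data (the import paths are hypothetical stand-ins; the function signatures match the code above):

from data_loading import load_gold_triplets  # hypothetical module name
from extract_examples import extract_examples  # hypothetical module name

docs = load_gold_triplets()  # the default path resolves on Grundtvig

# Three folds with five target tweets each; the docs left over in each fold
# serve as few-shot examples in the prompt, and earlier targets are recycled
# as examples for later folds.
targets, examples = extract_examples(docs, n_target=5, cv=3)
for fold, (tgts, exs) in enumerate(zip(targets, examples)):
    print(f"fold {fold}: {len(tgts)} targets, {len(exs)} examples")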