diff --git a/paper/extract_examples.py b/paper/extract_examples.py index cb72cb5..850f819 100644 --- a/paper/extract_examples.py +++ b/paper/extract_examples.py @@ -6,12 +6,9 @@ import random import re -from typing import List, Tuple, Dict +from typing import List, Tuple from spacy.tokens import Doc -from conspiracies.prompt_relation_extraction.data_classes import ( - SpanTriplet, - DocTriplets, -) +from conspiracies.relationextraction.gptprompting.data_classes import DocTriplets def has_multiple_triplets(spacy_triplets: DocTriplets): diff --git a/paper/extract_triplets_newspapers.py b/paper/extract_triplets_newspapers.py index c2c818f..332ab95 100644 --- a/paper/extract_triplets_newspapers.py +++ b/paper/extract_triplets_newspapers.py @@ -1,18 +1,14 @@ from spacy.tokens import Span import time import os -import ndjson from pathlib import Path -from typing import List, Union, Generator, Tuple +from typing import List, Generator import spacy from transformers import AutoTokenizer import argparse # Conspiracies -from conspiracies.HeadWordExtractionComponent import contains_ents -from conspiracies.relationextraction import SpacyRelationExtractor -from conspiracies import wordpiece_length_normalization -from conspiracies.coref import CoreferenceComponent +from conspiracies.preproc import wordpiece_length_normalization from extract_utils import load_ndjson, write_txt diff --git a/paper/extract_triplets_tweets.py b/paper/extract_triplets_tweets.py index a0bc357..04ac9be 100644 --- a/paper/extract_triplets_tweets.py +++ b/paper/extract_triplets_tweets.py @@ -2,24 +2,23 @@ import os import random from pathlib import Path -from typing import List, Optional, Generator, Union, Dict -import spacy +from typing import List, Optional, Generator, Union + +import ndjson +import torch from spacy.tokens import Doc, Span import argparse -import sys from data import load_gold_triplets import spacy from extract_examples import extract_examples -from conspiracies.prompt_relation_extraction import ( +from conspiracies.relationextraction.gptprompting import ( MarkdownPromptTemplate2, PromptTemplate, ) import openai # Conspiracies -from conspiracies.coref import CoreferenceComponent -from conspiracies.relationextraction import SpacyRelationExtractor from extract_utils import write_txt, ndjson_gen from src.concat_split_contexts import ( @@ -324,6 +323,7 @@ def multi2oie_extraction( continue except StopIteration: print("Stopping iteration because of StopIteration exception") + # TODO: the last iteration happens twice with this logic run = False subjects, predicates, objects, triplets = [], [], [], [] for triplet in doc._.relation_triplets: diff --git a/paper/src/ents_heads_extraction.py b/paper/src/ents_heads_extraction.py index 4236a19..3e20e39 100644 --- a/paper/src/ents_heads_extraction.py +++ b/paper/src/ents_heads_extraction.py @@ -2,7 +2,7 @@ import spacy from relationextraction import SpacyRelationExtractor # noqa -from conspiracies.HeadWordExtractionComponent import contains_ents +from conspiracies.headwordextraction.headwordextraction_comp import contains_ents def main(): diff --git a/pyproject.toml b/pyproject.toml index 7bdcd4b..65a4789 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,8 @@ dependencies = [ "numpy>=1.19.5,<1.24.0", "pandas>=1.1.5,<1.5.0", "jsonlines>=3.1.0,<3.2.0", + "openai", + "ndjson" ] [project.license] @@ -80,7 +82,7 @@ content-type = "text/markdown" [project.entry-points.spacy_factories] -"conspiracies/prompt_relation_extraction" = "conspiracies.prompt_relation_extraction.prompt_relation_component:create_prompt_relation_extraction_component" +"conspiracies/relationextraction/gptprompting" = "conspiracies.relationextraction.gptprompting:create_prompt_relation_extraction_component" [build-system] diff --git a/src/conspiracies/__init__.py b/src/conspiracies/__init__.py index fe0b7be..f7ec6e1 100644 --- a/src/conspiracies/__init__.py +++ b/src/conspiracies/__init__.py @@ -1,16 +1,2 @@ -from .HeadWordExtractionComponent import HeadwordsExtractionComponent # noqa F401 -from .HeadWordExtractionComponent import create_headwords_component # noqa F401 from .registry import registry # noqa F401 -from .prompt_relation_extraction import ( # noqa F401 - PromptTemplate1, - PromptTemplate2, - MarkdownPromptTemplate1, - MarkdownPromptTemplate2, - XMLStylePromptTemplate, - chatGPTPromptTemplate, - SpanTriplet, - StringTriplet, - DocTriplets, -) -from .utils import docs_from_jsonl, docs_to_jsonl # noqa F401 -from .wordpiece_length_normalization import wordpiece_length_normalization # noqa F401 +from .doc_utils import docs_from_jsonl, docs_to_jsonl # noqa F401 diff --git a/src/conspiracies/coref/__init__.py b/src/conspiracies/coref/__init__.py index 3d12529..fa728c7 100644 --- a/src/conspiracies/coref/__init__.py +++ b/src/conspiracies/coref/__init__.py @@ -1,2 +1,2 @@ -from .CoreferenceModel import CoreferenceModel # noqa -from .CoreferenceComponent import CoreferenceComponent, create_coref_component # noqa +from .coref_model import CoreferenceModel # noqa +from .coref_comp import CoreferenceComponent, create_coref_component # noqa diff --git a/src/conspiracies/coref/CoreferenceComponent.py b/src/conspiracies/coref/coref_comp.py similarity index 100% rename from src/conspiracies/coref/CoreferenceComponent.py rename to src/conspiracies/coref/coref_comp.py diff --git a/src/conspiracies/coref/CoreferenceModel.py b/src/conspiracies/coref/coref_model.py similarity index 100% rename from src/conspiracies/coref/CoreferenceModel.py rename to src/conspiracies/coref/coref_model.py diff --git a/src/conspiracies/utils.py b/src/conspiracies/doc_utils.py similarity index 95% rename from src/conspiracies/utils.py rename to src/conspiracies/doc_utils.py index 70e958b..680c60c 100644 --- a/src/conspiracies/utils.py +++ b/src/conspiracies/doc_utils.py @@ -5,7 +5,10 @@ from spacy.language import Language from spacy.tokens import Doc -from .prompt_relation_extraction import DocTriplets, SpanTriplet +from conspiracies.relationextraction.gptprompting import ( + DocTriplets, + SpanTriplet, +) def _doc_to_json(doc: Doc): diff --git a/src/conspiracies/headwordextraction/__init__.py b/src/conspiracies/headwordextraction/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/conspiracies/HeadWordExtractionComponent.py b/src/conspiracies/headwordextraction/headwordextraction_comp.py similarity index 100% rename from src/conspiracies/HeadWordExtractionComponent.py rename to src/conspiracies/headwordextraction/headwordextraction_comp.py diff --git a/src/conspiracies/preproc/__init__.py b/src/conspiracies/preproc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/conspiracies/wordpiece_length_normalization.py b/src/conspiracies/preproc/wordpiece_length_normalization.py similarity index 100% rename from src/conspiracies/wordpiece_length_normalization.py rename to src/conspiracies/preproc/wordpiece_length_normalization.py diff --git a/src/conspiracies/relationextraction/__init__.py b/src/conspiracies/relationextraction/__init__.py index 251952c..3c04b0f 100644 --- a/src/conspiracies/relationextraction/__init__.py +++ b/src/conspiracies/relationextraction/__init__.py @@ -1,2 +1,2 @@ -from .knowledge_triplets import KnowledgeTriplets # noqa F401 -from .wrap_model_spacy import SpacyRelationExtractor # noqa F401 +from .multioie2.knowledge_triplets import KnowledgeTriplets # noqa F401 +from .multioie2.multi2oie_comp import SpacyRelationExtractor # noqa F401 diff --git a/src/conspiracies/prompt_relation_extraction/__init__.py b/src/conspiracies/relationextraction/gptprompting/__init__.py similarity index 87% rename from src/conspiracies/prompt_relation_extraction/__init__.py rename to src/conspiracies/relationextraction/gptprompting/__init__.py index 9e23b37..f079293 100644 --- a/src/conspiracies/prompt_relation_extraction/__init__.py +++ b/src/conspiracies/relationextraction/gptprompting/__init__.py @@ -9,7 +9,7 @@ chatGPTPromptTemplate, ) -from .prompt_relation_component import ( # noqa F401 +from .prompt_relation_comp import ( # noqa F401 create_prompt_relation_extraction_component, score_open_relations, ) diff --git a/src/conspiracies/prompt_relation_extraction/data_classes.py b/src/conspiracies/relationextraction/gptprompting/data_classes.py similarity index 100% rename from src/conspiracies/prompt_relation_extraction/data_classes.py rename to src/conspiracies/relationextraction/gptprompting/data_classes.py diff --git a/src/conspiracies/prompt_relation_extraction/prompt_apis.py b/src/conspiracies/relationextraction/gptprompting/prompt_apis.py similarity index 94% rename from src/conspiracies/prompt_relation_extraction/prompt_apis.py rename to src/conspiracies/relationextraction/gptprompting/prompt_apis.py index 9dcba7a..939681c 100644 --- a/src/conspiracies/prompt_relation_extraction/prompt_apis.py +++ b/src/conspiracies/relationextraction/gptprompting/prompt_apis.py @@ -2,10 +2,10 @@ import time from typing import Any, Dict, List -from ..registry import registry +from conspiracies.registry import registry from spacy.tokens import Doc -from conspiracies.prompt_relation_extraction import PromptTemplate +from conspiracies.relationextraction.gptprompting.prompt_templates import PromptTemplate @registry.prompt_apis.register("conspiracies/openai_gpt3_api") @@ -77,8 +77,9 @@ def openai_prompt(targets: List[str]) -> List[str]: openai.api_key = api_key message_example = prompt_template.create_prompt("test") - assert ( - type(message_example) == list and type(message_example[0]) == dict + assert isinstance(message_example, list) and isinstance( + message_example[0], + dict, ), "ChatGPT requires a list of message dicts. Consider using chatGPTPromptTemplate as template." # noqa: E501 responses: List[str] = [] diff --git a/src/conspiracies/prompt_relation_extraction/prompt_relation_component.py b/src/conspiracies/relationextraction/gptprompting/prompt_relation_comp.py similarity index 99% rename from src/conspiracies/prompt_relation_extraction/prompt_relation_component.py rename to src/conspiracies/relationextraction/gptprompting/prompt_relation_comp.py index a7be7c9..cf3ee25 100644 --- a/src/conspiracies/prompt_relation_extraction/prompt_relation_component.py +++ b/src/conspiracies/relationextraction/gptprompting/prompt_relation_comp.py @@ -9,7 +9,7 @@ from spacy.tokens import Doc from spacy.training.example import Example -from ..registry import registry +from conspiracies.registry import registry from .data_classes import DocTriplets, SpanTriplet from .prompt_apis import create_openai_chatgpt_prompt_api # noqa: F401 diff --git a/src/conspiracies/prompt_relation_extraction/prompt_templates.py b/src/conspiracies/relationextraction/gptprompting/prompt_templates.py similarity index 100% rename from src/conspiracies/prompt_relation_extraction/prompt_templates.py rename to src/conspiracies/relationextraction/gptprompting/prompt_templates.py diff --git a/src/conspiracies/prompt_relation_extraction/utils.py b/src/conspiracies/relationextraction/gptprompting/prompt_utils.py similarity index 100% rename from src/conspiracies/prompt_relation_extraction/utils.py rename to src/conspiracies/relationextraction/gptprompting/prompt_utils.py diff --git a/src/conspiracies/relationextraction/multioie2/__init__.py b/src/conspiracies/relationextraction/multioie2/__init__.py new file mode 100644 index 0000000..a4e4268 --- /dev/null +++ b/src/conspiracies/relationextraction/multioie2/__init__.py @@ -0,0 +1,2 @@ +from .knowledge_triplets import KnowledgeTriplets # noqa F401 +from .multi2oie_comp import SpacyRelationExtractor # noqa F401 diff --git a/src/conspiracies/relationextraction/dataset.py b/src/conspiracies/relationextraction/multioie2/dataset.py similarity index 100% rename from src/conspiracies/relationextraction/dataset.py rename to src/conspiracies/relationextraction/multioie2/dataset.py diff --git a/src/conspiracies/relationextraction/extract.py b/src/conspiracies/relationextraction/multioie2/extract.py similarity index 100% rename from src/conspiracies/relationextraction/extract.py rename to src/conspiracies/relationextraction/multioie2/extract.py diff --git a/src/conspiracies/relationextraction/knowledge_triplets.py b/src/conspiracies/relationextraction/multioie2/knowledge_triplets.py similarity index 100% rename from src/conspiracies/relationextraction/knowledge_triplets.py rename to src/conspiracies/relationextraction/multioie2/knowledge_triplets.py diff --git a/src/conspiracies/relationextraction/model.py b/src/conspiracies/relationextraction/multioie2/model.py similarity index 100% rename from src/conspiracies/relationextraction/model.py rename to src/conspiracies/relationextraction/multioie2/model.py diff --git a/src/conspiracies/relationextraction/wrap_model_spacy.py b/src/conspiracies/relationextraction/multioie2/multi2oie_comp.py similarity index 99% rename from src/conspiracies/relationextraction/wrap_model_spacy.py rename to src/conspiracies/relationextraction/multioie2/multi2oie_comp.py index 988f0b7..ecab251 100644 --- a/src/conspiracies/relationextraction/wrap_model_spacy.py +++ b/src/conspiracies/relationextraction/multioie2/multi2oie_comp.py @@ -9,7 +9,7 @@ from transformers import AutoTokenizer from .knowledge_triplets import KnowledgeTriplets -from .util import ( +from .multioie2_utils import ( install_extension, match_extraction_spans_to_wp, wp2tokid, diff --git a/src/conspiracies/relationextraction/util.py b/src/conspiracies/relationextraction/multioie2/multioie2_utils.py similarity index 100% rename from src/conspiracies/relationextraction/util.py rename to src/conspiracies/relationextraction/multioie2/multioie2_utils.py diff --git a/src/conspiracies/relationextraction/other/__init__.py b/src/conspiracies/relationextraction/multioie2/other/__init__.py similarity index 100% rename from src/conspiracies/relationextraction/other/__init__.py rename to src/conspiracies/relationextraction/multioie2/other/__init__.py diff --git a/src/conspiracies/relationextraction/other/bio.py b/src/conspiracies/relationextraction/multioie2/other/bio.py similarity index 100% rename from src/conspiracies/relationextraction/other/bio.py rename to src/conspiracies/relationextraction/multioie2/other/bio.py diff --git a/src/conspiracies/relationextraction/other/utils.py b/src/conspiracies/relationextraction/multioie2/other/utils.py similarity index 100% rename from src/conspiracies/relationextraction/other/utils.py rename to src/conspiracies/relationextraction/multioie2/other/utils.py diff --git a/tests/test_coref_component.py b/tests/test_coref_comp.py similarity index 100% rename from tests/test_coref_component.py rename to tests/test_coref_comp.py diff --git a/tests/test_CoreferenceModel.py b/tests/test_coref_model.py similarity index 88% rename from tests/test_CoreferenceModel.py rename to tests/test_coref_model.py index 508df5c..d21f52b 100644 --- a/tests/test_CoreferenceModel.py +++ b/tests/test_coref_model.py @@ -1,6 +1,6 @@ from .utils import nlp_da # noqa -from conspiracies.coref.CoreferenceModel import CoreferenceModel +from conspiracies.coref.coref_model import CoreferenceModel def test_CoreferenceModel(nlp_da): # noqa diff --git a/tests/test_data/prompt_data.py b/tests/test_data/prompt_data.py index 1bd7b85..79280d4 100644 --- a/tests/test_data/prompt_data.py +++ b/tests/test_data/prompt_data.py @@ -1,7 +1,11 @@ from typing import List import spacy -from conspiracies import DocTriplets, SpanTriplet, StringTriplet +from conspiracies.relationextraction.gptprompting import ( + DocTriplets, + SpanTriplet, + StringTriplet, +) from spacy.tokens import Doc test_thread = """@user2: I was hurt. END diff --git a/tests/test_ents_filter.py b/tests/test_ents_filter.py index 777fba0..cc5efba 100644 --- a/tests/test_ents_filter.py +++ b/tests/test_ents_filter.py @@ -1,7 +1,7 @@ import spacy from spacy.tokens import Doc, Span -from conspiracies.HeadWordExtractionComponent import contains_ents +from conspiracies.headwordextraction.headwordextraction_comp import contains_ents def test_ents_filter(): diff --git a/tests/test_prompt_data_classes.py b/tests/test_prompt_data_classes.py index 2b16e42..9a0be98 100644 --- a/tests/test_prompt_data_classes.py +++ b/tests/test_prompt_data_classes.py @@ -1,6 +1,6 @@ import pytest import spacy -from conspiracies.prompt_relation_extraction import ( +from conspiracies.relationextraction.gptprompting import ( DocTriplets, SpanTriplet, StringTriplet, diff --git a/tests/test_prompt_relation_evaluate.py b/tests/test_prompt_relation_evaluate.py index 2c88dc9..cb70585 100644 --- a/tests/test_prompt_relation_evaluate.py +++ b/tests/test_prompt_relation_evaluate.py @@ -1,7 +1,12 @@ import numpy as np -from conspiracies.prompt_relation_extraction import DocTriplets, score_open_relations +from conspiracies.relationextraction.gptprompting import ( + DocTriplets, +) from spacy.training import Example +from conspiracies.relationextraction.gptprompting.prompt_relation_comp import ( + score_open_relations, +) from .utils import docs_with_triplets # noqa F401 diff --git a/tests/test_prompt_relationextraction_component.py b/tests/test_prompt_relationextraction_component.py index 6d5905b..a156655 100644 --- a/tests/test_prompt_relationextraction_component.py +++ b/tests/test_prompt_relationextraction_component.py @@ -2,7 +2,7 @@ import pytest from confection import registry -from conspiracies import SpanTriplet, StringTriplet +from conspiracies.relationextraction.gptprompting import SpanTriplet, StringTriplet from spacy.language import Language from .test_prompt_template_parse_prompt import ( diff --git a/tests/test_prompt_template_create_prompt.py b/tests/test_prompt_template_create_prompt.py index 3d7f04b..3c34be2 100644 --- a/tests/test_prompt_template_create_prompt.py +++ b/tests/test_prompt_template_create_prompt.py @@ -1,5 +1,5 @@ import pytest -from conspiracies import ( +from conspiracies.relationextraction.gptprompting.prompt_templates import ( MarkdownPromptTemplate1, MarkdownPromptTemplate2, PromptTemplate1, diff --git a/tests/test_prompt_template_parse_prompt.py b/tests/test_prompt_template_parse_prompt.py index a57574f..83e8fec 100644 --- a/tests/test_prompt_template_parse_prompt.py +++ b/tests/test_prompt_template_parse_prompt.py @@ -1,5 +1,5 @@ import pytest -from conspiracies import ( +from conspiracies.relationextraction.gptprompting.prompt_templates import ( MarkdownPromptTemplate1, MarkdownPromptTemplate2, PromptTemplate1, diff --git a/tests/test_relationextraction_component.py b/tests/test_relationextraction_component.py index f92c285..ab93f07 100644 --- a/tests/test_relationextraction_component.py +++ b/tests/test_relationextraction_component.py @@ -1,6 +1,5 @@ import pytest -from conspiracies.relationextraction import SpacyRelationExtractor # noqa F401 from .utils import nlp_da # noqa F401 diff --git a/tests/test_utils.py b/tests/test_utils.py index 79a4c48..5756aff 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,7 +2,8 @@ import pytest import spacy -from conspiracies import DocTriplets, SpanTriplet, docs_from_jsonl, docs_to_jsonl +from conspiracies import docs_from_jsonl, docs_to_jsonl +from conspiracies.relationextraction.gptprompting import DocTriplets, SpanTriplet from spacy.tokens import Doc from .utils import docs_with_triplets # noqa: F401