From 99360c979e66016b4902840a159f4d07af71b3b7 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Tue, 29 Oct 2024 08:35:30 -0500 Subject: [PATCH 01/12] Throw error in writer on extra columns --- src/koza/io/writer/tsv_writer.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/koza/io/writer/tsv_writer.py b/src/koza/io/writer/tsv_writer.py index 5c586bd..7bae5be 100644 --- a/src/koza/io/writer/tsv_writer.py +++ b/src/koza/io/writer/tsv_writer.py @@ -2,8 +2,9 @@ # NOTE - May want to rename to KGXWriter at some point, if we develop writers for other models non biolink/kgx specific from pathlib import Path -from typing import Dict, Iterable, List, Literal, Set, Union +from typing import Dict, Iterable, List, Literal, Set, Tuple, Union +from numpy.f2py.auxfuncs import throw_error from ordered_set import OrderedSet from koza.converter.kgx_converter import KGXConverter @@ -69,6 +70,13 @@ def write_row(self, record: Dict, record_type: Literal["node", "edge"]) -> None: fh = self.nodeFH if record_type == "node" else self.edgeFH columns = self.node_columns if record_type == "node" else self.edge_columns row = build_export_row(record, list_delimiter=self.list_delimiter) + + # Throw error if the record has extra columns + columns_tuple = tuple(columns) + row_keys_tuple = tuple(row.keys()) + if self.has_extra_columns(row_keys_tuple, columns_tuple): + throw_error(f"Record has extra columns: {set(row.keys()) - set(columns)} not defined in {record_type}") + values = [] if record_type == "node": row["id"] = record["id"] @@ -87,6 +95,19 @@ def finalize(self): if hasattr(self, "edgeFH"): self.edgeFH.close() + @staticmethod + def has_extra_columns(row_keys: Tuple[str, ...], columns_tuple: Tuple[str, ...]) -> bool: + """Check if a row has extra columns. + + Args: + row_keys: Tuple[str, ...] - A tuple of row keys + columns_tuple: Tuple[str, ...] - A tuple of columns + + Returns: + bool - True if row has extra columns, False otherwise + """ + return not set(row_keys).issubset(set(columns_tuple)) + @staticmethod def _order_columns(cols: Set, record_type: Literal["node", "edge"]) -> OrderedSet: """Arrange node or edge columns in a defined order. From 9a4ab383340c354e7c549803d9197042b646adb4 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Thu, 7 Nov 2024 11:59:57 -0600 Subject: [PATCH 02/12] Abstract init and write up to KozaWriter --- src/koza/io/writer/jsonl_writer.py | 35 ++++++------------------ src/koza/io/writer/tsv_writer.py | 35 ++++++++---------------- src/koza/io/writer/writer.py | 44 ++++++++++++++++++++++++++---- 3 files changed, 59 insertions(+), 55 deletions(-) diff --git a/src/koza/io/writer/jsonl_writer.py b/src/koza/io/writer/jsonl_writer.py index 2987879..71add14 100644 --- a/src/koza/io/writer/jsonl_writer.py +++ b/src/koza/io/writer/jsonl_writer.py @@ -8,19 +8,9 @@ class JSONLWriter(KozaWriter): - def __init__( - self, - output_dir: str, - source_name: str, - node_properties: List[str], - edge_properties: Optional[List[str]] = [], - sssom_config: SSSOMConfig = None, - ): - self.output_dir = output_dir - self.source_name = source_name - self.sssom_config = sssom_config - - self.converter = KGXConverter() + def __init__(self, output_dir: str, source_name: str, node_properties: List[str], + edge_properties: Optional[List[str]] = None, sssom_config: SSSOMConfig = None): + super().__init__(output_dir, source_name, node_properties, edge_properties, sssom_config) os.makedirs(output_dir, exist_ok=True) if node_properties: @@ -28,20 +18,13 @@ def __init__( if edge_properties: self.edgeFH = open(f"{output_dir}/{source_name}_edges.jsonl", "w") - def write(self, entities: Iterable): - (nodes, edges) = self.converter.convert(entities) - - if nodes: - for n in nodes: - node = json.dumps(n, ensure_ascii=False) - self.nodeFH.write(node + '\n') + def write_edge(self, edge: dict): + edge = json.dumps(edge, ensure_ascii=False) + self.edgeFH.write(edge + '\n') - if edges: - for e in edges: - if self.sssom_config: - e = self.sssom_config.apply_mapping(e) - edge = json.dumps(e, ensure_ascii=False) - self.edgeFH.write(edge + '\n') + def write_node(self, node: dict): + node = json.dumps(node, ensure_ascii=False) + self.nodeFH.write(node + '\n') def finalize(self): if hasattr(self, 'nodeFH'): diff --git a/src/koza/io/writer/tsv_writer.py b/src/koza/io/writer/tsv_writer.py index 7bae5be..5d266f5 100644 --- a/src/koza/io/writer/tsv_writer.py +++ b/src/koza/io/writer/tsv_writer.py @@ -7,7 +7,7 @@ from numpy.f2py.auxfuncs import throw_error from ordered_set import OrderedSet -from koza.converter.kgx_converter import KGXConverter +# from koza.converter.kgx_converter import KGXConverter from koza.io.utils import build_export_row from koza.io.writer.writer import KozaWriter from koza.model.config.sssom_config import SSSOMConfig @@ -22,43 +22,31 @@ def __init__( edge_properties: List[str] = None, sssom_config: SSSOMConfig = None, ): - self.basename = source_name - self.dirname = output_dir + super().__init__(output_dir, source_name, node_properties, edge_properties, sssom_config) self.delimiter = "\t" self.list_delimiter = "|" - self.converter = KGXConverter() - self.sssom_config = sssom_config - Path(self.dirname).mkdir(parents=True, exist_ok=True) + Path(self.output_dir).mkdir(parents=True, exist_ok=True) if node_properties: # Make node file - self.node_columns = TSVWriter._order_columns(node_properties, "node") - self.nodes_file_name = Path(self.dirname if self.dirname else "", f"{self.basename}_nodes.tsv") + self.node_columns = TSVWriter._order_columns(set(node_properties), "node") + self.nodes_file_name = Path(self.output_dir if self.output_dir else "", f"{self.source_name}_nodes.tsv") self.nodeFH = open(self.nodes_file_name, "w") self.nodeFH.write(self.delimiter.join(self.node_columns) + "\n") if edge_properties: # Make edge file if sssom_config: edge_properties = self.add_sssom_columns(edge_properties) - self.edge_columns = TSVWriter._order_columns(edge_properties, "edge") - self.edges_file_name = Path(self.dirname if self.dirname else "", f"{self.basename}_edges.tsv") + self.edge_columns = TSVWriter._order_columns(set(edge_properties), "edge") + self.edges_file_name = Path(self.output_dir if self.output_dir else "", f"{self.source_name}_edges.tsv") self.edgeFH = open(self.edges_file_name, "w") self.edgeFH.write(self.delimiter.join(self.edge_columns) + "\n") - def write(self, entities: Iterable) -> None: - """Write an entities object to separate node and edge .tsv files""" + def write_edge(self, edge: dict): + self.write_row(edge, record_type="edge") - nodes, edges = self.converter.convert(entities) - - if nodes: - for node in nodes: - self.write_row(node, record_type="node") - - if edges: - for edge in edges: - if self.sssom_config: - edge = self.sssom_config.apply_mapping(edge) - self.write_row(edge, record_type="edge") + def write_node(self, node: dict): + self.write_row(node, record_type="node") def write_row(self, record: Dict, record_type: Literal["node", "edge"]) -> None: """Write a row to the underlying store. @@ -118,6 +106,7 @@ def _order_columns(cols: Set, record_type: Literal["node", "edge"]) -> OrderedSe Returns: OrderedSet - A set with elements in a defined order """ + core_columns = set() if record_type == "node": core_columns = OrderedSet(["id", "category", "name", "description", "xref", "provided_by", "synonym"]) elif record_type == "edge": diff --git a/src/koza/io/writer/writer.py b/src/koza/io/writer/writer.py index 881a5ea..e08d117 100644 --- a/src/koza/io/writer/writer.py +++ b/src/koza/io/writer/writer.py @@ -1,18 +1,50 @@ from abc import ABC, abstractmethod -from typing import Iterable +from pathlib import Path +from typing import Iterable, List, Union + +from koza.converter.kgx_converter import KGXConverter +from koza.model.config.sssom_config import SSSOMConfig class KozaWriter(ABC): """ An abstract base class for all koza writers - - # @abstractmethod - # def writeheader(self) -> Optional[int]: - # pass """ + def __init__( + self, + output_dir: Union[str, Path], + source_name: str, + node_properties: List[str] = None, + edge_properties: List[str] = None, + sssom_config: SSSOMConfig = None, + ): + self.output_dir = output_dir + self.source_name = source_name + self.node_columns = node_properties + self.edge_columns = edge_properties + self.sssom_config = sssom_config + + self.converter = KGXConverter() - @abstractmethod def write(self, entities: Iterable): + nodes, edges = self.converter.convert(entities) + + if nodes: + for node in nodes: + self.write_node(node) + + if edges: + for edge in edges: + if self.sssom_config: + edge = self.sssom_config.apply_mapping(edge) + self.write_edge(edge) + + @abstractmethod + def write_edge(self, edge: dict): + pass + + @abstractmethod + def write_node(self, node: dict): pass @abstractmethod From 275f9912119d03d441bd83f00d66c29256ab0623 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Thu, 7 Nov 2024 13:58:58 -0600 Subject: [PATCH 03/12] Add extra parameter check to writer --- src/koza/io/writer/writer.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/koza/io/writer/writer.py b/src/koza/io/writer/writer.py index e08d117..b965930 100644 --- a/src/koza/io/writer/writer.py +++ b/src/koza/io/writer/writer.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod +from functools import lru_cache from pathlib import Path -from typing import Iterable, List, Union +from typing import Iterable, List, Union, Tuple from koza.converter.kgx_converter import KGXConverter from koza.model.config.sssom_config import SSSOMConfig @@ -26,19 +27,34 @@ def __init__( self.converter = KGXConverter() - def write(self, entities: Iterable): + def write(self, entities: Iterable, skip_checks: bool = False): nodes, edges = self.converter.convert(entities) if nodes: for node in nodes: + self.check_extra_fields(tuple(node.keys()), tuple(self.node_columns)) self.write_node(node) if edges: for edge in edges: if self.sssom_config: edge = self.sssom_config.apply_mapping(edge) + self.check_extra_fields(tuple(edge.keys()), tuple(self.edge_columns)) self.write_edge(edge) + @staticmethod + @lru_cache(maxsize=None) + def check_extra_fields(row_keys: Tuple, columns: Tuple) -> None: + """ + Check for extra fields in the row that are not in the columns + """ + + extra_fields = not set(row_keys).issubset(set(columns)) + if extra_fields: + raise ValueError(f"Extra fields found in row: {set(row_keys) - set(columns)}") + + pass + @abstractmethod def write_edge(self, edge: dict): pass From 017f7a9259af70650ed20d7fce8628a8346f8c7c Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:40:55 -0600 Subject: [PATCH 04/12] Fix tests for extra parameters check --- .../declarative-protein-links-detailed.yaml | 44 ++++++++++++- .../custom-map-protein-links-detailed.yaml | 46 ++++++++++++- .../map-protein-links-detailed.yaml | 46 ++++++++++++- examples/string/protein-links-detailed.yaml | 42 ++++++++++++ tests/unit/test_tsvwriter_node_and_edge.py | 65 ++++++++++++++++++- tests/unit/test_tsvwriter_node_only.py | 20 +++++- 6 files changed, 256 insertions(+), 7 deletions(-) diff --git a/examples/string-declarative/declarative-protein-links-detailed.yaml b/examples/string-declarative/declarative-protein-links-detailed.yaml index aa0345c..f49fe2b 100644 --- a/examples/string-declarative/declarative-protein-links-detailed.yaml +++ b/examples/string-declarative/declarative-protein-links-detailed.yaml @@ -38,6 +38,17 @@ node_properties: - 'id' - 'category' - 'provided_by' + - 'iri' + - 'name' + - 'synonym' + - 'has_attribute' + - 'deprecated' + - 'full_name' + - 'in_taxon' + - 'xref' + - 'in_taxon_label' + - 'description' + - 'type' edge_properties: - 'id' @@ -46,4 +57,35 @@ edge_properties: - 'object' - 'category' - 'relation' - - 'provided_by' \ No newline at end of file + - 'provided_by' + - 'object_closure' + - 'negated' + - 'qualifier' + - 'name' + - 'deprecated' + - 'original_subject' + - 'has_evidence' + - 'description' + - 'subject_label_closure' + - 'aggregator_knowledge_source' + - 'has_attribute' + - 'type' + - 'timepoint' + - 'subject_category_closure' + - 'object_category' + - 'primary_knowledge_source' + - 'original_object' + - 'knowledge_source' + - 'iri' + - 'subject_namespace' + - 'subject_closure' + - 'object_namespace' + - 'object_category_closure' + - 'object_label_closure' + - 'agent_type' + - 'knowledge_level' + - 'publications' + - 'retrieval_source_ids' + - 'original_predicate' + - 'subject_category' + - 'qualifiers' diff --git a/examples/string-w-custom-map/custom-map-protein-links-detailed.yaml b/examples/string-w-custom-map/custom-map-protein-links-detailed.yaml index 6bf01cb..433a49e 100644 --- a/examples/string-w-custom-map/custom-map-protein-links-detailed.yaml +++ b/examples/string-w-custom-map/custom-map-protein-links-detailed.yaml @@ -35,6 +35,19 @@ node_properties: - 'id' - 'category' - 'provided_by' + - 'deprecated' + - 'full_name' + - 'in_taxon_label' + - 'has_attribute' + - 'type' + - 'symbol' + - 'in_taxon' + - 'has_biological_sequence' + - 'xref' + - 'name' + - 'iri' + - 'synonym' + - 'description' edge_properties: - 'id' @@ -43,4 +56,35 @@ edge_properties: - 'object' - 'category' - 'relation' - - 'provided_by' \ No newline at end of file + - 'provided_by' + - 'knowledge_level' + - 'type' + - 'has_attribute' + - 'original_subject' + - 'subject_category' + - 'object_closure' + - 'description' + - 'object_category_closure' + - 'subject_closure' + - 'original_predicate' + - 'has_evidence' + - 'object_category' + - 'subject_label_closure' + - 'iri' + - 'aggregator_knowledge_source' + - 'original_object' + - 'name' + - 'primary_knowledge_source' + - 'subject_namespace' + - 'subject_category_closure' + - 'deprecated' + - 'timepoint' + - 'qualifiers' + - 'agent_type' + - 'object_namespace' + - 'retrieval_source_ids' + - 'object_label_closure' + - 'publications' + - 'qualifier' + - 'knowledge_source' + - 'negated' \ No newline at end of file diff --git a/examples/string-w-map/map-protein-links-detailed.yaml b/examples/string-w-map/map-protein-links-detailed.yaml index 53dab1c..2f5cd33 100644 --- a/examples/string-w-map/map-protein-links-detailed.yaml +++ b/examples/string-w-map/map-protein-links-detailed.yaml @@ -35,6 +35,19 @@ node_properties: - 'id' - 'category' - 'provided_by' + - 'deprecated' + - 'has_attribute' + - 'iri' + - 'in_taxon' + - 'xref' + - 'symbol' + - 'description' + - 'type' + - 'name' + - 'synonym' + - 'full_name' + - 'in_taxon_label' + - 'has_biological_sequence' edge_properties: - 'id' @@ -43,4 +56,35 @@ edge_properties: - 'object' - 'category' - 'relation' - - 'provided_by' \ No newline at end of file + - 'provided_by' + - 'subject_closure' + - 'object_closure' + - 'name' + - 'subject_namespace' + - 'aggregator_knowledge_source' + - 'object_category' + - 'type' + - 'original_predicate' + - 'subject_label_closure' + - 'retrieval_source_ids' + - 'agent_type' + - 'primary_knowledge_source' + - 'iri' + - 'knowledge_source' + - 'qualifiers' + - 'timepoint' + - 'object_namespace' + - 'negated' + - 'object_category_closure' + - 'deprecated' + - 'original_object' + - 'original_subject' + - 'subject_category' + - 'has_attribute' + - 'publications' + - 'subject_category_closure' + - 'qualifier' + - 'object_label_closure' + - 'description' + - 'knowledge_level' + - 'has_evidence' \ No newline at end of file diff --git a/examples/string/protein-links-detailed.yaml b/examples/string/protein-links-detailed.yaml index d41cb4a..9875bbc 100644 --- a/examples/string/protein-links-detailed.yaml +++ b/examples/string/protein-links-detailed.yaml @@ -24,6 +24,17 @@ node_properties: - 'id' - 'category' - 'provided_by' + - 'iri' + - 'name' + - 'synonym' + - 'has_attribute' + - 'deprecated' + - 'full_name' + - 'in_taxon' + - 'xref' + - 'in_taxon_label' + - 'description' + - 'type' edge_properties: - 'id' @@ -33,3 +44,34 @@ edge_properties: - 'category' - 'relation' - 'provided_by' + - 'object_closure' + - 'negated' + - 'qualifier' + - 'name' + - 'deprecated' + - 'original_subject' + - 'has_evidence' + - 'description' + - 'subject_label_closure' + - 'aggregator_knowledge_source' + - 'has_attribute' + - 'type' + - 'timepoint' + - 'subject_category_closure' + - 'object_category' + - 'primary_knowledge_source' + - 'original_object' + - 'knowledge_source' + - 'iri' + - 'subject_namespace' + - 'subject_closure' + - 'object_namespace' + - 'object_category_closure' + - 'object_label_closure' + - 'agent_type' + - 'knowledge_level' + - 'publications' + - 'retrieval_source_ids' + - 'original_predicate' + - 'subject_category' + - 'qualifiers' diff --git a/tests/unit/test_tsvwriter_node_and_edge.py b/tests/unit/test_tsvwriter_node_and_edge.py index 9456653..70be079 100644 --- a/tests/unit/test_tsvwriter_node_and_edge.py +++ b/tests/unit/test_tsvwriter_node_and_edge.py @@ -23,7 +23,24 @@ def test_tsv_writer(): ) ent = [g, d, a] - node_properties = ["id", "category", "symbol", "in_taxon", "provided_by", "source"] + node_properties = [ + "id", + "category", + "symbol", + "in_taxon", + "provided_by", + "source", + 'has_biological_sequence', + 'type', + 'xref', + 'description', + 'in_taxon_label', + 'synonym', + 'deprecated', + 'has_attribute', + 'full_name', + 'iri', 'name', + ] edge_properties = [ "id", "subject", @@ -34,6 +51,46 @@ def test_tsv_writer(): "has_total", "publications", "provided_by", + 'subject_category', + 'object_direction_qualifier', + 'sex_qualifier', + 'negated', + 'has_percentage', + 'aggregator_knowledge_source', + 'has_evidence', + 'qualified_predicate', + 'qualifiers', + 'object_category', + 'timepoint', + 'subject_label_closure', + 'agent_type', + 'has_attribute', + 'category', + 'original_predicate', + 'iri', + 'frequency_qualifier', + 'type', + 'subject_namespace', + 'subject_closure', + 'object_label_closure', + 'object_namespace', + 'original_object', + 'subject_category_closure', + 'name', + 'has_quotient', + 'knowledge_level', + 'knowledge_source', + 'description', + 'subject_direction_qualifier', + 'deprecated', + 'original_subject', + 'object_category_closure', + 'qualifier', + 'retrieval_source_ids', + 'primary_knowledge_source', + 'object_aspect_qualifier', + 'object_closure', + 'subject_aspect_qualifier' ] outdir = "output/tests" @@ -50,13 +107,15 @@ def test_tsv_writer(): # read the node and edges tsv files and confirm the expected values with open("{}/{}_nodes.tsv".format(outdir, outfile), "r") as f: lines = f.readlines() - assert lines[1] == "HGNC:11603\tbiolink:Gene\t\tNCBITaxon:9606\t\tTBX4\n" + # assert lines[1] == "HGNC:11603\tbiolink:Gene\t\tNCBITaxon:9606\t\tTBX4\n" + assert lines[1] == "HGNC:11603\tbiolink:Gene\t\t\t\t\t\t\t\t\t\tNCBITaxon:9606\t\t\t\tTBX4\t\n" assert len(lines) == 3 with open("{}/{}_edges.tsv".format(outdir, outfile), "r") as f: lines = f.readlines() assert ( lines[1].strip() - == "uuid:5b06e86f-d768-4cd9-ac27-abe31e95ab1e\tHGNC:11603\tbiolink:contributes_to\tMONDO:0005002\t\t\t0\t20" + == "uuid:5b06e86f-d768-4cd9-ac27-abe31e95ab1e\tHGNC:11603\tbiolink:contributes_to\tMONDO:0005002\t" + + "biolink:GeneToDiseaseAssociation\t\tnot_provided\t\t\t\t\t\t\t0\t\t\t\t20\t\tnot_provided" ) assert len(lines) == 2 diff --git a/tests/unit/test_tsvwriter_node_only.py b/tests/unit/test_tsvwriter_node_only.py index 0fa8eb8..cc40123 100644 --- a/tests/unit/test_tsvwriter_node_only.py +++ b/tests/unit/test_tsvwriter_node_only.py @@ -14,7 +14,25 @@ def test_tsv_writer(): ent = [g, d] - node_properties = ['id', 'category', 'symbol', 'in_taxon', 'provided_by', 'source'] + node_properties = [ + 'id', + 'category', + 'symbol', + 'in_taxon', + 'provided_by', + 'source', + 'has_biological_sequence', + 'iri', + 'type', + 'xref', + 'description', + 'synonym', + 'in_taxon_label', + 'deprecated', + 'full_name', + 'name', + 'has_attribute' + ] outdir = "output/tests" outfile = "tsvwriter-node-only" From 6c87e0fde72d16f6bb1dc7c191b8bd5c5bfa3c6f Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:43:01 -0600 Subject: [PATCH 05/12] Fix linting --- src/koza/io/writer/jsonl_writer.py | 13 +++++++++---- src/koza/io/writer/tsv_writer.py | 2 +- src/koza/io/writer/writer.py | 1 + tests/unit/test_tsvwriter_node_and_edge.py | 9 +++++---- tests/unit/test_tsvwriter_node_only.py | 2 +- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/koza/io/writer/jsonl_writer.py b/src/koza/io/writer/jsonl_writer.py index 71add14..9d1aa8b 100644 --- a/src/koza/io/writer/jsonl_writer.py +++ b/src/koza/io/writer/jsonl_writer.py @@ -1,15 +1,20 @@ import json import os -from typing import Iterable, List, Optional +from typing import List, Optional -from koza.converter.kgx_converter import KGXConverter from koza.io.writer.writer import KozaWriter from koza.model.config.sssom_config import SSSOMConfig class JSONLWriter(KozaWriter): - def __init__(self, output_dir: str, source_name: str, node_properties: List[str], - edge_properties: Optional[List[str]] = None, sssom_config: SSSOMConfig = None): + def __init__( + self, + output_dir: str, + source_name: str, + node_properties: List[str], + edge_properties: Optional[List[str]] = None, + sssom_config: SSSOMConfig = None, + ): super().__init__(output_dir, source_name, node_properties, edge_properties, sssom_config) os.makedirs(output_dir, exist_ok=True) diff --git a/src/koza/io/writer/tsv_writer.py b/src/koza/io/writer/tsv_writer.py index 5d266f5..45019d3 100644 --- a/src/koza/io/writer/tsv_writer.py +++ b/src/koza/io/writer/tsv_writer.py @@ -2,7 +2,7 @@ # NOTE - May want to rename to KGXWriter at some point, if we develop writers for other models non biolink/kgx specific from pathlib import Path -from typing import Dict, Iterable, List, Literal, Set, Tuple, Union +from typing import Dict, List, Literal, Set, Tuple, Union from numpy.f2py.auxfuncs import throw_error from ordered_set import OrderedSet diff --git a/src/koza/io/writer/writer.py b/src/koza/io/writer/writer.py index b965930..abd638f 100644 --- a/src/koza/io/writer/writer.py +++ b/src/koza/io/writer/writer.py @@ -11,6 +11,7 @@ class KozaWriter(ABC): """ An abstract base class for all koza writers """ + def __init__( self, output_dir: Union[str, Path], diff --git a/tests/unit/test_tsvwriter_node_and_edge.py b/tests/unit/test_tsvwriter_node_and_edge.py index 70be079..2454d47 100644 --- a/tests/unit/test_tsvwriter_node_and_edge.py +++ b/tests/unit/test_tsvwriter_node_and_edge.py @@ -39,7 +39,8 @@ def test_tsv_writer(): 'deprecated', 'has_attribute', 'full_name', - 'iri', 'name', + 'iri', + 'name', ] edge_properties = [ "id", @@ -90,7 +91,7 @@ def test_tsv_writer(): 'primary_knowledge_source', 'object_aspect_qualifier', 'object_closure', - 'subject_aspect_qualifier' + 'subject_aspect_qualifier', ] outdir = "output/tests" @@ -115,7 +116,7 @@ def test_tsv_writer(): lines = f.readlines() assert ( lines[1].strip() - == "uuid:5b06e86f-d768-4cd9-ac27-abe31e95ab1e\tHGNC:11603\tbiolink:contributes_to\tMONDO:0005002\t" + - "biolink:GeneToDiseaseAssociation\t\tnot_provided\t\t\t\t\t\t\t0\t\t\t\t20\t\tnot_provided" + == "uuid:5b06e86f-d768-4cd9-ac27-abe31e95ab1e\tHGNC:11603\tbiolink:contributes_to\tMONDO:0005002\t" + + "biolink:GeneToDiseaseAssociation\t\tnot_provided\t\t\t\t\t\t\t0\t\t\t\t20\t\tnot_provided" ) assert len(lines) == 2 diff --git a/tests/unit/test_tsvwriter_node_only.py b/tests/unit/test_tsvwriter_node_only.py index cc40123..ce9849e 100644 --- a/tests/unit/test_tsvwriter_node_only.py +++ b/tests/unit/test_tsvwriter_node_only.py @@ -31,7 +31,7 @@ def test_tsv_writer(): 'deprecated', 'full_name', 'name', - 'has_attribute' + 'has_attribute', ] outdir = "output/tests" From 979e25bd6cf3a89ddfcbd3988eaf2abfaf17740b Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:07:07 -0600 Subject: [PATCH 06/12] Add tests for extra params --- src/koza/io/writer/writer.py | 4 +- ...st_tsvwriter_node_and_edge_extra_params.py | 190 ++++++++++++++++++ .../test_tsvwriter_node_only_extra_params.py | 44 ++++ 3 files changed, 235 insertions(+), 3 deletions(-) create mode 100644 tests/unit/test_tsvwriter_node_and_edge_extra_params.py create mode 100644 tests/unit/test_tsvwriter_node_only_extra_params.py diff --git a/src/koza/io/writer/writer.py b/src/koza/io/writer/writer.py index abd638f..7ef5063 100644 --- a/src/koza/io/writer/writer.py +++ b/src/koza/io/writer/writer.py @@ -52,9 +52,7 @@ def check_extra_fields(row_keys: Tuple, columns: Tuple) -> None: extra_fields = not set(row_keys).issubset(set(columns)) if extra_fields: - raise ValueError(f"Extra fields found in row: {set(row_keys) - set(columns)}") - - pass + raise ValueError(f"Extra fields found in row: {sorted(set(row_keys) - set(columns))}") @abstractmethod def write_edge(self, edge: dict): diff --git a/tests/unit/test_tsvwriter_node_and_edge_extra_params.py b/tests/unit/test_tsvwriter_node_and_edge_extra_params.py new file mode 100644 index 0000000..81eecd9 --- /dev/null +++ b/tests/unit/test_tsvwriter_node_and_edge_extra_params.py @@ -0,0 +1,190 @@ +import re + +import pytest +from biolink_model.datamodel.pydanticmodel_v2 import Disease, Gene, GeneToDiseaseAssociation + +from koza.io.writer.tsv_writer import TSVWriter + + +def test_tsv_writer_extra_node_params(): + """ + Writes a test tsv file + """ + g = Gene(id="HGNC:11603", in_taxon=["NCBITaxon:9606"], symbol="TBX4") + d = Disease(id="MONDO:0005002", name="chronic obstructive pulmonary disease") + a = GeneToDiseaseAssociation( + id="uuid:5b06e86f-d768-4cd9-ac27-abe31e95ab1e", + subject=g.id, + object=d.id, + predicate="biolink:contributes_to", + knowledge_level="not_provided", + agent_type="not_provided", + has_count=0, + has_total=20, + ) + ent = [g, d, a] + + node_properties = [ + "id", + "category", + "symbol", + "in_taxon", + "provided_by", + "source", + 'has_biological_sequence', + 'type', + 'xref', + 'description', + 'in_taxon_label', + 'synonym', + 'iri', + 'full_name', + ] + edge_properties = [ + "id", + "subject", + "predicate", + "object", + "category" "qualifiers", + "has_count", + "has_total", + "publications", + "provided_by", + 'subject_category', + 'object_direction_qualifier', + 'sex_qualifier', + 'negated', + 'has_percentage', + 'aggregator_knowledge_source', + 'has_evidence', + 'qualified_predicate', + 'qualifiers', + 'object_category', + 'timepoint', + 'subject_label_closure', + 'agent_type', + 'has_attribute', + 'category', + 'original_predicate', + 'iri', + 'frequency_qualifier', + 'type', + 'subject_namespace', + 'subject_closure', + 'object_label_closure', + 'object_namespace', + 'original_object', + 'subject_category_closure', + 'name', + 'has_quotient', + 'knowledge_level', + 'knowledge_source', + 'description', + 'subject_direction_qualifier', + 'deprecated', + 'original_subject', + 'object_category_closure', + ] + + outdir = "output/tests" + outfile = "tsvwriter-node-and-edge" + + t = TSVWriter(outdir, outfile, node_properties, edge_properties) + expected_message = "Extra fields found in row: ['deprecated', 'has_attribute', 'name']" + with pytest.raises(ValueError, match=re.escape(expected_message)): + t.write(ent) + + +def test_tsv_writer_extra_edge_params(): + """ + Writes a test tsv file + """ + g = Gene(id="HGNC:11603", in_taxon=["NCBITaxon:9606"], symbol="TBX4") + d = Disease(id="MONDO:0005002", name="chronic obstructive pulmonary disease") + a = GeneToDiseaseAssociation( + id="uuid:5b06e86f-d768-4cd9-ac27-abe31e95ab1e", + subject=g.id, + object=d.id, + predicate="biolink:contributes_to", + knowledge_level="not_provided", + agent_type="not_provided", + has_count=0, + has_total=20, + ) + ent = [g, d, a] + + node_properties = [ + "id", + "category", + "symbol", + "in_taxon", + "provided_by", + "source", + 'has_biological_sequence', + 'type', + 'xref', + 'description', + 'in_taxon_label', + 'synonym', + 'iri', + 'full_name', + 'deprecated', + 'has_attribute', + 'name', + ] + edge_properties = [ + "id", + "subject", + "predicate", + "object", + "category" "qualifiers", + "has_count", + "has_total", + "publications", + "provided_by", + 'subject_category', + 'object_direction_qualifier', + 'sex_qualifier', + 'negated', + 'has_percentage', + 'aggregator_knowledge_source', + 'has_evidence', + 'qualified_predicate', + 'qualifiers', + 'object_category', + 'timepoint', + 'subject_label_closure', + 'agent_type', + 'has_attribute', + 'category', + 'original_predicate', + 'iri', + 'frequency_qualifier', + 'type', + 'subject_namespace', + 'subject_closure', + 'object_label_closure', + 'object_namespace', + 'original_object', + 'subject_category_closure', + 'name', + 'has_quotient', + 'knowledge_level', + 'knowledge_source', + 'description', + 'subject_direction_qualifier', + 'deprecated', + 'original_subject', + 'object_category_closure', + 'object_aspect_qualifier', + 'object_closure', + 'primary_knowledge_source', + ] + + outdir = "output/tests" + outfile = "tsvwriter-node-and-edge" + + t = TSVWriter(outdir, outfile, node_properties, edge_properties) + expected_message = "Extra fields found in row: ['qualifier', 'retrieval_source_ids', 'subject_aspect_qualifier']" + with pytest.raises(ValueError, match=re.escape(expected_message)): + t.write(ent) diff --git a/tests/unit/test_tsvwriter_node_only_extra_params.py b/tests/unit/test_tsvwriter_node_only_extra_params.py new file mode 100644 index 0000000..77a703c --- /dev/null +++ b/tests/unit/test_tsvwriter_node_only_extra_params.py @@ -0,0 +1,44 @@ +import re + +import pytest +from biolink_model.datamodel.pydanticmodel_v2 import Disease, Gene + +from koza.io.writer.tsv_writer import TSVWriter + + +def test_tsv_writer(): + """ + Writes a test tsv file + """ + g = Gene(id="HGNC:11603", name="TBX4") + d = Disease(id="MONDO:0005002", name="chronic obstructive pulmonary disease") + + ent = [g, d] + + node_properties = [ + 'id', + 'category', + 'symbol', + 'in_taxon', + 'provided_by', + 'source', + 'has_biological_sequence', + 'iri', + 'type', + 'xref', + 'description', + 'synonym', + 'in_taxon_label', + 'deprecated', + 'full_name', + ] + + outdir = "output/tests" + outfile = "tsvwriter-node-only" + + t = TSVWriter(outdir, outfile, node_properties) + + t = TSVWriter(outdir, outfile, node_properties) + expected_message = "Extra fields found in row: ['has_attribute', 'name']" + with pytest.raises(ValueError, match=re.escape(expected_message)): + t.write(ent) From 5a76b6679c0dd30ae7137fb9e8a8e5b6d7e8db75 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Fri, 8 Nov 2024 12:40:18 -0600 Subject: [PATCH 07/12] Simplify init and remove unnecessary code --- src/koza/io/writer/jsonl_writer.py | 27 ++++----- src/koza/io/writer/tsv_writer.py | 91 ++++++++++++------------------ src/koza/io/writer/writer.py | 22 ++++++-- 3 files changed, 62 insertions(+), 78 deletions(-) diff --git a/src/koza/io/writer/jsonl_writer.py b/src/koza/io/writer/jsonl_writer.py index 9d1aa8b..00b64bc 100644 --- a/src/koza/io/writer/jsonl_writer.py +++ b/src/koza/io/writer/jsonl_writer.py @@ -1,27 +1,22 @@ import json import os -from typing import List, Optional +from typing import List, Optional, TextIO from koza.io.writer.writer import KozaWriter -from koza.model.config.sssom_config import SSSOMConfig class JSONLWriter(KozaWriter): - def __init__( - self, - output_dir: str, - source_name: str, - node_properties: List[str], - edge_properties: Optional[List[str]] = None, - sssom_config: SSSOMConfig = None, - ): - super().__init__(output_dir, source_name, node_properties, edge_properties, sssom_config) + node_properties: List[str] + edge_properties: List[str] + nodeFH: Optional[TextIO] + edgeFH: Optional[TextIO] - os.makedirs(output_dir, exist_ok=True) - if node_properties: - self.nodeFH = open(f"{output_dir}/{source_name}_nodes.jsonl", "w") - if edge_properties: - self.edgeFH = open(f"{output_dir}/{source_name}_edges.jsonl", "w") + def init(self): + os.makedirs(self.output_dir, exist_ok=True) + if self.node_properties: + self.nodeFH = open(f"{self.output_dir}/{self.source_name}_nodes.jsonl", "w") + if self.edge_properties: + self.edgeFH = open(f"{self.output_dir}/{self.source_name}_edges.jsonl", "w") def write_edge(self, edge: dict): edge = json.dumps(edge, ensure_ascii=False) diff --git a/src/koza/io/writer/tsv_writer.py b/src/koza/io/writer/tsv_writer.py index 45019d3..b6a94e9 100644 --- a/src/koza/io/writer/tsv_writer.py +++ b/src/koza/io/writer/tsv_writer.py @@ -1,79 +1,71 @@ #### TSV Writer #### -# NOTE - May want to rename to KGXWriter at some point, if we develop writers for other models non biolink/kgx specific from pathlib import Path -from typing import Dict, List, Literal, Set, Tuple, Union +from typing import Dict, List, Literal, Set, TextIO -from numpy.f2py.auxfuncs import throw_error from ordered_set import OrderedSet -# from koza.converter.kgx_converter import KGXConverter from koza.io.utils import build_export_row from koza.io.writer.writer import KozaWriter -from koza.model.config.sssom_config import SSSOMConfig class TSVWriter(KozaWriter): - def __init__( - self, - output_dir: Union[str, Path], - source_name: str, - node_properties: List[str] = None, - edge_properties: List[str] = None, - sssom_config: SSSOMConfig = None, - ): - super().__init__(output_dir, source_name, node_properties, edge_properties, sssom_config) - self.delimiter = "\t" - self.list_delimiter = "|" + delimiter: str = "\t" + list_delimiter: str = "|" + nodes_file_name: Path + edges_file_name: Path + + nodeFH: TextIO + edgeFH: TextIO + + def init(self): Path(self.output_dir).mkdir(parents=True, exist_ok=True) - if node_properties: # Make node file - self.node_columns = TSVWriter._order_columns(set(node_properties), "node") + if self.node_properties: # Make node file + self.node_properties = TSVWriter._order_columns(set(self.node_properties), "node") self.nodes_file_name = Path(self.output_dir if self.output_dir else "", f"{self.source_name}_nodes.tsv") self.nodeFH = open(self.nodes_file_name, "w") - self.nodeFH.write(self.delimiter.join(self.node_columns) + "\n") + self.nodeFH.write(self.delimiter.join(self.node_properties) + "\n") - if edge_properties: # Make edge file - if sssom_config: - edge_properties = self.add_sssom_columns(edge_properties) - self.edge_columns = TSVWriter._order_columns(set(edge_properties), "edge") + if self.edge_properties: # Make edge file + if self.sssom_config: + self.edge_properties = self.add_sssom_columns(self.edge_properties) + self.edge_properties = TSVWriter._order_columns(set(self.edge_properties), "edge") self.edges_file_name = Path(self.output_dir if self.output_dir else "", f"{self.source_name}_edges.tsv") self.edgeFH = open(self.edges_file_name, "w") - self.edgeFH.write(self.delimiter.join(self.edge_columns) + "\n") + self.edgeFH.write(self.delimiter.join(self.edge_properties) + "\n") def write_edge(self, edge: dict): - self.write_row(edge, record_type="edge") + """Write an edge to the underlying store. - def write_node(self, node: dict): - self.write_row(node, record_type="node") + Args: + edge: dict - An edge record + """ + row = build_export_row(edge, list_delimiter=self.list_delimiter) + values = self.get_columns(row, self.edge_properties) + self.edgeFH.write(self.delimiter.join(values) + "\n") - def write_row(self, record: Dict, record_type: Literal["node", "edge"]) -> None: - """Write a row to the underlying store. + def write_node(self, node: dict): + """Write a node to the underlying store. Args: - record: Dict - A node or edge record - record_type: Literal["node", "edge"] - The record_type of record + node: dict - A node record """ - fh = self.nodeFH if record_type == "node" else self.edgeFH - columns = self.node_columns if record_type == "node" else self.edge_columns - row = build_export_row(record, list_delimiter=self.list_delimiter) - - # Throw error if the record has extra columns - columns_tuple = tuple(columns) - row_keys_tuple = tuple(row.keys()) - if self.has_extra_columns(row_keys_tuple, columns_tuple): - throw_error(f"Record has extra columns: {set(row.keys()) - set(columns)} not defined in {record_type}") + row = build_export_row(node, list_delimiter=self.list_delimiter) + row["id"] = node["id"] + values = self.get_columns(row, self.node_properties) + self.nodeFH.write(self.delimiter.join(values) + "\n") + @staticmethod + def get_columns(row: Dict, columns) -> List[str]: values = [] - if record_type == "node": - row["id"] = record["id"] for c in columns: if c in row: values.append(str(row[c])) else: values.append("") - fh.write(self.delimiter.join(values) + "\n") + return values def finalize(self): """Close file handles.""" @@ -83,19 +75,6 @@ def finalize(self): if hasattr(self, "edgeFH"): self.edgeFH.close() - @staticmethod - def has_extra_columns(row_keys: Tuple[str, ...], columns_tuple: Tuple[str, ...]) -> bool: - """Check if a row has extra columns. - - Args: - row_keys: Tuple[str, ...] - A tuple of row keys - columns_tuple: Tuple[str, ...] - A tuple of columns - - Returns: - bool - True if row has extra columns, False otherwise - """ - return not set(row_keys).issubset(set(columns_tuple)) - @staticmethod def _order_columns(cols: Set, record_type: Literal["node", "edge"]) -> OrderedSet: """Arrange node or edge columns in a defined order. diff --git a/src/koza/io/writer/writer.py b/src/koza/io/writer/writer.py index 7ef5063..c8b4f82 100644 --- a/src/koza/io/writer/writer.py +++ b/src/koza/io/writer/writer.py @@ -19,28 +19,34 @@ def __init__( node_properties: List[str] = None, edge_properties: List[str] = None, sssom_config: SSSOMConfig = None, + skip_checks: bool = False, ): + """Do not override this method; implement `init` instead.""" self.output_dir = output_dir self.source_name = source_name - self.node_columns = node_properties - self.edge_columns = edge_properties + self.node_properties = node_properties + self.edge_properties = edge_properties self.sssom_config = sssom_config - + self.skip_checks = skip_checks self.converter = KGXConverter() - def write(self, entities: Iterable, skip_checks: bool = False): + self.init() + + def write(self, entities: Iterable): nodes, edges = self.converter.convert(entities) if nodes: for node in nodes: - self.check_extra_fields(tuple(node.keys()), tuple(self.node_columns)) + if not self.skip_checks: + self.check_extra_fields(tuple(node.keys()), tuple(self.node_properties)) self.write_node(node) if edges: for edge in edges: if self.sssom_config: edge = self.sssom_config.apply_mapping(edge) - self.check_extra_fields(tuple(edge.keys()), tuple(self.edge_columns)) + if not self.skip_checks: + self.check_extra_fields(tuple(edge.keys()), tuple(self.edge_properties)) self.write_edge(edge) @staticmethod @@ -54,6 +60,10 @@ def check_extra_fields(row_keys: Tuple, columns: Tuple) -> None: if extra_fields: raise ValueError(f"Extra fields found in row: {sorted(set(row_keys) - set(columns))}") + @abstractmethod + def init(self): + pass + @abstractmethod def write_edge(self, edge: dict): pass From d8fb3b47e87f2c58219994e67e1f1ce09fb42a2f Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Fri, 8 Nov 2024 12:54:12 -0600 Subject: [PATCH 08/12] Simplify FH properties --- src/koza/io/writer/jsonl_writer.py | 4 +--- src/koza/io/writer/tsv_writer.py | 17 +++++++---------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/koza/io/writer/jsonl_writer.py b/src/koza/io/writer/jsonl_writer.py index 00b64bc..292d48c 100644 --- a/src/koza/io/writer/jsonl_writer.py +++ b/src/koza/io/writer/jsonl_writer.py @@ -1,13 +1,11 @@ import json import os -from typing import List, Optional, TextIO +from typing import Optional, TextIO from koza.io.writer.writer import KozaWriter class JSONLWriter(KozaWriter): - node_properties: List[str] - edge_properties: List[str] nodeFH: Optional[TextIO] edgeFH: Optional[TextIO] diff --git a/src/koza/io/writer/tsv_writer.py b/src/koza/io/writer/tsv_writer.py index b6a94e9..39708c1 100644 --- a/src/koza/io/writer/tsv_writer.py +++ b/src/koza/io/writer/tsv_writer.py @@ -1,7 +1,7 @@ #### TSV Writer #### from pathlib import Path -from typing import Dict, List, Literal, Set, TextIO +from typing import Dict, List, Literal, Optional, Set, TextIO from ordered_set import OrderedSet @@ -13,27 +13,24 @@ class TSVWriter(KozaWriter): delimiter: str = "\t" list_delimiter: str = "|" - nodes_file_name: Path - edges_file_name: Path - - nodeFH: TextIO - edgeFH: TextIO + nodeFH: Optional[TextIO] + edgeFH: Optional[TextIO] def init(self): Path(self.output_dir).mkdir(parents=True, exist_ok=True) if self.node_properties: # Make node file self.node_properties = TSVWriter._order_columns(set(self.node_properties), "node") - self.nodes_file_name = Path(self.output_dir if self.output_dir else "", f"{self.source_name}_nodes.tsv") - self.nodeFH = open(self.nodes_file_name, "w") + nodes_file_name = Path(self.output_dir if self.output_dir else "", f"{self.source_name}_nodes.tsv") + self.nodeFH = open(nodes_file_name, "w") self.nodeFH.write(self.delimiter.join(self.node_properties) + "\n") if self.edge_properties: # Make edge file if self.sssom_config: self.edge_properties = self.add_sssom_columns(self.edge_properties) self.edge_properties = TSVWriter._order_columns(set(self.edge_properties), "edge") - self.edges_file_name = Path(self.output_dir if self.output_dir else "", f"{self.source_name}_edges.tsv") - self.edgeFH = open(self.edges_file_name, "w") + edges_file_name = Path(self.output_dir if self.output_dir else "", f"{self.source_name}_edges.tsv") + self.edgeFH = open(edges_file_name, "w") self.edgeFH.write(self.delimiter.join(self.edge_properties) + "\n") def write_edge(self, edge: dict): From 02da77170f1927ae9f4b43ec7c23c4f9e6001274 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Fri, 8 Nov 2024 13:16:08 -0600 Subject: [PATCH 09/12] Add kwargs pass through in init if child classes need special args --- src/koza/io/writer/writer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/koza/io/writer/writer.py b/src/koza/io/writer/writer.py index c8b4f82..e9e9a70 100644 --- a/src/koza/io/writer/writer.py +++ b/src/koza/io/writer/writer.py @@ -20,6 +20,7 @@ def __init__( edge_properties: List[str] = None, sssom_config: SSSOMConfig = None, skip_checks: bool = False, + kwargs: dict = None, ): """Do not override this method; implement `init` instead.""" self.output_dir = output_dir @@ -30,7 +31,8 @@ def __init__( self.skip_checks = skip_checks self.converter = KGXConverter() - self.init() + kwargs = kwargs or {} + self.init(**kwargs) def write(self, entities: Iterable): nodes, edges = self.converter.convert(entities) @@ -61,7 +63,7 @@ def check_extra_fields(row_keys: Tuple, columns: Tuple) -> None: raise ValueError(f"Extra fields found in row: {sorted(set(row_keys) - set(columns))}") @abstractmethod - def init(self): + def init(self, **kwargs): pass @abstractmethod From 9b376a20be1aefa1d847f8006e3443b4ae69044c Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Mon, 11 Nov 2024 16:29:07 -0600 Subject: [PATCH 10/12] Make extra field check optional and improve logic --- src/koza/io/writer/writer.py | 8 ++++---- tests/unit/test_tsvwriter_node_and_edge.py | 2 +- tests/unit/test_tsvwriter_node_and_edge_extra_params.py | 4 ++-- tests/unit/test_tsvwriter_node_only.py | 2 +- tests/unit/test_tsvwriter_node_only_extra_params.py | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/koza/io/writer/writer.py b/src/koza/io/writer/writer.py index e9e9a70..fc667ae 100644 --- a/src/koza/io/writer/writer.py +++ b/src/koza/io/writer/writer.py @@ -19,7 +19,7 @@ def __init__( node_properties: List[str] = None, edge_properties: List[str] = None, sssom_config: SSSOMConfig = None, - skip_checks: bool = False, + check_fields: bool = False, kwargs: dict = None, ): """Do not override this method; implement `init` instead.""" @@ -28,7 +28,7 @@ def __init__( self.node_properties = node_properties self.edge_properties = edge_properties self.sssom_config = sssom_config - self.skip_checks = skip_checks + self.check_fields = check_fields self.converter = KGXConverter() kwargs = kwargs or {} @@ -39,7 +39,7 @@ def write(self, entities: Iterable): if nodes: for node in nodes: - if not self.skip_checks: + if self.check_fields: self.check_extra_fields(tuple(node.keys()), tuple(self.node_properties)) self.write_node(node) @@ -47,7 +47,7 @@ def write(self, entities: Iterable): for edge in edges: if self.sssom_config: edge = self.sssom_config.apply_mapping(edge) - if not self.skip_checks: + if self.check_fields: self.check_extra_fields(tuple(edge.keys()), tuple(self.edge_properties)) self.write_edge(edge) diff --git a/tests/unit/test_tsvwriter_node_and_edge.py b/tests/unit/test_tsvwriter_node_and_edge.py index 2454d47..98d0e22 100644 --- a/tests/unit/test_tsvwriter_node_and_edge.py +++ b/tests/unit/test_tsvwriter_node_and_edge.py @@ -97,7 +97,7 @@ def test_tsv_writer(): outdir = "output/tests" outfile = "tsvwriter-node-and-edge" - t = TSVWriter(outdir, outfile, node_properties, edge_properties) + t = TSVWriter(outdir, outfile, node_properties, edge_properties, check_fields=True) t.write(ent) t.finalize() diff --git a/tests/unit/test_tsvwriter_node_and_edge_extra_params.py b/tests/unit/test_tsvwriter_node_and_edge_extra_params.py index 81eecd9..a4429d7 100644 --- a/tests/unit/test_tsvwriter_node_and_edge_extra_params.py +++ b/tests/unit/test_tsvwriter_node_and_edge_extra_params.py @@ -89,7 +89,7 @@ def test_tsv_writer_extra_node_params(): outdir = "output/tests" outfile = "tsvwriter-node-and-edge" - t = TSVWriter(outdir, outfile, node_properties, edge_properties) + t = TSVWriter(outdir, outfile, node_properties, edge_properties, check_fields=True) expected_message = "Extra fields found in row: ['deprecated', 'has_attribute', 'name']" with pytest.raises(ValueError, match=re.escape(expected_message)): t.write(ent) @@ -184,7 +184,7 @@ def test_tsv_writer_extra_edge_params(): outdir = "output/tests" outfile = "tsvwriter-node-and-edge" - t = TSVWriter(outdir, outfile, node_properties, edge_properties) + t = TSVWriter(outdir, outfile, node_properties, edge_properties, check_fields=True) expected_message = "Extra fields found in row: ['qualifier', 'retrieval_source_ids', 'subject_aspect_qualifier']" with pytest.raises(ValueError, match=re.escape(expected_message)): t.write(ent) diff --git a/tests/unit/test_tsvwriter_node_only.py b/tests/unit/test_tsvwriter_node_only.py index ce9849e..8cf4ce0 100644 --- a/tests/unit/test_tsvwriter_node_only.py +++ b/tests/unit/test_tsvwriter_node_only.py @@ -37,7 +37,7 @@ def test_tsv_writer(): outdir = "output/tests" outfile = "tsvwriter-node-only" - t = TSVWriter(outdir, outfile, node_properties) + t = TSVWriter(outdir, outfile, node_properties, check_fields=True) t.write(ent) t.finalize() diff --git a/tests/unit/test_tsvwriter_node_only_extra_params.py b/tests/unit/test_tsvwriter_node_only_extra_params.py index 77a703c..394ab8c 100644 --- a/tests/unit/test_tsvwriter_node_only_extra_params.py +++ b/tests/unit/test_tsvwriter_node_only_extra_params.py @@ -38,7 +38,7 @@ def test_tsv_writer(): t = TSVWriter(outdir, outfile, node_properties) - t = TSVWriter(outdir, outfile, node_properties) + t = TSVWriter(outdir, outfile, node_properties, check_fields=True) expected_message = "Extra fields found in row: ['has_attribute', 'name']" with pytest.raises(ValueError, match=re.escape(expected_message)): t.write(ent) From 6f63afa1f785fd8a5d6ce2899e529e797584e395 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:55:02 -0600 Subject: [PATCH 11/12] Finish hooks for checking extra fields --- src/koza/app.py | 3 ++- src/koza/model/config/source_config.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/koza/app.py b/src/koza/app.py index b5236a4..410f549 100644 --- a/src/koza/app.py +++ b/src/koza/app.py @@ -170,13 +170,14 @@ def write(self, *entities): self.writer.write(entities) - def _get_writer(self) -> Union[TSVWriter, JSONLWriter]: + def _get_writer(self) -> KozaWriter: writer_params = [ self.output_dir, self.source.config.name, self.source.config.node_properties, self.source.config.edge_properties, self.source.config.sssom_config, + self.source.config.check_fields, ] if self.output_format == OutputFormat.tsv: return TSVWriter(*writer_params) diff --git a/src/koza/model/config/source_config.py b/src/koza/model/config/source_config.py index 28a304c..bf2e1fe 100644 --- a/src/koza/model/config/source_config.py +++ b/src/koza/model/config/source_config.py @@ -313,6 +313,7 @@ class PrimaryFileConfig(SourceConfig): # edge_report_columns: Optional[List[str]] = None depends_on: List[str] = field(default_factory=list) on_map_failure: MapErrorEnum = MapErrorEnum.warning + check_fields: bool = False @dataclass(config=PYDANTIC_CONFIG) From a8e486b9753e1258306aa56427927c6d0744238a Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Tue, 19 Nov 2024 13:54:31 -0600 Subject: [PATCH 12/12] Change KGXConvert to only pass set fields --- src/koza/converter/kgx_converter.py | 50 ++++++---- tests/unit/test_tsvwriter_node_and_edge.py | 55 +---------- ...st_tsvwriter_node_and_edge_extra_params.py | 97 +------------------ .../test_tsvwriter_node_only_extra_params.py | 11 +-- 4 files changed, 35 insertions(+), 178 deletions(-) diff --git a/src/koza/converter/kgx_converter.py b/src/koza/converter/kgx_converter.py index a2a50a7..3a37de8 100644 --- a/src/koza/converter/kgx_converter.py +++ b/src/koza/converter/kgx_converter.py @@ -1,7 +1,9 @@ from dataclasses import asdict -from typing import Iterable, Tuple +from typing import Any, Dict, Iterable, List, Tuple, Union + from pydantic import BaseModel +from biolink_model.datamodel.pydanticmodel_v2 import Association, BiologicalEntity, ChemicalEntity class KGXConverter: """ @@ -15,35 +17,41 @@ class KGXConverter: """ - def convert(self, entities: Iterable) -> Tuple[list, list]: - nodes = [] - edges = [] + def convert(self, entities: Iterable[Union[Association, BiologicalEntity, ChemicalEntity]]) \ + -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + nodes: List[Dict[str, Any]] = [] + edges: List[Dict[str, Any]] = [] for entity in entities: - # if entity has subject + object + predicate, treat as edge - if all(hasattr(entity, attr) for attr in ["subject", "object", "predicate"]): - edges.append(self.convert_association(entity)) - - # if entity has id and name, but not subject/object/predicate, treat as node - elif all(hasattr(entity, attr) for attr in ["id", "name"]) and not all( - hasattr(entity, attr) for attr in ["subject", "object", "predicate"] - ): + # edge entities are Associations + if isinstance(entity, Association): + edges.append(self.convert_edge(entity)) + + # node entities are BiologicalEntity or ChemicalEntity + elif isinstance(entity, (BiologicalEntity, ChemicalEntity)): nodes.append(self.convert_node(entity)) # otherwise, not a valid entity else: raise ValueError( - f"Cannot convert {entity}: Can only convert NamedThing or Association entities to KGX compatible dictionaries" + f"Cannot convert {entity}: Can only convert Association, BiologicalEntity, or ChemicalEntity to KGX compatible dictionaries" ) return nodes, edges - def convert_node(self, node) -> dict: - if isinstance(node, BaseModel): - return dict(node) - return asdict(node) + def convert_node(self, node: Union[BiologicalEntity, ChemicalEntity]) -> Dict[str, Any]: + node_set_fields = self.get_set_fields(node) + node_set_fields["description"] = node.description # description field is not explicitly set? + return node_set_fields + + def convert_edge(self, association: Association) -> Dict[str, Any]: + edge_set_fields = self.get_set_fields(association) + return edge_set_fields + + @staticmethod + def get_set_fields(entity: BaseModel) -> Dict[str, Any]: + fields_set_keys = entity.model_fields_set + entity_set_fields = {key: getattr(entity, key) for key in fields_set_keys} + entity_set_fields["category"] = entity.category # category field is not explicitly set? + return entity_set_fields - def convert_association(self, association) -> dict: - if isinstance(association, BaseModel): - return dict(association) - return asdict(association) diff --git a/tests/unit/test_tsvwriter_node_and_edge.py b/tests/unit/test_tsvwriter_node_and_edge.py index 98d0e22..c39ac23 100644 --- a/tests/unit/test_tsvwriter_node_and_edge.py +++ b/tests/unit/test_tsvwriter_node_and_edge.py @@ -21,6 +21,7 @@ def test_tsv_writer(): has_count=0, has_total=20, ) + ent = [g, d, a] node_properties = [ @@ -28,18 +29,7 @@ def test_tsv_writer(): "category", "symbol", "in_taxon", - "provided_by", - "source", - 'has_biological_sequence', - 'type', - 'xref', 'description', - 'in_taxon_label', - 'synonym', - 'deprecated', - 'has_attribute', - 'full_name', - 'iri', 'name', ] edge_properties = [ @@ -50,48 +40,9 @@ def test_tsv_writer(): "category" "qualifiers", "has_count", "has_total", - "publications", - "provided_by", - 'subject_category', - 'object_direction_qualifier', - 'sex_qualifier', - 'negated', - 'has_percentage', - 'aggregator_knowledge_source', - 'has_evidence', - 'qualified_predicate', - 'qualifiers', - 'object_category', - 'timepoint', - 'subject_label_closure', 'agent_type', - 'has_attribute', 'category', - 'original_predicate', - 'iri', - 'frequency_qualifier', - 'type', - 'subject_namespace', - 'subject_closure', - 'object_label_closure', - 'object_namespace', - 'original_object', - 'subject_category_closure', - 'name', - 'has_quotient', 'knowledge_level', - 'knowledge_source', - 'description', - 'subject_direction_qualifier', - 'deprecated', - 'original_subject', - 'object_category_closure', - 'qualifier', - 'retrieval_source_ids', - 'primary_knowledge_source', - 'object_aspect_qualifier', - 'object_closure', - 'subject_aspect_qualifier', ] outdir = "output/tests" @@ -109,7 +60,7 @@ def test_tsv_writer(): with open("{}/{}_nodes.tsv".format(outdir, outfile), "r") as f: lines = f.readlines() # assert lines[1] == "HGNC:11603\tbiolink:Gene\t\tNCBITaxon:9606\t\tTBX4\n" - assert lines[1] == "HGNC:11603\tbiolink:Gene\t\t\t\t\t\t\t\t\t\tNCBITaxon:9606\t\t\t\tTBX4\t\n" + assert lines[1] == "HGNC:11603\tbiolink:Gene\t\t\tNCBITaxon:9606\tTBX4\n" assert len(lines) == 3 with open("{}/{}_edges.tsv".format(outdir, outfile), "r") as f: @@ -117,6 +68,6 @@ def test_tsv_writer(): assert ( lines[1].strip() == "uuid:5b06e86f-d768-4cd9-ac27-abe31e95ab1e\tHGNC:11603\tbiolink:contributes_to\tMONDO:0005002\t" - + "biolink:GeneToDiseaseAssociation\t\tnot_provided\t\t\t\t\t\t\t0\t\t\t\t20\t\tnot_provided" + + "biolink:GeneToDiseaseAssociation\tnot_provided\t\t0\t20\tnot_provided" ) assert len(lines) == 2 diff --git a/tests/unit/test_tsvwriter_node_and_edge_extra_params.py b/tests/unit/test_tsvwriter_node_and_edge_extra_params.py index a4429d7..6dd7541 100644 --- a/tests/unit/test_tsvwriter_node_and_edge_extra_params.py +++ b/tests/unit/test_tsvwriter_node_and_edge_extra_params.py @@ -28,17 +28,7 @@ def test_tsv_writer_extra_node_params(): "id", "category", "symbol", - "in_taxon", - "provided_by", - "source", - 'has_biological_sequence', - 'type', - 'xref', 'description', - 'in_taxon_label', - 'synonym', - 'iri', - 'full_name', ] edge_properties = [ "id", @@ -48,49 +38,16 @@ def test_tsv_writer_extra_node_params(): "category" "qualifiers", "has_count", "has_total", - "publications", - "provided_by", - 'subject_category', - 'object_direction_qualifier', - 'sex_qualifier', - 'negated', - 'has_percentage', - 'aggregator_knowledge_source', - 'has_evidence', - 'qualified_predicate', - 'qualifiers', - 'object_category', - 'timepoint', - 'subject_label_closure', 'agent_type', - 'has_attribute', 'category', - 'original_predicate', - 'iri', - 'frequency_qualifier', - 'type', - 'subject_namespace', - 'subject_closure', - 'object_label_closure', - 'object_namespace', - 'original_object', - 'subject_category_closure', - 'name', - 'has_quotient', 'knowledge_level', - 'knowledge_source', - 'description', - 'subject_direction_qualifier', - 'deprecated', - 'original_subject', - 'object_category_closure', ] outdir = "output/tests" outfile = "tsvwriter-node-and-edge" t = TSVWriter(outdir, outfile, node_properties, edge_properties, check_fields=True) - expected_message = "Extra fields found in row: ['deprecated', 'has_attribute', 'name']" + expected_message = "Extra fields found in row: ['in_taxon']" with pytest.raises(ValueError, match=re.escape(expected_message)): t.write(ent) @@ -120,17 +77,6 @@ def test_tsv_writer_extra_edge_params(): "in_taxon", "provided_by", "source", - 'has_biological_sequence', - 'type', - 'xref', - 'description', - 'in_taxon_label', - 'synonym', - 'iri', - 'full_name', - 'deprecated', - 'has_attribute', - 'name', ] edge_properties = [ "id", @@ -140,51 +86,12 @@ def test_tsv_writer_extra_edge_params(): "category" "qualifiers", "has_count", "has_total", - "publications", - "provided_by", - 'subject_category', - 'object_direction_qualifier', - 'sex_qualifier', - 'negated', - 'has_percentage', - 'aggregator_knowledge_source', - 'has_evidence', - 'qualified_predicate', - 'qualifiers', - 'object_category', - 'timepoint', - 'subject_label_closure', - 'agent_type', - 'has_attribute', - 'category', - 'original_predicate', - 'iri', - 'frequency_qualifier', - 'type', - 'subject_namespace', - 'subject_closure', - 'object_label_closure', - 'object_namespace', - 'original_object', - 'subject_category_closure', - 'name', - 'has_quotient', - 'knowledge_level', - 'knowledge_source', - 'description', - 'subject_direction_qualifier', - 'deprecated', - 'original_subject', - 'object_category_closure', - 'object_aspect_qualifier', - 'object_closure', - 'primary_knowledge_source', ] outdir = "output/tests" outfile = "tsvwriter-node-and-edge" t = TSVWriter(outdir, outfile, node_properties, edge_properties, check_fields=True) - expected_message = "Extra fields found in row: ['qualifier', 'retrieval_source_ids', 'subject_aspect_qualifier']" + expected_message = "Extra fields found in row: ['description']" with pytest.raises(ValueError, match=re.escape(expected_message)): t.write(ent) diff --git a/tests/unit/test_tsvwriter_node_only_extra_params.py b/tests/unit/test_tsvwriter_node_only_extra_params.py index 394ab8c..25472ba 100644 --- a/tests/unit/test_tsvwriter_node_only_extra_params.py +++ b/tests/unit/test_tsvwriter_node_only_extra_params.py @@ -21,16 +21,7 @@ def test_tsv_writer(): 'symbol', 'in_taxon', 'provided_by', - 'source', - 'has_biological_sequence', - 'iri', - 'type', - 'xref', 'description', - 'synonym', - 'in_taxon_label', - 'deprecated', - 'full_name', ] outdir = "output/tests" @@ -39,6 +30,6 @@ def test_tsv_writer(): t = TSVWriter(outdir, outfile, node_properties) t = TSVWriter(outdir, outfile, node_properties, check_fields=True) - expected_message = "Extra fields found in row: ['has_attribute', 'name']" + expected_message = "Extra fields found in row: ['name']" with pytest.raises(ValueError, match=re.escape(expected_message)): t.write(ent)