Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Throw an error in writer if property isn't defined #154

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,17 @@ node_properties:
- 'id'
- 'category'
- 'provided_by'
- 'iri'
- 'name'
- 'synonym'
- 'has_attribute'
- 'deprecated'
- 'full_name'
- 'in_taxon'
- 'xref'
- 'in_taxon_label'
- 'description'
- 'type'

edge_properties:
- 'id'
Expand All @@ -46,4 +57,35 @@ edge_properties:
- 'object'
- 'category'
- 'relation'
- 'provided_by'
- 'provided_by'
- 'object_closure'
- 'negated'
- 'qualifier'
- 'name'
- 'deprecated'
- 'original_subject'
- 'has_evidence'
- 'description'
- 'subject_label_closure'
- 'aggregator_knowledge_source'
- 'has_attribute'
- 'type'
- 'timepoint'
- 'subject_category_closure'
- 'object_category'
- 'primary_knowledge_source'
- 'original_object'
- 'knowledge_source'
- 'iri'
- 'subject_namespace'
- 'subject_closure'
- 'object_namespace'
- 'object_category_closure'
- 'object_label_closure'
- 'agent_type'
- 'knowledge_level'
- 'publications'
- 'retrieval_source_ids'
- 'original_predicate'
- 'subject_category'
- 'qualifiers'
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,19 @@ node_properties:
- 'id'
- 'category'
- 'provided_by'
- 'deprecated'
- 'full_name'
- 'in_taxon_label'
- 'has_attribute'
- 'type'
- 'symbol'
- 'in_taxon'
- 'has_biological_sequence'
- 'xref'
- 'name'
- 'iri'
- 'synonym'
- 'description'

edge_properties:
- 'id'
Expand All @@ -43,4 +56,35 @@ edge_properties:
- 'object'
- 'category'
- 'relation'
- 'provided_by'
- 'provided_by'
- 'knowledge_level'
- 'type'
- 'has_attribute'
- 'original_subject'
- 'subject_category'
- 'object_closure'
- 'description'
- 'object_category_closure'
- 'subject_closure'
- 'original_predicate'
- 'has_evidence'
- 'object_category'
- 'subject_label_closure'
- 'iri'
- 'aggregator_knowledge_source'
- 'original_object'
- 'name'
- 'primary_knowledge_source'
- 'subject_namespace'
- 'subject_category_closure'
- 'deprecated'
- 'timepoint'
- 'qualifiers'
- 'agent_type'
- 'object_namespace'
- 'retrieval_source_ids'
- 'object_label_closure'
- 'publications'
- 'qualifier'
- 'knowledge_source'
- 'negated'
46 changes: 45 additions & 1 deletion examples/string-w-map/map-protein-links-detailed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,19 @@ node_properties:
- 'id'
- 'category'
- 'provided_by'
- 'deprecated'
- 'has_attribute'
- 'iri'
- 'in_taxon'
- 'xref'
- 'symbol'
- 'description'
- 'type'
- 'name'
- 'synonym'
- 'full_name'
- 'in_taxon_label'
- 'has_biological_sequence'

edge_properties:
- 'id'
Expand All @@ -43,4 +56,35 @@ edge_properties:
- 'object'
- 'category'
- 'relation'
- 'provided_by'
- 'provided_by'
- 'subject_closure'
- 'object_closure'
- 'name'
- 'subject_namespace'
- 'aggregator_knowledge_source'
- 'object_category'
- 'type'
- 'original_predicate'
- 'subject_label_closure'
- 'retrieval_source_ids'
- 'agent_type'
- 'primary_knowledge_source'
- 'iri'
- 'knowledge_source'
- 'qualifiers'
- 'timepoint'
- 'object_namespace'
- 'negated'
- 'object_category_closure'
- 'deprecated'
- 'original_object'
- 'original_subject'
- 'subject_category'
- 'has_attribute'
- 'publications'
- 'subject_category_closure'
- 'qualifier'
- 'object_label_closure'
- 'description'
- 'knowledge_level'
- 'has_evidence'
42 changes: 42 additions & 0 deletions examples/string/protein-links-detailed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,17 @@ node_properties:
- 'id'
- 'category'
- 'provided_by'
- 'iri'
- 'name'
- 'synonym'
- 'has_attribute'
- 'deprecated'
- 'full_name'
- 'in_taxon'
- 'xref'
- 'in_taxon_label'
- 'description'
- 'type'

edge_properties:
- 'id'
Expand All @@ -33,3 +44,34 @@ edge_properties:
- 'category'
- 'relation'
- 'provided_by'
- 'object_closure'
- 'negated'
- 'qualifier'
- 'name'
- 'deprecated'
- 'original_subject'
- 'has_evidence'
- 'description'
- 'subject_label_closure'
- 'aggregator_knowledge_source'
- 'has_attribute'
- 'type'
- 'timepoint'
- 'subject_category_closure'
- 'object_category'
- 'primary_knowledge_source'
- 'original_object'
- 'knowledge_source'
- 'iri'
- 'subject_namespace'
- 'subject_closure'
- 'object_namespace'
- 'object_category_closure'
- 'object_label_closure'
- 'agent_type'
- 'knowledge_level'
- 'publications'
- 'retrieval_source_ids'
- 'original_predicate'
- 'subject_category'
- 'qualifiers'
3 changes: 2 additions & 1 deletion src/koza/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,13 +170,14 @@ def write(self, *entities):

self.writer.write(entities)

def _get_writer(self) -> Union[TSVWriter, JSONLWriter]:
def _get_writer(self) -> KozaWriter:
writer_params = [
self.output_dir,
self.source.config.name,
self.source.config.node_properties,
self.source.config.edge_properties,
self.source.config.sssom_config,
self.source.config.check_fields,
]
if self.output_format == OutputFormat.tsv:
return TSVWriter(*writer_params)
Expand Down
50 changes: 29 additions & 21 deletions src/koza/converter/kgx_converter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from dataclasses import asdict
from typing import Iterable, Tuple
from typing import Any, Dict, Iterable, List, Tuple, Union

from pydantic import BaseModel

from biolink_model.datamodel.pydanticmodel_v2 import Association, BiologicalEntity, ChemicalEntity

class KGXConverter:
"""
Expand All @@ -15,35 +17,41 @@ class KGXConverter:

"""

def convert(self, entities: Iterable) -> Tuple[list, list]:
nodes = []
edges = []
def convert(self, entities: Iterable[Union[Association, BiologicalEntity, ChemicalEntity]]) \
-> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
nodes: List[Dict[str, Any]] = []
edges: List[Dict[str, Any]] = []

for entity in entities:
# if entity has subject + object + predicate, treat as edge
if all(hasattr(entity, attr) for attr in ["subject", "object", "predicate"]):
edges.append(self.convert_association(entity))

# if entity has id and name, but not subject/object/predicate, treat as node
elif all(hasattr(entity, attr) for attr in ["id", "name"]) and not all(
hasattr(entity, attr) for attr in ["subject", "object", "predicate"]
):
# edge entities are Associations
if isinstance(entity, Association):
edges.append(self.convert_edge(entity))

# node entities are BiologicalEntity or ChemicalEntity
elif isinstance(entity, (BiologicalEntity, ChemicalEntity)):
nodes.append(self.convert_node(entity))

# otherwise, not a valid entity
else:
raise ValueError(
f"Cannot convert {entity}: Can only convert NamedThing or Association entities to KGX compatible dictionaries"
f"Cannot convert {entity}: Can only convert Association, BiologicalEntity, or ChemicalEntity to KGX compatible dictionaries"
)

return nodes, edges

def convert_node(self, node) -> dict:
if isinstance(node, BaseModel):
return dict(node)
return asdict(node)
def convert_node(self, node: Union[BiologicalEntity, ChemicalEntity]) -> Dict[str, Any]:
    """Turn a node entity into a KGX-style dictionary.

    Starts from the dictionary produced by ``get_set_fields`` and then
    copies ``node.description`` into it unconditionally, so the
    ``description`` key is always present in the result.

    NOTE(review): the original author's comment questions whether
    ``description`` is reported as an explicitly set field; confirm whether
    the unconditional copy is still needed.
    """
    kgx_node = self.get_set_fields(node)
    kgx_node.update(description=node.description)  # description field is not explicitly set?
    return kgx_node

def convert_edge(self, association: Association) -> Dict[str, Any]:
    """Turn an association into a KGX-style edge dictionary.

    Delegates entirely to ``get_set_fields``; no edge-specific keys are
    added here.
    """
    return self.get_set_fields(association)

@staticmethod
def get_set_fields(entity: BaseModel) -> Dict[str, Any]:
    """Collect an entity's set fields, plus ``category``, into a dict.

    Every name in ``entity.model_fields_set`` is looked up on the entity
    with ``getattr``. ``entity.category`` is then written under the
    ``"category"`` key regardless of whether it was in that set.

    NOTE(review): the original author's comment questions whether
    ``category`` is reported as an explicitly set field; confirm whether
    the unconditional copy is still needed.
    """
    collected: Dict[str, Any] = {}
    for field_name in entity.model_fields_set:
        collected[field_name] = getattr(entity, field_name)
    collected["category"] = entity.category  # category field is not explicitly set?
    return collected

def convert_association(self, association) -> dict:
if isinstance(association, BaseModel):
return dict(association)
return asdict(association)
55 changes: 18 additions & 37 deletions src/koza/io/writer/jsonl_writer.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,28 @@
import json
import os
from typing import Iterable, List, Optional
from typing import Optional, TextIO

from koza.converter.kgx_converter import KGXConverter
from koza.io.writer.writer import KozaWriter
from koza.model.config.sssom_config import SSSOMConfig


class JSONLWriter(KozaWriter):
def __init__(
self,
output_dir: str,
source_name: str,
node_properties: List[str],
edge_properties: Optional[List[str]] = [],
sssom_config: SSSOMConfig = None,
):
self.output_dir = output_dir
self.source_name = source_name
self.sssom_config = sssom_config

self.converter = KGXConverter()

os.makedirs(output_dir, exist_ok=True)
if node_properties:
self.nodeFH = open(f"{output_dir}/{source_name}_nodes.jsonl", "w")
if edge_properties:
self.edgeFH = open(f"{output_dir}/{source_name}_edges.jsonl", "w")

def write(self, entities: Iterable):
(nodes, edges) = self.converter.convert(entities)

if nodes:
for n in nodes:
node = json.dumps(n, ensure_ascii=False)
self.nodeFH.write(node + '\n')

if edges:
for e in edges:
if self.sssom_config:
e = self.sssom_config.apply_mapping(e)
edge = json.dumps(e, ensure_ascii=False)
self.edgeFH.write(edge + '\n')
nodeFH: Optional[TextIO]
edgeFH: Optional[TextIO]

def init(self):
    """Create the output directory and open the JSONL output files.

    The node file ``<output_dir>/<source_name>_nodes.jsonl`` is opened only
    when ``self.node_properties`` is truthy, and the edge file
    ``<output_dir>/<source_name>_edges.jsonl`` only when
    ``self.edge_properties`` is truthy. When a property list is empty the
    corresponding ``nodeFH`` / ``edgeFH`` attribute is left unset (not set
    to ``None``), because ``finalize`` checks for the handles with
    ``hasattr``.

    Both files are opened explicitly as UTF-8. ``write_node`` and
    ``write_edge`` serialize with ``ensure_ascii=False``, so non-ASCII
    characters reach the file unescaped; relying on the platform default
    encoding could raise ``UnicodeEncodeError`` or produce non-UTF-8 output
    on systems whose locale encoding is not UTF-8.
    """
    os.makedirs(self.output_dir, exist_ok=True)
    if self.node_properties:
        self.nodeFH = open(
            f"{self.output_dir}/{self.source_name}_nodes.jsonl", "w", encoding="utf-8"
        )
    if self.edge_properties:
        self.edgeFH = open(
            f"{self.output_dir}/{self.source_name}_edges.jsonl", "w", encoding="utf-8"
        )

def write_edge(self, edge: dict):
    """Append ``edge`` to the edge file as one JSON line.

    The dictionary is serialized with ``ensure_ascii=False`` (non-ASCII
    characters are written unescaped) and written, followed by a newline,
    to the handle stored in ``self.edgeFH``.
    """
    serialized = json.dumps(edge, ensure_ascii=False)
    self.edgeFH.write(f"{serialized}\n")

def write_node(self, node: dict):
    """Append ``node`` to the node file as one JSON line.

    The dictionary is serialized with ``ensure_ascii=False`` (non-ASCII
    characters are written unescaped) and written, followed by a newline,
    to the handle stored in ``self.nodeFH``.
    """
    serialized = json.dumps(node, ensure_ascii=False)
    self.nodeFH.write(f"{serialized}\n")

def finalize(self):
if hasattr(self, 'nodeFH'):
Expand Down
Loading