From 4f5e1cb712e74440eaec2adbaadbd4d0c4d20c30 Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Mon, 18 Mar 2024 15:46:18 -0700 Subject: [PATCH] More efficient simple obo diffs (#719) * Adding a more efficient diff implementation for simpleobo. Undoing some of #605 * ruffruff * fixing tests --- src/oaklib/cli.py | 30 +- .../simpleobo/simple_obo_implementation.py | 170 ++++++++- src/oaklib/interfaces/differ_interface.py | 336 ++++++------------ src/oaklib/io/streaming_markdown_writer.py | 63 ++-- .../utilities/mapping/mapping_validation.py | 10 +- .../utilities/writers/change_handler.py | 5 +- tests/test_implementations/__init__.py | 30 +- tests/test_implementations/test_bioportal.py | 4 +- 8 files changed, 380 insertions(+), 268 deletions(-) diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py index 59bc01d5c..4da18d2ba 100644 --- a/src/oaklib/cli.py +++ b/src/oaklib/cli.py @@ -924,6 +924,12 @@ def chain_results(v): show_default=True, help="Merge all inputs specified using --add", ) +@click.option( + "--profile/--no-profile", + default=False, + show_default=True, + help="If set, will profile the command", +) def main( verbose: int, quiet: bool, @@ -941,6 +947,7 @@ def main( metamodel_mappings, requests_cache_db, prefix, + profile: bool, import_depth: Optional[int], **kwargs, ): @@ -968,6 +975,24 @@ def main( logger.setLevel(logging.WARNING) if quiet: logger.setLevel(logging.ERROR) + if profile: + import atexit + import cProfile + import io + import pstats + + print("Profiling...") + pr = cProfile.Profile() + pr.enable() + + def exit(): + pr.disable() + print("Profiling completed") + s = io.StringIO() + pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() + print(s.getvalue()) + + atexit.register(exit) if requests_cache_db: import requests_cache @@ -5839,9 +5864,8 @@ def diff( writer.emit(summary) else: if isinstance(writer, StreamingMarkdownWriter): - config.yield_individual_changes = False - for change in impl.diff(other_impl, configuration=config): - writer.emit(change, other_impl=other_impl) + for change_type, changes in impl.grouped_diff(other_impl, configuration=config): + writer.emit({change_type: changes}, other_impl=other_impl) else: for change in impl.diff(other_impl, configuration=config): writer.emit(change) diff --git a/src/oaklib/implementations/simpleobo/simple_obo_implementation.py b/src/oaklib/implementations/simpleobo/simple_obo_implementation.py index 5ed757deb..2807a9086 100644 --- a/src/oaklib/implementations/simpleobo/simple_obo_implementation.py +++ b/src/oaklib/implementations/simpleobo/simple_obo_implementation.py @@ -57,6 +57,7 @@ OWL_VERSION_IRI, RDFS_DOMAIN, RDFS_RANGE, + SCOPE_TO_SYNONYM_PRED_MAP, SEMAPV, SKOS_CLOSE_MATCH, SUBPROPERTY_OF, @@ -105,7 +106,7 @@ RELATIONSHIP, RELATIONSHIP_MAP, ) -from oaklib.interfaces.differ_interface import DifferInterface +from oaklib.interfaces.differ_interface import DiffConfiguration, DifferInterface from oaklib.interfaces.dumper_interface import DumperInterface from oaklib.interfaces.mapping_provider_interface import MappingProviderInterface from oaklib.interfaces.merge_interface import MergeInterface @@ -124,7 +125,7 @@ from oaklib.utilities.axioms.logical_definition_utilities import ( logical_definition_matches, ) -from oaklib.utilities.kgcl_utilities import tidy_change_object +from oaklib.utilities.kgcl_utilities import generate_change_id, tidy_change_object from oaklib.utilities.mapping.sssom_utils import inject_mapping_sources @@ -323,6 +324,13 @@ def subset_members(self, subset: SUBSET_CURIE) -> 
Iterable[CURIE]: if subset in s.simple_values(TAG_SUBSET): yield s.id + def terms_subsets(self, curies: Iterable[CURIE]) -> Iterable[Tuple[CURIE, SUBSET_CURIE]]: + for curie in curies: + s = self._stanza(curie, False) + if s: + for subset in s.simple_values(TAG_SUBSET): + yield curie, subset + def ontologies(self) -> Iterable[CURIE]: od = self.obo_document for v in od.header.simple_values(TAG_ONTOLOGY): @@ -761,9 +769,161 @@ def logical_definitions( # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - # Implements: PatcherInterface + # Implements: DifferInterface # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + def diff( + self, + other_ontology: DifferInterface, + configuration: DiffConfiguration = None, + **kwargs, + ) -> Iterator[kgcl.Change]: + if configuration is None: + configuration = DiffConfiguration() + if not isinstance(other_ontology, SimpleOboImplementation): + raise ValueError("Can only diff SimpleOboImplementation") + stanzas1 = self.obo_document.stanzas + stanzas2 = other_ontology.obo_document.stanzas + all_ids = set(stanzas1.keys()).union(stanzas2.keys()) + for id in all_ids: + yield from self._diff_stanzas(stanzas1.get(id, None), stanzas2.get(id, None)) + + def _diff_stanzas( + self, stanza1: Optional[Stanza], stanza2: Optional[Stanza] + ) -> Iterator[kgcl.Change]: + def _id(): + return generate_change_id() + + node_is_deleted = False + if stanza1 is None and stanza2 is None: + raise ValueError("Both stanzas are None") + if stanza1 is None: + stanza1 = Stanza(id=stanza2.id, type=stanza2.type) + if stanza2.type == "Term": + yield kgcl.ClassCreation(id=_id(), about_node=stanza2.id) + elif stanza2.type == "Typedef": + yield kgcl.NodeCreation(id=_id(), about_node=stanza2.id) + else: + raise ValueError(f"Unknown stanza type: {stanza2.type}") + if stanza2 is None: + stanza2 = Stanza(id=stanza1.id, type=stanza1.type) + if stanza1.type == "Term": + yield kgcl.NodeDeletion(id=_id(), about_node=stanza1.id) + else: + yield kgcl.NodeDeletion(id=_id(), about_node=stanza1.id) + node_is_deleted = True + if stanza1 == stanza2: + return + if stanza1.type != stanza2.type: + raise ValueError(f"Stanza types differ: {stanza1.type} vs {stanza2.type}") + t1id = stanza1.id + t2id = stanza2.id + logging.info(f"Diffing: {t1id} vs {t2id}") + + def _tv_dict(stanza: Stanza) -> Dict[str, List[str]]: + d = defaultdict(set) + for tv in stanza.tag_values: + d[tv.tag].add(tv.value) + return d + + tv_dict1 = _tv_dict(stanza1) + tv_dict2 = _tv_dict(stanza2) + all_tags = set(tv_dict1.keys()).union(tv_dict2.keys()) + for tag in all_tags: + vals1 = tv_dict1.get(tag, []) + vals2 = tv_dict2.get(tag, []) + vals1list = list(vals1) + vals2list = list(vals2) + tvs1 = [tv for tv in stanza1.tag_values if tv.tag == tag] + tvs2 = [tv for tv in stanza2.tag_values if tv.tag == tag] + if vals1 == vals2: + continue + logging.info(f"Difference in {tag}: {vals1} vs {vals2}") + if tag == TAG_NAME: + if node_is_deleted: + continue + if vals1 and vals2: + yield kgcl.NodeRename( + id=_id(), about_node=t1id, new_value=vals2list[0], old_value=vals1list[0] + ) + elif vals1: + yield kgcl.NodeDeletion(id=_id(), about_node=t1id) + else: + yield kgcl.ClassCreation(id=_id(), about_node=t2id, name=vals2list[0]) + elif tag == TAG_DEFINITION: + if node_is_deleted: + continue + # TODO: provenance changes + td1 = stanza1.quoted_value(TAG_DEFINITION) + td2 = stanza2.quoted_value(TAG_DEFINITION) + if vals1 and vals2: + yield kgcl.NodeTextDefinitionChange( + id=_id(), about_node=t1id, new_value=td2, old_value=td1 + ) + elif 
vals1: + yield kgcl.RemoveTextDefinition(id=_id(), about_node=t1id) + else: + yield kgcl.NewTextDefinition(id=_id(), about_node=t2id, new_value=td2) + elif tag == TAG_IS_OBSOLETE: + if node_is_deleted: + continue + if vals1 and not vals2: + yield kgcl.NodeUnobsoletion(id=_id(), about_node=t1id) + elif not vals1 and vals2: + replaced_by = stanza2.simple_values(TAG_REPLACED_BY) + if replaced_by: + yield kgcl.NodeObsoletionWithDirectReplacement( + id=_id(), about_node=t2id, has_direct_replacement=replaced_by[0] + ) + else: + yield kgcl.NodeObsoletion(id=_id(), about_node=t2id) + elif tag == TAG_SUBSET: + if node_is_deleted: + continue + subsets1 = stanza1.simple_values(TAG_SUBSET) + subsets2 = stanza2.simple_values(TAG_SUBSET) + for subset in subsets1: + if subset not in subsets2: + yield kgcl.RemoveNodeFromSubset(id=_id(), about_node=t1id, in_subset=subset) + for subset in subsets2: + if subset not in subsets1: + yield kgcl.AddNodeToSubset(id=_id(), about_node=t2id, in_subset=subset) + elif tag == TAG_IS_A: + isas1 = stanza1.simple_values(TAG_IS_A) + isas2 = stanza2.simple_values(TAG_IS_A) + for isa in isas1: + if isa not in isas2: + yield kgcl.EdgeDeletion(id=_id(), subject=t1id, predicate=IS_A, object=isa) + for isa in isas2: + if isa not in isas1: + yield kgcl.EdgeCreation(id=_id(), subject=t2id, predicate=IS_A, object=isa) + elif tag == TAG_RELATIONSHIP: + rels1 = stanza1.pair_values(TAG_RELATIONSHIP) + rels2 = stanza2.pair_values(TAG_RELATIONSHIP) + for p, v in rels1: + p_curie = self.map_shorthand_to_curie(p) + if (p, v) not in rels2: + yield kgcl.EdgeDeletion(id=_id(), subject=t1id, predicate=p_curie, object=v) + for p, v in rels2: + p_curie = self.map_shorthand_to_curie(p) + if (p, v) not in rels1: + yield kgcl.EdgeCreation(id=_id(), subject=t2id, predicate=p_curie, object=v) + elif tag == TAG_SYNONYM: + if node_is_deleted: + continue + # TODO: make this sensitive to annotation changes; for now we truncate the tuple + syns1 = [tv.as_synonym()[0:2] for tv in tvs1] + syns2 = [tv.as_synonym()[0:2] for tv in tvs2] + for syn in syns1: + if syn not in syns2: + yield kgcl.RemoveSynonym(id=_id(), about_node=t1id, old_value=syn[0]) + for syn in syns2: + if syn not in syns1: + pred = SCOPE_TO_SYNONYM_PRED_MAP[syn[1]] + yield kgcl.NewSynonym( + id=_id(), about_node=t2id, new_value=syn[0], predicate=pred + ) + def different_from(self, entity: CURIE, other_ontology: DifferInterface) -> bool: t1 = self._stanza(entity, strict=False) if t1: @@ -772,6 +932,10 @@ def different_from(self, entity: CURIE, other_ontology: DifferInterface) -> bool return str(t1) != str(t2) return True + # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + # Implements: PatcherInterface + # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + def migrate_curies(self, curie_map: Mapping[CURIE, CURIE]) -> None: od = self.obo_document for t in od.stanzas.values(): diff --git a/src/oaklib/interfaces/differ_interface.py b/src/oaklib/interfaces/differ_interface.py index 61fafcd52..8602455eb 100644 --- a/src/oaklib/interfaces/differ_interface.py +++ b/src/oaklib/interfaces/differ_interface.py @@ -3,7 +3,7 @@ from abc import ABC from collections import defaultdict from dataclasses import dataclass -from typing import Any, Dict, Iterator, Optional, Tuple +from typing import Any, Dict, Iterator, List, Optional, Tuple import kgcl_schema.datamodel.kgcl as kgcl from kgcl_schema.datamodel.kgcl import ( @@ -32,13 +32,8 @@ ) from oaklib.datamodels.vocabulary import ( # OIO_SYNONYM_TYPE_PROPERTY, - CLASS_CREATION, DEPRECATED_PREDICATE, HAS_OBSOLESCENCE_REASON, - 
MAPPING_EDGE_DELETION,
-    NODE_CREATION,
-    NODE_DELETION,
-    NODE_TEXT_DEFINITION_CHANGE,
     OWL_CLASS,
     TERM_REPLACED_BY,
     TERMS_MERGED,
@@ -50,6 +45,8 @@
 TERM_LIST_DIFF = Tuple[CURIE, CURIE]
 RESIDUAL_KEY = "__RESIDUAL__"
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class DiffConfiguration:
@@ -57,7 +54,6 @@ class DiffConfiguration:
 
     simple: bool = False
     group_by_property: PRED_CURIE = None
-    yield_individual_changes: bool = True
 
 
 def _gen_id():
@@ -68,41 +64,70 @@ class DifferInterface(BasicOntologyInterface, ABC):
     """
     Generates Change objects between one ontology and another.
 
-    This uses the KGCL datamodel, see :ref:`kgcl-datamodel` for more information.
+    This uses the KGCL datamodel; see the KGCL documentation for more information.
     """
 
+    def changed_nodes(
+        self,
+        other_ontology: BasicOntologyInterface,
+        configuration: DiffConfiguration = None,
+        **kwargs,
+    ) -> Iterator[Tuple[CURIE, str]]:
+        raise NotImplementedError
+
+    def grouped_diff(
+        self,
+        *args,
+        **kwargs,
+    ) -> Iterator[Tuple[str, List[Change]]]:
+        """
+        Yields changes grouped by type.
+
+        This wraps :meth:`diff`, buffering all changes in memory before grouping them by type.
+
+        :param args: positional arguments passed through to :meth:`diff`
+        :param kwargs: keyword arguments passed through to :meth:`diff`
+        :return: iterator over (change type name, list of changes) tuples
+        """
+        changes = list(self.diff(*args, **kwargs))
+        change_map = defaultdict(list)
+        for change in changes:
+            change_map[change.type].append(change)
+        for k, vs in change_map.items():
+            yield k, vs
+
     def diff(
         self,
        other_ontology: BasicOntologyInterface,
         configuration: DiffConfiguration = None,
-    ) -> Iterator[Dict[str, Any]]:
+        **kwargs,
+    ) -> Iterator[Change]:
        """
         Diffs two ontologies.
 
         The changes that are yielded describe transitions from the current ontology
         to the other ontology.
 
-        Note that this is not guaranteed to diff every axiom in both ontologies. Only a subset of KGCL change
-        types are supported:
-
-        - NodeCreation
-        - NodeDeletion
-        - NodeMove
-        - NodeRename
-        - PredicateChange
-        - NewTextDefinition
-        - NodeTextDefinitionChange
-        - AddNodeToSubset
-        - RemoveNodeFromSubset
-
-        Preferred sequence of changes:
-        1. New classes
-        2. Label changes
-        3. Definition changes
-        4. Obsoletions
-        5. Added synonyms
-        6. Added definitions
-        7. Changed relationships
-        8. Changed subsets
+        Note that this is not guaranteed to diff every axiom in both ontologies. Different implementations
+        may support different subsets of KGCL change types.
+
+        Example usage:
+
+        >>> from oaklib import get_adapter
+        >>> from linkml_runtime.dumpers import yaml_dumper
+        >>> path1 = "simpleobo:tests/input/go-nucleus.obo"
+        >>> path2 = "simpleobo:tests/input/go-nucleus-modified.obo"
+        >>> ont1 = get_adapter(path1)
+        >>> ont2 = get_adapter(path2)
+        >>> for change in ont1.diff(ont2):
+        ...     print(yaml_dumper.dumps(change))
+        
+        ...
+        type: NodeRename
+        old_value: catalytic activity
+        new_value: enzyme activity
+        about_node: GO:0003824
+        ... 
+ :param other_ontology: Ontology to compare against :param configuration: Configuration for the differentiation @@ -110,10 +135,13 @@ def diff( """ if configuration is None: configuration = DiffConfiguration() + logging.info(f"Configuration: {configuration}") # * self => old ontology # * other_ontology => latest ontology other_ontology_entities = set(list(other_ontology.entities(filter_obsoletes=False))) + logger.info(f"other_ontology_entities = {len(other_ontology_entities)}") self_entities = set(list(self.entities(filter_obsoletes=False))) + logger.info(f"self_entities = {len(self_entities)}") intersection_of_entities = self_entities.intersection(other_ontology_entities) obsolete_nodes = set() @@ -123,61 +151,28 @@ def diff( # Node/Class Creation created_entities = other_ontology_entities - self_entities + logger.info(f"Created = {len(created_entities)}") + + logger.info("finding Creations") + for entity in created_entities: + types = other_ontology.owl_type(entity) + if OWL_CLASS in types: + yield ClassCreation(id=_gen_id(), about_node=entity) + # elif OIO_SYNONYM_TYPE_PROPERTY in types: + # yield NodeCreation( + # id=_gen_id(), about_node=OIO_SYNONYM_TYPE_PROPERTY + # ) + else: + yield NodeCreation(id=_gen_id(), about_node=entity) - if configuration.yield_individual_changes: - # Yield each creation individually - for entity in created_entities: - types = other_ontology.owl_type(entity) - if OWL_CLASS in types: - yield ClassCreation(id=_gen_id(), about_node=entity) - # elif OIO_SYNONYM_TYPE_PROPERTY in types: - # yield NodeCreation( - # id=_gen_id(), about_node=OIO_SYNONYM_TYPE_PROPERTY - # ) - else: - yield NodeCreation(id=_gen_id(), about_node=entity) - else: - # Collect creations and yield at the end - class_creations = [] - node_creations = [] - - for entity in created_entities: - types = other_ontology.owl_type(entity) - if OWL_CLASS in types: - class_creations.append(ClassCreation(id=_gen_id(), about_node=entity)) - # elif OIO_SYNONYM_TYPE_PROPERTY in types: - # node_creations.append( - # NodeCreation(id=_gen_id(), about_node=OIO_SYNONYM_TYPE_PROPERTY) - # ) - else: - node_creations.append(NodeCreation(id=_gen_id(), about_node=entity)) - - # Yield collected changes as a dictionary if there are any - changes = {} - if class_creations: - changes[CLASS_CREATION] = class_creations - if node_creations: - changes[NODE_CREATION] = node_creations - if changes: - yield changes - - # Node Deletion with consideration for yield_individual_changes flag + logger.info("finding Deletions") nodes_to_delete = self_entities - other_ontology_entities if nodes_to_delete: - if configuration.yield_individual_changes: - # Yield each deletion individually - for node in nodes_to_delete: - yield NodeDeletion(id=_gen_id(), about_node=node) - else: - # Yield all deletions at once in a dictionary - yield { - NODE_DELETION: [ - NodeDeletion(id=_gen_id(), about_node=node) for node in nodes_to_delete - ] - } - - # ! 
Obsoletions
+            for node in nodes_to_delete:
+                yield NodeDeletion(id=_gen_id(), about_node=node)
+
+        logger.info("finding Obsoletions")
         other_ontology_entities_with_obsoletes = set(
             other_ontology.entities(filter_obsoletes=False)
         )
@@ -194,7 +189,7 @@ def diff(
         self_ontology_without_obsoletes = set(list(self.entities(filter_obsoletes=True)))
         self_ontology_obsoletes = self_entities - self_ontology_without_obsoletes
 
-        # Find NodeUnobsoletions
+        logger.info("finding Unobsoletions")
         possible_unobsoletes = self_ontology_obsoletes.intersection(
             other_ontology_unobsolete_entities
         )
@@ -205,45 +200,23 @@
             other_ontology.entity_metadata_map,
         )
 
-        if configuration.yield_individual_changes:
-            # Yield each obsoletion_change object individually
-            for obsoletion_change in obsoletion_generator:
-                obsolete_nodes.add(obsoletion_change.about_node)
-                yield obsoletion_change
-        else:
-            # Collect all obsoletion_change objects in a dictionary and yield them at the end
-            obsoletion_changes = defaultdict(list)
-            for obsoletion_change in obsoletion_generator:
-                if obsoletion_change:
-                    class_name = obsoletion_change.type
-                    obsolete_nodes.add(obsoletion_change.about_node)
-                    obsoletion_changes.setdefault(class_name, []).append(obsoletion_change)
-
-            if obsoletion_changes:
-                yield obsoletion_changes
-
-        # ! Remove obsolete nodes from relevant sets
+        logger.info("yielding obsoletion changes")
+        for obsoletion_change in obsoletion_generator:
+            obsolete_nodes.add(obsoletion_change.about_node)
+            yield obsoletion_change
+
+        logger.info("Remove obsolete nodes from relevant sets")
         intersection_of_entities = self_ontology_without_obsoletes.intersection(
             other_ontology_entities_without_obsoletes
         )
 
-        # Initialize variables for label changes, definition changes, new definitions, and synonyms
-        if not configuration.yield_individual_changes:
-            label_change_list = []
-            definition_changes = defaultdict(list)
-            new_definition_list = []
-            synonym_changes = defaultdict(list)
-            edge_creation_list = []
-            edge_deletion_list = []
-            edge_change_list = []
-            subset_addition_list = []
-            subset_removal_list = []
+        logger.info("finding remaining changes")
 
         self_aliases = {}
         other_aliases = {}
 
         # Loop through each entity once and process
-        # ! 
label changes, definition changes, new definitions, and synonyms + logger.info("label changes, definition changes, new definitions, and synonyms") for entity in intersection_of_entities: # Label change if self.label(entity) != other_ontology.label(entity): @@ -253,10 +226,7 @@ def diff( old_value=self.label(entity), new_value=other_ontology.label(entity), ) - if configuration.yield_individual_changes: - yield node_rename - else: - label_change_list.append(node_rename) + yield node_rename # Definition changes old_value = self.definition(entity) @@ -268,10 +238,7 @@ def diff( new_value=new_value, old_value=old_value, ) - if configuration.yield_individual_changes: - yield change - else: - definition_changes[NODE_TEXT_DEFINITION_CHANGE].append(change) + yield change # New definitions if self.definition(entity) is None and other_ontology.definition(entity) is not None: @@ -281,10 +248,7 @@ def diff( new_value=other_ontology.definition(entity), old_value=self.definition(entity), ) - if configuration.yield_individual_changes: - yield new_def - else: - new_definition_list.append(new_def) + yield new_def # Synonyms - compute both sets of aliases self_aliases[entity] = set(self.alias_relationships(entity, exclude_labels=True)) @@ -307,10 +271,7 @@ def diff( predicate=predicate, object=xref, ) - if configuration.yield_individual_changes: - yield edge_created - else: - edge_creation_list.append(edge_created) + yield edge_created if mappings_removed_set: for mapping in mappings_removed_set: predicate, xref = mapping @@ -320,10 +281,7 @@ def diff( predicate=predicate, object=xref, ) - if configuration.yield_individual_changes: - yield deleted_edge - else: - edge_deletion_list.append(deleted_edge) + yield deleted_edge if mapping_changed_set: for changes in mapping_changed_set: object, new_predicate, old_predicate = changes @@ -334,10 +292,7 @@ def diff( new_value=new_predicate, ) - if configuration.yield_individual_changes: - yield edge_change - else: - edge_change_list.append(edge_change) + yield edge_change # ! 
Subset changes self_subsets = set(self.terms_subsets([entity])) @@ -351,10 +306,7 @@ def diff( about_node=entity, in_subset=subset, ) - if configuration.yield_individual_changes: - yield change - else: - subset_addition_list.append(change) + yield change if subsets_removed_set: for _, subset in subsets_removed_set: change = RemoveNodeFromSubset( @@ -362,46 +314,16 @@ def diff( about_node=entity, in_subset=subset, ) - if configuration.yield_individual_changes: - yield change - else: - subset_removal_list.append(change) - - # Yield collected changes after processing all entities - if not configuration.yield_individual_changes: - if label_change_list: - yield {NodeRename.__name__: label_change_list} - if definition_changes[NodeTextDefinitionChange.__name__]: - yield definition_changes - if new_definition_list: - yield {NewTextDefinition.__name__: new_definition_list} - if edge_creation_list: - yield {EdgeCreation.__name__: edge_creation_list} - if edge_deletion_list: - yield {MAPPING_EDGE_DELETION: edge_deletion_list} - if edge_change_list: - yield {EdgeChange.__name__: edge_change_list} - if subset_addition_list: - yield {AddNodeToSubset.__name__: subset_addition_list} - if subset_removal_list: - yield {RemoveNodeFromSubset.__name__: subset_removal_list} - - # Process synonyms changes after collecting all aliases + yield change + + logger.info("Process synonyms changes after collecting all aliases") synonyms_generator = _generate_synonym_changes( intersection_of_entities, self_aliases, other_aliases ) - if configuration.yield_individual_changes: - # Yield each synonyms_change object individually - for synonyms_change in synonyms_generator: - yield synonyms_change - else: - # Collect all changes in a defaultdict and yield them at the end - for synonyms_change in synonyms_generator: - synonym_changes[synonyms_change.__class__.__name__].append(synonyms_change) - if synonym_changes: - yield synonym_changes + for synonyms_change in synonyms_generator: + yield synonyms_change - # ! Relationships + logger.info("Relationships") self_out_rels = { entity: set(self.outgoing_relationships(entity)) for entity in self_ontology_without_obsoletes @@ -412,23 +334,9 @@ def diff( } # Process the entities in parallel using a generator - list_of_changes = defaultdict(list) if not configuration.yield_individual_changes else [] - for relationship_changes in _parallely_get_relationship_changes( - self_ontology_without_obsoletes, - self_out_rels, - other_out_rels, - configuration.yield_individual_changes, - ): - for change in relationship_changes: - if configuration.yield_individual_changes: - yield change - else: - # Collect all changes in a defaultdict - for change_type, change_list in change.items(): - list_of_changes.setdefault(change_type, []).extend(change_list) - - if not configuration.yield_individual_changes: - yield list_of_changes # Yield the collected changes once at the end + yield from _parallely_get_relationship_changes( + self_ontology_without_obsoletes, self_out_rels, other_out_rels, + ) def diff_summary( self, @@ -485,7 +393,7 @@ def diff_summary( if len(v) == 1: partition = v[0] else: - logging.warning( + logger.warning( f"Multiple values for {configuration.group_by_property} = {v}" ) if partition not in summary: @@ -529,12 +437,13 @@ def compare_term_in_two_ontologies( other_ontology: BasicOntologyInterface, curie: CURIE, other_curie: CURIE = None, + **kwargs, ) -> Any: raise NotImplementedError # ! 
Helper functions for the diff method -def _generate_synonym_changes(self_entities, self_aliases, other_aliases): +def _generate_synonym_changes(self_entities, self_aliases, other_aliases) -> Iterator[Change]: for e1 in self_entities: e1_arels = self_aliases[e1] e2_arels = other_aliases[e1] @@ -569,7 +478,7 @@ def _generate_synonym_changes(self_entities, self_aliases, other_aliases): yield synonym_change -def _process_deprecation_data(deprecation_data_item): +def _process_deprecation_data(deprecation_data_item) -> Iterator[Change]: e1, e1_dep, e2_dep, e2_meta = deprecation_data_item if e1_dep != e2_dep: if not e1_dep and e2_dep: @@ -617,10 +526,10 @@ def _generate_obsoletion_changes( yield result -def _generate_relation_changes(e1, self_out_rels, other_out_rels, yield_individual_changes): +def _generate_relation_changes(e1, self_out_rels, other_out_rels) -> List[Change]: e1_rels = self_out_rels[e1] e2_rels = other_out_rels[e1] - changes = [] if yield_individual_changes else defaultdict(list) + changes = [] for rel in e1_rels.difference(e2_rels): pred, alias = rel @@ -632,45 +541,30 @@ def _generate_relation_changes(e1, self_out_rels, other_out_rels, yield_individu change = PredicateChange( id=_gen_id(), about_edge=edge, old_value=pred, new_value=switches[0] ) - if yield_individual_changes: - changes.append(change) - else: - changes.setdefault(PredicateChange.__name__, []).append(change) + changes.append(change) else: change = EdgeDeletion(id=_gen_id(), subject=e1, predicate=pred, object=alias) - if yield_individual_changes: - changes.append(change) - else: - changes.setdefault(EdgeDeletion.__name__, []).append(change) + changes.append(change) for rel in e2_rels.difference(e1_rels): pred, alias = rel edge = Edge(subject=e1, predicate=pred, object=alias) change = NodeMove(id=_gen_id(), about_edge=edge) - if yield_individual_changes: - changes.append(change) - else: - changes.setdefault(NodeMove.__name__, []).append(change) - - # If not yielding individual changes and there are changes, return them as a dictionary - if not yield_individual_changes and changes: - return [changes] + changes.append(change) - # Otherwise, return the list of changes return changes def _parallely_get_relationship_changes( - self_entities, self_out_rels, other_out_rels, yield_individual_changes -): + self_entities, self_out_rels, other_out_rels +) -> Iterator[Change]: with multiprocessing.Pool() as pool: results = pool.starmap( _generate_relation_changes, - [(e1, self_out_rels, other_out_rels, yield_individual_changes) for e1 in self_entities], + [(e1, self_out_rels, other_out_rels) for e1 in self_entities], ) for result in results: - if result: - yield result + yield from result def _find_mapping_changes(set1, set2): diff --git a/src/oaklib/io/streaming_markdown_writer.py b/src/oaklib/io/streaming_markdown_writer.py index 54e44f9bf..dd89451f9 100644 --- a/src/oaklib/io/streaming_markdown_writer.py +++ b/src/oaklib/io/streaming_markdown_writer.py @@ -33,39 +33,40 @@ def emit(self, curie_or_change: Union[str, Dict], label=None, **kwargs): oi = self.ontology_interface other_oi = kwargs.get("other_impl", None) if isinstance(curie_or_change, dict): + # TODO: have a more robust way to determine if this is a change change_handler = ChangeHandler(file=self.file, oi=other_oi) change_handler.process_changes(curie_or_change) - else: - if label is None: - label = oi.label(curie_or_change) - self.file.write(f"## {curie_or_change} {label}\n\n") - defn = oi.definition(curie_or_change) - if defn: - self.file.write(f"_{defn}_\n\n") 
- self.file.write("### Xrefs\n\n") + return + if label is None: + label = oi.label(curie_or_change) + self.file.write(f"## {curie_or_change} {label}\n\n") + defn = oi.definition(curie_or_change) + if defn: + self.file.write(f"_{defn}_\n\n") + self.file.write("### Xrefs\n\n") - for _, x in oi.simple_mappings_by_curie(curie_or_change): - self.file.write(f" * {x}\n") - self.file.write("\n") - if isinstance(oi, OboGraphInterface): - self.file.write("### Relationships\n\n") - for k, vs in oi.outgoing_relationship_map(curie_or_change).items(): - p = predicate_code_map.get(k, None) + for _, x in oi.simple_mappings_by_curie(curie_or_change): + self.file.write(f" * {x}\n") + self.file.write("\n") + if isinstance(oi, OboGraphInterface): + self.file.write("### Relationships\n\n") + for k, vs in oi.outgoing_relationship_map(curie_or_change).items(): + p = predicate_code_map.get(k, None) + if p is None: + p = oi.label(k) if p is None: - p = oi.label(k) - if p is None: - p = k - self.file.write(f"* {p}\n") - for v in vs: - self.file.write(f' * {v} "{oi.label(curie_or_change)}"\n') - if ( - self.display_options - and "t" in self.display_options - and isinstance(oi, TaxonConstraintInterface) - ): - self.file.write("### Taxon Constraints\n\n") - tc_subj = oi.get_term_with_taxon_constraints(curie_or_change) - for tc in tc_subj.never_in: - self.file.write(f"* {tc}\n") + p = k + self.file.write(f"* {p}\n") + for v in vs: + self.file.write(f' * {v} "{oi.label(curie_or_change)}"\n') + if ( + self.display_options + and "t" in self.display_options + and isinstance(oi, TaxonConstraintInterface) + ): + self.file.write("### Taxon Constraints\n\n") + tc_subj = oi.get_term_with_taxon_constraints(curie_or_change) + for tc in tc_subj.never_in: + self.file.write(f"* {tc}\n") - self.file.write("\n") + self.file.write("\n") diff --git a/src/oaklib/utilities/mapping/mapping_validation.py b/src/oaklib/utilities/mapping/mapping_validation.py index 777593813..ad0651704 100644 --- a/src/oaklib/utilities/mapping/mapping_validation.py +++ b/src/oaklib/utilities/mapping/mapping_validation.py @@ -261,10 +261,16 @@ def validate_mappings( subject_adapter = lookup_mapping_adapter(m.subject_id, adapters) object_adapter = lookup_mapping_adapter(m.object_id, adapters) comments = [] - if m.subject_id in _obsoletes(subject_prefix, adapters): + subject_is_obsolete = m.subject_id in _obsoletes(subject_prefix, adapters) + object_is_obsolete = m.object_id in _obsoletes(object_prefix, adapters) + if subject_is_obsolete and not object_is_obsolete: comments.append("subject is obsolete") - if m.object_id in _obsoletes(object_prefix, adapters): + if object_is_obsolete and not subject_is_obsolete: comments.append("object is obsolete") + if subject_is_obsolete and object_is_obsolete: + logging.info( + f"both {m.subject_id} and {m.object_id} are obsolete, but this is not a violation" + ) if m.mapping_cardinality != MappingCardinalityEnum(MappingCardinalityEnum["1:1"]): if m.predicate_id == SKOS_EXACT_MATCH or ( m.predicate_id == HAS_DBXREF and xref_is_bijective diff --git a/src/oaklib/utilities/writers/change_handler.py b/src/oaklib/utilities/writers/change_handler.py index bfdb2275a..d729e44e0 100644 --- a/src/oaklib/utilities/writers/change_handler.py +++ b/src/oaklib/utilities/writers/change_handler.py @@ -1,6 +1,9 @@ """Change Handler Class.""" from dataclasses import dataclass +from typing import Dict + +from kgcl_schema.datamodel.kgcl import Change @dataclass @@ -317,7 +320,7 @@ def handle_node_direct_merge(self, value): # # Implement place 
under handling logic here # logging.info("Place under handling not yet implemented.") - def process_changes(self, curie_or_change): + def process_changes(self, curie_or_change: Dict[str, Change]): # Write overview and summary at the beginning of the document # self.write_markdown_overview_and_summary(curie_or_change) dispatch_table = { diff --git a/tests/test_implementations/__init__.py b/tests/test_implementations/__init__.py index 021a1c909..006b9f788 100644 --- a/tests/test_implementations/__init__.py +++ b/tests/test_implementations/__init__.py @@ -16,7 +16,7 @@ from kgcl_schema.datamodel import kgcl from kgcl_schema.datamodel.kgcl import Change, NodeObsoletion from kgcl_schema.grammar.render_operations import render -from linkml_runtime.dumpers import json_dumper +from linkml_runtime.dumpers import json_dumper, yaml_dumper from oaklib import BasicOntologyInterface, get_adapter from oaklib.datamodels import obograph from oaklib.datamodels.association import Association @@ -993,13 +993,19 @@ def test_diff(self, oi: DifferInterface, oi_modified: DifferInterface): ), ] for ch in diff: + if isinstance(ch, list): + raise ValueError(f"Unexpected list: {[type(x) for x in ch]}") ch.id = FIXED_ID if ch in expected: expected.remove(ch) else: - logging.error(f"Unexpected change: {ch}") + logging.error(f"Unexpected change [{n_unexpected}]: {ch}") + logging.error(yaml_dumper.dumps(ch)) n_unexpected += 1 ch.type = type(ch).__name__ + for e in expected: + print("Expected not found:") + print(yaml_dumper.dumps(e)) test.assertEqual(0, len(expected), f"Expected changes not found: {expected}") expected_rev = [ kgcl.NewSynonym( @@ -1027,11 +1033,21 @@ def test_diff(self, oi: DifferInterface, oi_modified: DifferInterface): if ch in expected_rev: expected_rev.remove(ch) else: - logging.error(f"Unexpected change: {ch}") - n_unexpected += 1 + # TODO: different diff implementations differ with class creation + if isinstance(ch, kgcl.EdgeChange) and ch.subject == "GO:0033673": + pass + elif isinstance(ch, kgcl.NodeChange) and ch.about_node == "GO:0033673": + pass + else: + logging.error(f"Unexpected rev change: {ch}") + logging.error(yaml_dumper.dumps(ch)) + n_unexpected += 1 ch.type = type(ch).__name__ + for e in expected_rev: + print("Expected (reversed) not found:") + print(yaml_dumper.dumps(e)) test.assertEqual(0, len(expected_rev), f"Expected changes not found: {expected_rev}") - test.assertEqual(0, n_unexpected) + test.assertEqual(0, n_unexpected, f"Unexpected changes: {n_unexpected}") # test diff summary summary = oi.diff_summary(oi_modified) logging.info(summary) @@ -1408,10 +1424,12 @@ def test_patcher( change_obj = _as_json_dict_no_id(diff) if "old_value" in change_obj and "new_value" in change_obj: del change_obj["old_value"] - print(f"LOOKING FOR xxx {change_obj}") + logging.info(f"LOOKING FOR {change_obj}") if change_obj in expected_changes: expected_changes.remove(change_obj) else: + logging.error("not found:") + logging.error(yaml_dumper.dumps(change_obj)) raise ValueError(f"Cannot find: {change_obj} in {expected_changes}") test.assertCountEqual([], expected_changes) diff --git a/tests/test_implementations/test_bioportal.py b/tests/test_implementations/test_bioportal.py index 2b6eba82b..173644a5a 100644 --- a/tests/test_implementations/test_bioportal.py +++ b/tests/test_implementations/test_bioportal.py @@ -76,7 +76,9 @@ def test_ontology_versions(self): self.assertIn("5.0.0", versions) self.assertIn("v3.2.1", versions) - 
@mock.patch("oaklib.implementations.ontoportal.bioportal_implementation.BioPortalImplementation") + @mock.patch( + "oaklib.implementations.ontoportal.bioportal_implementation.BioPortalImplementation" + ) def test_ontology_metadata(self, mock_impl): mock_impl.return_value = { "id": "OBI",