diff --git a/src/pyobo/reader.py b/src/pyobo/reader.py index 814279b9..e5176586 100644 --- a/src/pyobo/reader.py +++ b/src/pyobo/reader.py @@ -17,13 +17,17 @@ from .constants import DATE_FORMAT, PROVENANCE_PREFIXES from .identifier_utils import normalize_curie +from .reader_utils import ( + _chomp_axioms, + _chomp_references, + _chomp_specificity, + _chomp_typedef, +) from .registries import curie_has_blacklisted_prefix, curie_is_blacklisted, remap_prefix from .struct import ( Obo, Reference, Synonym, - SynonymSpecificities, - SynonymSpecificity, SynonymTypeDef, Term, TypeDef, @@ -119,8 +123,8 @@ def from_obonet(graph: nx.MultiDiGraph, *, strict: bool = True) -> Obo: for typedef in iterate_graph_typedefs(graph, ontology_prefix=ontology_prefix) } - synonym_typedefs: Mapping[str, SynonymTypeDef] = { - synonym_typedef.curie: synonym_typedef + synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef] = { + synonym_typedef.pair: synonym_typedef for synonym_typedef in iterate_graph_synonym_typedefs( graph, ontology_prefix=ontology_prefix ) @@ -429,11 +433,11 @@ def _clean_definition(s: str) -> str: def _extract_synonym( s: str, - synonym_typedefs: Mapping[str, SynonymTypeDef], + synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef], *, node: Reference, strict: bool = True, - ontology_prefix: str | None, + ontology_prefix: str, ) -> Synonym | None: # TODO check if the synonym is written like a CURIE... it shouldn't but I've seen it happen try: @@ -442,66 +446,54 @@ def _extract_synonym( logger.warning("[%s] invalid synonym: %s", node.curie, s) return None - specificity: SynonymSpecificity | None = None - for _specificity in SynonymSpecificities: - if rest.startswith(_specificity): - specificity = _specificity - rest = rest[len(_specificity) :].strip() - break - - stype: Reference | None = None - for _stype in synonym_typedefs.values(): - # Since there aren't a lot of carefully defined synonym definitions, it - # can appear as a string or curie. Therefore, we might see temporary prefixes - # get added, so we should check against full curies as well as local unique - # identifiers - if rest.startswith(_stype.curie): - rest = rest[len(_stype.curie) :].strip() - stype = _stype.reference - break - elif rest.startswith(_stype.preferred_curie): - rest = rest[len(_stype.preferred_curie) :].strip() - stype = _stype.reference - break - elif rest.startswith(_stype.identifier): - rest = rest[len(_stype.identifier) :].strip() - stype = _stype.reference - break + specificity, rest = _chomp_specificity(rest) + synonym_typedef, rest = _chomp_typedef( + rest, + synonym_typedefs=synonym_typedefs, + strict=strict, + node=node, + ontology_prefix=ontology_prefix, + ) + provenance, rest = _chomp_references( + rest, strict=strict, node=node, ontology_prefix=ontology_prefix + ) + annotations = _chomp_axioms(rest, node=node, strict=strict) - if not rest.startswith("[") or not rest.endswith("]"): - provenance = [] - else: - provenance = _parse_trailing_ref_list( - rest, strict=strict, node=node, ontology_prefix=ontology_prefix - ) return Synonym( name=name, specificity=specificity or "EXACT", - type=stype or DEFAULT_SYNONYM_TYPE.reference, + type=synonym_typedef.reference if synonym_typedef else DEFAULT_SYNONYM_TYPE.reference, provenance=provenance, + annotations=annotations, ) def _parse_trailing_ref_list( - rest, *, strict: bool = True, node: Reference, ontology_prefix: str | None -): - rest = rest.lstrip("[").rstrip("]") - return [ - Reference.from_curie_or_uri( - curie.strip(), strict=strict, node=node, ontology_prefix=ontology_prefix + rest: str, *, strict: bool = True, node: Reference, ontology_prefix: str | None +) -> list[Reference]: + rest = rest.lstrip("[").rstrip("]") # FIXME this doesn't account for trailing annotations + rv = [] + for curie in rest.split(","): + curie = curie.strip() + if not curie: + continue + reference = Reference.from_curie_or_uri( + curie, strict=strict, node=node, ontology_prefix=ontology_prefix ) - for curie in rest.split(",") - if curie.strip() - ] + if reference is None: + logger.warning("[%s] could not parse provenance CURIE: %s", node.curie, curie) + continue + rv.append(reference) + return rv def iterate_node_synonyms( data: Mapping[str, Any], - synonym_typedefs: Mapping[str, SynonymTypeDef], + synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef], *, node: Reference, strict: bool = False, - ontology_prefix: str | None, + ontology_prefix: str, ) -> Iterable[Synonym]: """Extract synonyms from a :mod:`obonet` node's data. diff --git a/src/pyobo/reader_utils.py b/src/pyobo/reader_utils.py new file mode 100644 index 00000000..b2021847 --- /dev/null +++ b/src/pyobo/reader_utils.py @@ -0,0 +1,126 @@ +"""Utilities for reading OBO files.""" + +from __future__ import annotations + +import logging +from collections import Counter +from collections.abc import Mapping + +import bioontologies.upgrade +from curies import ReferenceTuple + +from pyobo.struct import SynonymSpecificities, SynonymSpecificity +from pyobo.struct.struct import Reference, SynonymTypeDef, _synonym_typedef_warn, default_reference + +logger = logging.getLogger(__name__) + +TARGET_URI_WARNINGS: Counter[tuple[str, str]] = Counter() + + +def _chomp_specificity(s: str) -> tuple[SynonymSpecificity | None, str]: + s = s.strip() + for _specificity in SynonymSpecificities: + if s.startswith(_specificity): + return _specificity, s[len(_specificity) :].strip() + return None, s + + +def _chomp_typedef( + s: str, + *, + synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef], + strict: bool = True, + node: Reference, + ontology_prefix: str, +) -> tuple[SynonymTypeDef | None, str]: + if not s: + # This might happen if a synonym is just given as a string + return None, "" + + if s.startswith("[") or s.startswith("{"): + # there's no typedef reference here, just return + return None, s + + try: + stype_curie, rest = (x.strip() for x in s.split(" ", 1)) + except ValueError as e: + if "not enough values to unpack" not in str(e): + raise + + # let's just check if this might be a CURIE all by itself. + # if there's a space, we are out of luck, otherwise, let's + # try to parse it like a curie + if " " in s: + # if there + return None, s + + stype_curie, rest = s, "" + + reference: Reference | None + if ":" not in stype_curie: + # this catches situation where it's "ABBREVIATION" + if xx := bioontologies.upgrade.upgrade(stype_curie): + reference = Reference(prefix=xx.prefix, identifier=xx.identifier) + else: + reference = default_reference(ontology_prefix, stype_curie) + else: + reference = Reference.from_curie_or_uri( + stype_curie, + strict=strict, + node=node, + ontology_prefix=ontology_prefix, + ) + if reference is None: + logger.warning( + "[%s] unable to parse synonym type `%s` in line %s", node.curie, stype_curie, s + ) + return None, rest + + synonym_typedef = _synonym_typedef_warn( + ontology_prefix, predicate=reference, synonym_typedefs=synonym_typedefs + ) + return synonym_typedef, rest + + +SYNONYM_REFERENCE_WARNED: Counter[tuple[str, str]] = Counter() + + +def _chomp_references( + s: str, *, strict: bool = True, node: Reference, ontology_prefix: str +) -> tuple[list[Reference], str]: + if not s: + return [], "" + if not s.startswith("["): + if s.startswith("{"): + # This means there are no reference, but there are some qualifiers + return [], s + else: + logger.debug("[%s] synonym had no references: %s", node.curie, s) + return [], s + + if "]" not in s: + logger.warning("[%s] missing closing square bracket in references: %s", node.curie, s) + return [], s + + first, rest = s.lstrip("[").split("]", 1) + references = [] + for curie in first.split(","): + curie = curie.strip() + if not curie: + continue + reference = Reference.from_curie_or_uri( + curie, strict=strict, node=node, ontology_prefix=ontology_prefix + ) + if reference is None: + if not SYNONYM_REFERENCE_WARNED[ontology_prefix, curie]: + logger.warning("[%s] unable to parse synonym reference: %s", node.curie, curie) + SYNONYM_REFERENCE_WARNED[ontology_prefix, curie] += 1 + continue + references.append(reference) + return references, rest + + +def _chomp_axioms( + s: str, *, strict: bool = True, node: Reference +) -> list[tuple[Reference, Reference]]: + return [] diff --git a/src/pyobo/struct/struct.py b/src/pyobo/struct/struct.py index 994b4e5b..133207fb 100644 --- a/src/pyobo/struct/struct.py +++ b/src/pyobo/struct/struct.py @@ -645,11 +645,13 @@ def _typedef_warn( def _synonym_typedef_warn( prefix: str, predicate: Reference, synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef] -) -> bool: +) -> SynonymTypeDef | None: if predicate.pair == DEFAULT_SYNONYM_TYPE.pair: - return False - if predicate.pair in default_typedefs or predicate.pair in synonym_typedefs: - return False + return DEFAULT_SYNONYM_TYPE + if predicate.pair in default_synonym_typedefs: + return default_synonym_typedefs[predicate.pair] + if predicate.pair in synonym_typedefs: + return synonym_typedefs[predicate.pair] key = prefix, predicate if key not in _SYNONYM_TYPEDEF_WARNINGS: _SYNONYM_TYPEDEF_WARNINGS.add(key) @@ -663,7 +665,7 @@ def _synonym_typedef_warn( ) else: logger.warning(f"[{prefix}] synonym typedef not defined: {predicate.preferred_curie}") - return True + return None class BioregistryError(ValueError): diff --git a/tests/test_get.py b/tests/test_get.py index 94b305f0..be04c5fd 100644 --- a/tests/test_get.py +++ b/tests/test_get.py @@ -106,11 +106,11 @@ def test_extract_definition_with_escapes(self): def test_extract_synonym(self): """Test extracting synonym strings.""" iupac_name = SynonymTypeDef( - reference=Reference(prefix="obo", identifier="IUPAC_NAME", name="IUPAC NAME") + reference=default_reference(prefix="chebi", identifier="IUPAC_NAME", name="IUPAC NAME") ) synoynym_typedefs = { - "IUPAC_NAME": iupac_name, - acronym.curie: acronym, + iupac_name.pair: iupac_name, + acronym.pair: acronym, } for expected_synonym, text in [ @@ -173,10 +173,10 @@ def test_extract_synonym(self): def test_get_node_synonyms(self): """Test getting synonyms from a node in a :mod:`obonet` graph.""" iupac_name = SynonymTypeDef( - reference=Reference(prefix="obo", identifier="IUPAC_NAME", name="IUPAC NAME") + reference=default_reference(prefix="chebi", identifier="IUPAC_NAME", name="IUPAC NAME") ) synoynym_typedefs = { - "IUPAC_NAME": iupac_name, + iupac_name.pair: iupac_name, } data = self.graph.nodes["CHEBI:51990"] synonyms = list( diff --git a/tests/test_reader.py b/tests/test_reader.py index a4f025aa..febb556d 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -10,7 +10,7 @@ from pyobo import Obo, Reference, Term from pyobo.reader import from_obonet, get_first_nonescaped_quote from pyobo.struct import default_reference -from pyobo.struct.struct import DEFAULT_SYNONYM_TYPE +from pyobo.struct.struct import DEFAULT_SYNONYM_TYPE, abbreviation from pyobo.struct.typedef import TypeDef, exact_match, has_dbxref, is_conjugate_base_of, see_also CHARLIE = Reference(prefix="orcid", identifier="0000-0003-4423-4370") @@ -704,6 +704,24 @@ def test_synonym_full(self) -> None: synonym.provenance, ) + def test_synonym_dashed(self) -> None: + """Test parsing a synonym with specificity, type, and provenance.""" + ontology = _read("""\ + ontology: chebi + synonymtypedef: OMO:1234567 "" + + [Term] + id: CHEBI:1234 + synonym: "Brown-Pearce tumour" EXACT OMO:0003005 [] + """) + term = self.get_only_term(ontology) + self.assertEqual(1, len(term.synonyms)) + synonym = term.synonyms[0] + self.assertEqual("Brown-Pearce tumour", synonym.name) + self.assertEqual("EXACT", synonym.specificity) + self.assertEqual(Reference(prefix="omo", identifier="0003005"), synonym.type) + self.assertEqual([], synonym.provenance) + def test_synonym_url(self) -> None: """Test parsing a synonym defined with a PURL.""" ontology = _read(f"""\ @@ -728,6 +746,102 @@ def test_synonym_url(self) -> None: synonym.provenance, ) + def test_synonym_casing(self) -> None: + """Test parsing a synonym when an alternate case is used.""" + ontology = _read(f"""\ + ontology: chebi + synonymtypedef: OMO:1234567 "" + + [Term] + id: CHEBI:1234 + synonym: "LTEC I" EXACT omo:1234567 [Orphanet:93938,{CHARLIE.curie}] + """) + term = self.get_only_term(ontology) + self.assertEqual(1, len(term.synonyms)) + synonym = term.synonyms[0] + self.assertEqual("LTEC I", synonym.name) + self.assertEqual("EXACT", synonym.specificity) + self.assertEqual(Reference(prefix="omo", identifier="1234567"), synonym.type) + self.assertEqual( + [ + Reference(prefix="orphanet", identifier="93938"), + CHARLIE, + ], + synonym.provenance, + ) + + def test_synonym_default(self) -> None: + """Test parsing a synonym that has a built-in prefix.""" + ontology = _read("""\ + ontology: chebi + + [Term] + id: CHEBI:1234 + synonym: "DoguAnadoluKirmizisi" EXACT most_common_name [] + """) + term = self.get_only_term(ontology) + self.assertEqual(1, len(term.synonyms)) + synonym = term.synonyms[0] + self.assertEqual("DoguAnadoluKirmizisi", synonym.name) + self.assertEqual("EXACT", synonym.specificity) + self.assertEqual(DEFAULT_SYNONYM_TYPE.reference, synonym.type) + + # now, we define it properly + ontology = _read("""\ + ontology: chebi + synonymtypedef: most_common_name "most common name" + + [Term] + id: CHEBI:1234 + synonym: "DoguAnadoluKirmizisi" EXACT most_common_name [] + """) + term = self.get_only_term(ontology) + self.assertEqual(1, len(term.synonyms)) + synonym = term.synonyms[0] + self.assertEqual("DoguAnadoluKirmizisi", synonym.name) + self.assertEqual("EXACT", synonym.specificity) + self.assertEqual(default_reference("chebi", "most_common_name"), synonym.type) + + def test_synonym_builtin(self) -> None: + """Test parsing a synonym with specificity, type, and provenance.""" + ontology = _read("""\ + ontology: chebi + + [Term] + id: CHEBI:1234 + synonym: "COP" EXACT ABBREVIATION [] + """) + term = self.get_only_term(ontology) + self.assertEqual(1, len(term.synonyms)) + synonym = term.synonyms[0] + self.assertEqual("COP", synonym.name) + self.assertEqual("EXACT", synonym.specificity) + self.assertEqual(abbreviation.reference, synonym.type) + self.assertEqual(Reference(prefix="OMO", identifier="0003000"), synonym.type) + + @unittest.skip( + reason="This needs to be fixed upstream, since obonet's " + "parser for synonyms fails on the open squiggly bracket {" + ) + def test_synonym_with_annotations(self) -> None: + """Test parsing a synonym with annotations.""" + ontology = _read("""\ + ontology: chebi + + [Term] + id: CHEBI:1234 + synonym: "10*3.{copies}/mL" EXACT [] {http://purl.obolibrary.org/obo/NCIT_P383="AB", http://purl.obolibrary.org/obo/NCIT_P384="UCUM"} + """) + term = self.get_only_term(ontology) + self.assertEqual(1, len(term.synonyms)) + synonym = term.synonyms[0] + self.assertEqual("10*3.{copies}/mL", synonym.name) + self.assertEqual("EXACT", synonym.specificity) + self.assertEqual(DEFAULT_SYNONYM_TYPE, synonym.type) + self.assertEqual([], synonym.provenance) + # TODO update this when adding annotation parsing! + self.assertEqual([], synonym.annotations) + def test_parent(self) -> None: """Test parsing out a parent.""" ontology = _read("""\