Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve synonym parser #249

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 41 additions & 49 deletions src/pyobo/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,17 @@

from .constants import DATE_FORMAT, PROVENANCE_PREFIXES
from .identifier_utils import normalize_curie
from .reader_utils import (
_chomp_axioms,
_chomp_references,
_chomp_specificity,
_chomp_typedef,
)
from .registries import curie_has_blacklisted_prefix, curie_is_blacklisted, remap_prefix
from .struct import (
Obo,
Reference,
Synonym,
SynonymSpecificities,
SynonymSpecificity,
SynonymTypeDef,
Term,
TypeDef,
Expand Down Expand Up @@ -119,8 +123,8 @@ def from_obonet(graph: nx.MultiDiGraph, *, strict: bool = True) -> Obo:
for typedef in iterate_graph_typedefs(graph, ontology_prefix=ontology_prefix)
}

synonym_typedefs: Mapping[str, SynonymTypeDef] = {
synonym_typedef.curie: synonym_typedef
synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef] = {
synonym_typedef.pair: synonym_typedef
for synonym_typedef in iterate_graph_synonym_typedefs(
graph, ontology_prefix=ontology_prefix
)
Expand Down Expand Up @@ -429,11 +433,11 @@ def _clean_definition(s: str) -> str:

def _extract_synonym(
s: str,
synonym_typedefs: Mapping[str, SynonymTypeDef],
synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef],
*,
node: Reference,
strict: bool = True,
ontology_prefix: str | None,
ontology_prefix: str,
) -> Synonym | None:
# TODO check if the synonym is written like a CURIE... it shouldn't but I've seen it happen
try:
Expand All @@ -442,66 +446,54 @@ def _extract_synonym(
logger.warning("[%s] invalid synonym: %s", node.curie, s)
return None

specificity: SynonymSpecificity | None = None
for _specificity in SynonymSpecificities:
if rest.startswith(_specificity):
specificity = _specificity
rest = rest[len(_specificity) :].strip()
break

stype: Reference | None = None
for _stype in synonym_typedefs.values():
# Since there aren't a lot of carefully defined synonym definitions, it
# can appear as a string or curie. Therefore, we might see temporary prefixes
# get added, so we should check against full curies as well as local unique
# identifiers
if rest.startswith(_stype.curie):
rest = rest[len(_stype.curie) :].strip()
stype = _stype.reference
break
elif rest.startswith(_stype.preferred_curie):
rest = rest[len(_stype.preferred_curie) :].strip()
stype = _stype.reference
break
elif rest.startswith(_stype.identifier):
rest = rest[len(_stype.identifier) :].strip()
stype = _stype.reference
break
specificity, rest = _chomp_specificity(rest)
synonym_typedef, rest = _chomp_typedef(
rest,
synonym_typedefs=synonym_typedefs,
strict=strict,
node=node,
ontology_prefix=ontology_prefix,
)
provenance, rest = _chomp_references(
rest, strict=strict, node=node, ontology_prefix=ontology_prefix
)
annotations = _chomp_axioms(rest, node=node, strict=strict)

if not rest.startswith("[") or not rest.endswith("]"):
provenance = []
else:
provenance = _parse_trailing_ref_list(
rest, strict=strict, node=node, ontology_prefix=ontology_prefix
)
return Synonym(
name=name,
specificity=specificity or "EXACT",
type=stype or DEFAULT_SYNONYM_TYPE.reference,
type=synonym_typedef.reference if synonym_typedef else DEFAULT_SYNONYM_TYPE.reference,
provenance=provenance,
annotations=annotations,
)


def _parse_trailing_ref_list(
rest, *, strict: bool = True, node: Reference, ontology_prefix: str | None
):
rest = rest.lstrip("[").rstrip("]")
return [
Reference.from_curie_or_uri(
curie.strip(), strict=strict, node=node, ontology_prefix=ontology_prefix
rest: str, *, strict: bool = True, node: Reference, ontology_prefix: str | None
) -> list[Reference]:
rest = rest.lstrip("[").rstrip("]") # FIXME this doesn't account for trailing annotations
rv = []
for curie in rest.split(","):
curie = curie.strip()
if not curie:
continue
reference = Reference.from_curie_or_uri(
curie, strict=strict, node=node, ontology_prefix=ontology_prefix
)
for curie in rest.split(",")
if curie.strip()
]
if reference is None:
logger.warning("[%s] could not parse provenance CURIE: %s", node.curie, curie)
continue
rv.append(reference)
return rv


def iterate_node_synonyms(
data: Mapping[str, Any],
synonym_typedefs: Mapping[str, SynonymTypeDef],
synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef],
*,
node: Reference,
strict: bool = False,
ontology_prefix: str | None,
ontology_prefix: str,
) -> Iterable[Synonym]:
"""Extract synonyms from a :mod:`obonet` node's data.

Expand Down
126 changes: 126 additions & 0 deletions src/pyobo/reader_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"""Utilities for reading OBO files."""

from __future__ import annotations

import logging
from collections import Counter
from collections.abc import Mapping

import bioontologies.upgrade
from curies import ReferenceTuple

from pyobo.struct import SynonymSpecificities, SynonymSpecificity
from pyobo.struct.struct import Reference, SynonymTypeDef, _synonym_typedef_warn, default_reference

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Counter keyed by pairs of strings.
# NOTE(review): not referenced in the visible part of this module - confirm where it is used.
TARGET_URI_WARNINGS: Counter[tuple[str, str]] = Counter()


def _chomp_specificity(s: str) -> tuple[SynonymSpecificity | None, str]:
    """Split a leading synonym specificity keyword off of a string.

    :param s: The text to inspect; surrounding whitespace is ignored.
    :returns: A pair of the first entry of ``SynonymSpecificities`` that the
        stripped text starts with (or ``None`` if there is none) and the
        remaining text with surrounding whitespace removed.
    """
    text = s.strip()
    matched = next(
        (keyword for keyword in SynonymSpecificities if text.startswith(keyword)),
        None,
    )
    if matched is None:
        return None, text
    return matched, text[len(matched) :].strip()


def _chomp_typedef(
s: str,
*,
synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef],
strict: bool = True,
node: Reference,
ontology_prefix: str,
) -> tuple[SynonymTypeDef | None, str]:
if not s:
# This might happen if a synonym is just given as a string
return None, ""

if s.startswith("[") or s.startswith("{"):
# there's no typedef reference here, just return
return None, s

try:
stype_curie, rest = (x.strip() for x in s.split(" ", 1))
except ValueError as e:
if "not enough values to unpack" not in str(e):
raise

# let's just check if this might be a CURIE all by itself.
# if there's a space, we are out of luck, otherwise, let's
# try to parse it like a curie
if " " in s:
# if there
return None, s

stype_curie, rest = s, ""

reference: Reference | None
if ":" not in stype_curie:
# this catches situation where it's "ABBREVIATION"
if xx := bioontologies.upgrade.upgrade(stype_curie):
reference = Reference(prefix=xx.prefix, identifier=xx.identifier)
else:
reference = default_reference(ontology_prefix, stype_curie)
else:
reference = Reference.from_curie_or_uri(
stype_curie,
strict=strict,
node=node,
ontology_prefix=ontology_prefix,
)
if reference is None:
logger.warning(
"[%s] unable to parse synonym type `%s` in line %s", node.curie, stype_curie, s
)
return None, rest

synonym_typedef = _synonym_typedef_warn(
ontology_prefix, predicate=reference, synonym_typedefs=synonym_typedefs
)
return synonym_typedef, rest


# Counts, per (ontology prefix, reference string) pair, how often a synonym reference
# failed to parse; _chomp_references consults it to log a warning only on the first failure.
SYNONYM_REFERENCE_WARNED: Counter[tuple[str, str]] = Counter()


def _chomp_references(
s: str, *, strict: bool = True, node: Reference, ontology_prefix: str
) -> tuple[list[Reference], str]:
if not s:
return [], ""
if not s.startswith("["):
if s.startswith("{"):
# This means there are no reference, but there are some qualifiers
return [], s
else:
logger.debug("[%s] synonym had no references: %s", node.curie, s)
return [], s

if "]" not in s:
logger.warning("[%s] missing closing square bracket in references: %s", node.curie, s)
return [], s

first, rest = s.lstrip("[").split("]", 1)
references = []
for curie in first.split(","):
curie = curie.strip()
if not curie:
continue
reference = Reference.from_curie_or_uri(
curie, strict=strict, node=node, ontology_prefix=ontology_prefix
)
if reference is None:
if not SYNONYM_REFERENCE_WARNED[ontology_prefix, curie]:
logger.warning("[%s] unable to parse synonym reference: %s", node.curie, curie)
SYNONYM_REFERENCE_WARNED[ontology_prefix, curie] += 1
continue
references.append(reference)
return references, rest


def _chomp_axioms(
s: str, *, strict: bool = True, node: Reference
) -> list[tuple[Reference, Reference]]:
return []
12 changes: 7 additions & 5 deletions src/pyobo/struct/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,11 +645,13 @@ def _typedef_warn(

def _synonym_typedef_warn(
prefix: str, predicate: Reference, synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef]
) -> bool:
) -> SynonymTypeDef | None:
if predicate.pair == DEFAULT_SYNONYM_TYPE.pair:
return False
if predicate.pair in default_typedefs or predicate.pair in synonym_typedefs:
return False
return DEFAULT_SYNONYM_TYPE
if predicate.pair in default_synonym_typedefs:
return default_synonym_typedefs[predicate.pair]
if predicate.pair in synonym_typedefs:
return synonym_typedefs[predicate.pair]
key = prefix, predicate
if key not in _SYNONYM_TYPEDEF_WARNINGS:
_SYNONYM_TYPEDEF_WARNINGS.add(key)
Expand All @@ -663,7 +665,7 @@ def _synonym_typedef_warn(
)
else:
logger.warning(f"[{prefix}] synonym typedef not defined: {predicate.preferred_curie}")
return True
return None


class BioregistryError(ValueError):
Expand Down
10 changes: 5 additions & 5 deletions tests/test_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,11 @@ def test_extract_definition_with_escapes(self):
def test_extract_synonym(self):
"""Test extracting synonym strings."""
iupac_name = SynonymTypeDef(
reference=Reference(prefix="obo", identifier="IUPAC_NAME", name="IUPAC NAME")
reference=default_reference(prefix="chebi", identifier="IUPAC_NAME", name="IUPAC NAME")
)
synoynym_typedefs = {
"IUPAC_NAME": iupac_name,
acronym.curie: acronym,
iupac_name.pair: iupac_name,
acronym.pair: acronym,
}

for expected_synonym, text in [
Expand Down Expand Up @@ -173,10 +173,10 @@ def test_extract_synonym(self):
def test_get_node_synonyms(self):
"""Test getting synonyms from a node in a :mod:`obonet` graph."""
iupac_name = SynonymTypeDef(
reference=Reference(prefix="obo", identifier="IUPAC_NAME", name="IUPAC NAME")
reference=default_reference(prefix="chebi", identifier="IUPAC_NAME", name="IUPAC NAME")
)
synoynym_typedefs = {
"IUPAC_NAME": iupac_name,
iupac_name.pair: iupac_name,
}
data = self.graph.nodes["CHEBI:51990"]
synonyms = list(
Expand Down
Loading