Skip to content

Commit

Permalink
Title cleaning updates
Browse files Browse the repository at this point in the history
- Update: Refactor: Cleaned some redundancy in how this was being called. Simplified code. Modularized other parts of code.
  • Loading branch information
joeflack4 committed Sep 23, 2024
1 parent 9ff2520 commit f5e4794
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 88 deletions.
20 changes: 10 additions & 10 deletions omim2obo/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@
from rdflib.term import Identifier

from omim2obo.namespaces import *
from omim2obo.parsers.omim_entry_parser import get_alt_and_included_titles_and_symbols, get_pubs, get_mapped_ids, \
LabelCleaner
from omim2obo.parsers.omim_entry_parser import cleanup_title, get_alt_and_included_titles_and_symbols, get_pubs, \
get_mapped_ids, \
recapitalize_acronyms_in_title
from omim2obo.config import ROOT_DIR, GLOBAL_TERMS
from omim2obo.parsers.omim_txt_parser import *

Expand Down Expand Up @@ -140,7 +141,6 @@ def get_graph():
TAX_ID = GLOBAL_TERMS[TAX_LABEL]
TAX_URI = URIRef(NCBITAXON + TAX_ID.split(':')[1])
CURIE_MAP = get_curie_maps()
label_cleaner = LabelCleaner()
CONFIG = {
'verbose': False
}
Expand Down Expand Up @@ -197,7 +197,7 @@ def omim2obo(use_cache: bool = False):
# Parse titles & symbols
omim_type, pref_titles_str, alt_titles_str, inc_titles_str = omim_type_and_titles[omim_id]
pref_titles_and_symbols: List[str] = [x.strip() for x in pref_titles_str.split(';')]
pref_title, pref_symbols = pref_titles_and_symbols[0], pref_titles_and_symbols[1:]
pref_title, pref_symbols = cleanup_title(pref_titles_and_symbols[0]), pref_titles_and_symbols[1:]
alt_titles, alt_symbols, former_alt_titles, former_alt_symbols = \
get_alt_and_included_titles_and_symbols(alt_titles_str)
included_titles, included_symbols, former_included_titles, former_included_symbols = \
Expand Down Expand Up @@ -225,7 +225,7 @@ def omim2obo(use_cache: bool = False):
LOG.warning(gene_label_err) # todo: rare (n=1?), but decide the best way to handle these situations
graph.add((omim_uri, RDFS.label, Literal(pref_symbols[0])))
else:
graph.add((omim_uri, RDFS.label, Literal(label_cleaner.clean(pref_title))))
graph.add((omim_uri, RDFS.label, Literal(pref_title)))

# todo: .clean()/.cleanup_label() 2nd param `explicit_abbrev` should be List[str] instead of str. And below,
# should pass all symbols/abbrevs from each of preferred, alt, included each time it is called. If no symbols
Expand All @@ -234,17 +234,17 @@ def omim2obo(use_cache: bool = False):

# Add synonyms
# - exact titles
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(label_cleaner.clean(pref_title, pref_abbrev))))
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(recapitalize_acronyms_in_title(pref_title, pref_abbrev))))
for title in alt_titles:
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(label_cleaner.clean(title, pref_abbrev))))
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(recapitalize_acronyms_in_title(title, pref_abbrev))))
# - exact abbreviations
for abbrevs in [pref_symbols, alt_symbols]:
for abbreviation in abbrevs:
add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasExactSynonym, abbreviation,
[(oboInOwl.hasSynonymType, OMO['0003000'])])
# - related, deprecated 'former' titles
for title in former_alt_titles:
clean_title = label_cleaner.clean(title, pref_abbrev)
clean_title = recapitalize_acronyms_in_title(title, pref_abbrev)
add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasRelatedSynonym, clean_title,
[(OWL.deprecated, Literal(True))])
# - related, deprecated 'former' abbreviations
Expand All @@ -259,7 +259,7 @@ def omim2obo(use_cache: bool = False):
graph.add((omim_uri, RDFS['comment'], Literal(included_comment)))
# - titles
for title in included_titles:
graph.add((omim_uri, URIRef(MONDONS.omim_included), Literal(label_cleaner.clean(title, pref_abbrev))))
graph.add((omim_uri, URIRef(MONDONS.omim_included), Literal(recapitalize_acronyms_in_title(title, pref_abbrev))))
# - symbols
for symbol in included_symbols:
add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), symbol, [
Expand All @@ -268,7 +268,7 @@ def omim2obo(use_cache: bool = False):
])
# - deprecated, 'former'
for title in former_included_titles:
clean_title = label_cleaner.clean(title, pref_abbrev)
clean_title = recapitalize_acronyms_in_title(title, pref_abbrev)
add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), clean_title,
[(OWL.deprecated, Literal(True))])
for symbol in former_included_symbols:
Expand Down
132 changes: 56 additions & 76 deletions omim2obo/parsers/omim_entry_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,31 @@
LOG = logging.getLogger('omim2obo.parsers.api_entry_parser')


def get_known_capitalizations() -> Dict[str, str]:
"""Get list of known capitalizations for proper names, acronyms, and the like.
TODO: Contains space-delimited words, e.g. "vitamin d". The way that
cleanup_label is currently implemented, each word in the label gets
replaced; i.e. it would try to replace "vitamin" and "d" separately. Hence,
this would fail.
Therefore, we should probably do this in 2 different operations: (1) use
the current 'word replacement' logic, but also, (2), at the end, do a
generic string replacement (e.g. my_str.replace(a, b). When implementing
(2), we should also split this dictionary into two separate dictionaries,
each for 1 of these 2 different purposes."""
path = DATA_DIR / 'known_capitalizations.tsv'
with open(path, "r") as file:
data_io = csv.reader(file, delimiter="\t")
data: List[List[str]] = [x for x in data_io]
df = pd.DataFrame(data[1:], columns=data[0])
d = {}
for index, row in df.iterrows():
d[row['lower_name']] = row['cap_name']
return d


CAPITALIZATION_REPLACEMENTS: Dict[str, str] = get_known_capitalizations()


# todo: This isn't used in the ingest to create omim.ttl. Did this have some other use case?
def transform_entry(entry) -> Graph:
"""
Expand Down Expand Up @@ -49,18 +74,18 @@ def transform_entry(entry) -> Graph:
abbrev = label.split(';')[1].strip() if ';' in label else None

if omim_type == OmimType.HERITABLE_PHENOTYPIC_MARKER.value: # %
graph.add((omim_uri, RDFS.label, Literal(cleanup_label(label))))
graph.add((omim_uri, RDFS.label, Literal(cleanup_title(label))))
graph.add((omim_uri, BIOLINK['category'], BIOLINK['Disease']))
elif omim_type == OmimType.GENE.value or omim_type == OmimType.HAS_AFFECTED_FEATURE.value: # * or +
omim_type = OmimType.GENE.value
graph.add((omim_uri, RDFS.label, Literal(abbrev)))
graph.add((omim_uri, RDFS.subClassOf, SO['0000704']))
graph.add((omim_uri, BIOLINK['category'], BIOLINK['Gene']))
elif omim_type == OmimType.PHENOTYPE.value: # #
graph.add((omim_uri, RDFS.label, Literal(cleanup_label(label))))
graph.add((omim_uri, RDFS.label, Literal(cleanup_title(label))))
graph.add((omim_uri, BIOLINK['category'], BIOLINK['Disease']))
else: # ^ or NULL (no prefix character)
graph.add((omim_uri, RDFS.label, Literal(cleanup_label(label))))
graph.add((omim_uri, RDFS.label, Literal(cleanup_title(label))))

graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(label)))
for label in other_labels:
Expand Down Expand Up @@ -122,12 +147,8 @@ def transform_entry(entry) -> Graph:
return graph


def _detect_abbreviations(
label: str,
explicit_abbrev: str = None,
trailing_abbrev: str = None,
CAPITALIZATION_THRESHOLD = 0.75
):
# todo: probably best to combine explicit abbrevs outside of this func
def _detect_abbreviations(label: str, explicit_abbrev: str = None, capitalization_threshold=0.75) -> List[str]:
"""Detect possible abbreviations / acronyms"""
# Compile regexp
acronyms_without_periods_compiler = re.compile('[A-Z]{1}[A-Z0-9]{1,}')
Expand All @@ -142,7 +163,7 @@ def _detect_abbreviations(
if word.upper() == word:
fully_capitalized_count += 1
is_largely_uppercase = \
fully_capitalized_count / len(words) >= CAPITALIZATION_THRESHOLD
fully_capitalized_count / len(words) >= capitalization_threshold

# Detect acronyms without periods
if is_largely_uppercase:
Expand All @@ -155,8 +176,7 @@ def _detect_abbreviations(
# Combine list of things to re-format
replacements = []
candidates: List[List[str]] = [
acronyms_with_periods, acronyms_without_periods, title_cased_abbrevs,
[trailing_abbrev], [explicit_abbrev]]
acronyms_with_periods, acronyms_without_periods, title_cased_abbrevs, [explicit_abbrev]]
for item_list in candidates:
for item in item_list:
if item:
Expand All @@ -166,22 +186,17 @@ def _detect_abbreviations(


# todo: rename? It's doing more than cleaning; it's mutating
# todo: This step should no longer be necessary as it is now done beforehand: "remove the abbreviation suffixes"
# todo: explicit_abbrev: Change to List[str]. See: https://github.com/monarch-initiative/omim/issues/129
def cleanup_label(
label: str,
explicit_abbrev: str = None,
replacement_case_method: str = 'lower', # lower | title | upper
replacement_case_method_acronyms: str = 'upper', # lower | title | upper
def cleanup_title(
title: str,
conjunctions: List[str] = ['and', 'but', 'yet', 'for', 'nor', 'so'],
little_preps: List[str] = [
'at', 'by', 'in', 'of', 'on', 'to', 'up', 'as', 'it', 'or'],
little_preps: List[str] = ['at', 'by', 'in', 'of', 'on', 'to', 'up', 'as', 'it', 'or'],
articles: List[str] = ['a', 'an', 'the'],
CAPITALIZATION_THRESHOLD = 0.75,
word_replacements: Dict[str, str] = None # w/ known cols
word_replacements: Dict[str, str] = CAPITALIZATION_REPLACEMENTS,
) -> str:
"""Reformat the ALL CAPS OMIM labels to something more pleasant to read.
:param title: A preferred, alternative, or included title.
1. Removes the abbreviation suffixes
2. Converts roman numerals to arabic
3. Makes the text Title Case, except for supplied conjunctions/prepositions/articles
Expand Down Expand Up @@ -218,19 +233,12 @@ def cleanup_label(
e.g.: Balint syndrome, Barre-Lieou syndrome, Wallerian degeneration, etc.
How to do this? Simply get/create a list of known eponyms? Is this feasible?
"""
# 1/3: Detect abbreviations / acronyms
label2 = label.split(r';')[0] if r';' in label else label
trailing_abbrev = label.split(r';')[1] if r';' in label else ''
possible_abbreviations = _detect_abbreviations(
label2, explicit_abbrev, trailing_abbrev, CAPITALIZATION_THRESHOLD)

# 2/3: Format label
# Simple method: Lower/title case everything but acronyms
# label_newcase = getattr(label2, replacement_case_method)()
# Advanced method: iteritavely format words
fixedwords = []
i = 0
for wrd in label2.split():
for wrd in title.split():
i += 1
# convert the roman numerals to numbers,
# but assume that the first word is not
Expand All @@ -246,22 +254,29 @@ def cleanup_label(
suffix = wrd.replace(toRoman(num), '', 1)
fixed = ''.join((str(num), suffix))
wrd = fixed
wrd = getattr(wrd, replacement_case_method)()
# todo: next few lines don't make sense. why lower 'wrd', and then conditionally lowercase it again?
wrd = wrd.lower()
# replace interior conjunctions, prepositions, and articles with lowercase, always
if wrd.lower() in (conjunctions + little_preps + articles) and i != 1:
if wrd in (conjunctions + little_preps + articles) and i != 1:
wrd = wrd.lower()
if word_replacements:
wrd = word_replacements.get(wrd, wrd)
fixedwords.append(wrd)
label_newcase = ' '.join(fixedwords)

# 3/3 Re-capitalize acronyms / words based on information contained w/in original label
formatted_label = copy(label_newcase)
for item in possible_abbreviations:
to_replace = getattr(item, replacement_case_method_acronyms)()
formatted_label = formatted_label.replace(to_replace, item)
return label_newcase

return formatted_label

# todo: explicit_abbrev: Change to List[str]. See: https://github.com/monarch-initiative/omim/issues/129
def recapitalize_acronyms_in_title(title: str, explicit_abbrev=None, capitalization_threshold=0.75) -> str:
"""Re-capitalize acronyms / words based on information contained w/in original label"""
# todo: probably best to combine explicit abbrevs outside of this func
possible_abbreviations = _detect_abbreviations(
title, explicit_abbrev, capitalization_threshold=capitalization_threshold)
title2 = title
for abbrev in possible_abbreviations:
title2 = title2.replace(abbrev.upper(), abbrev)
return title2


def remove_included_and_formerly_suffixes(title: str) -> str:
Expand All @@ -288,7 +303,7 @@ def clean_alt_and_included_titles(titles: List[str], symbols: List[str]) -> Tupl
titles2 = [remove_included_and_formerly_suffixes(x) for x in titles]
symbols2 = [remove_included_and_formerly_suffixes(x) for x in symbols]
# additional reformatting for titles
titles3 = [cleanup_label(x) for x in titles2]
titles3 = [cleanup_title(x) for x in titles2]
return titles3, symbols2


Expand Down Expand Up @@ -387,38 +402,3 @@ def get_phenotypic_series(entry) -> List[str]:
def get_process_allelic_variants(entry) -> List:
# Not sure when/if Dazhi intended to use this - joeflack4 2021/12/20
return []


def get_known_capitalizations() -> Dict[str, str]:
"""Get list of known capitalizations for proper names, acronyms, and the like.
TODO: Contains space-delimited words, e.g. "vitamin d". The way that
cleanup_label is currently implemented, each word in the label gets
replaced; i.e. it would try to replace "vitamin" and "d" separately. Hence,
this would fail.
Therefore, we should probably do this in 2 different operations: (1) use
the current 'word replacement' logic, but also, (2), at the end, do a
generic string replacement (e.g. my_str.replace(a, b). When implementing
(2), we should also split this dictionary into two separate dictionaries,
each for 1 of these 2 different purposes."""
path = DATA_DIR / 'known_capitalizations.tsv'
with open(path, "r") as file:
data_io = csv.reader(file, delimiter="\t")
data: List[List[str]] = [x for x in data_io]
df = pd.DataFrame(data[1:], columns=data[0])
d = {}
for index, row in df.iterrows():
d[row['lower_name']] = row['cap_name']
return d


class LabelCleaner():
"""Cleans labels"""

def __init__(self):
"""New obj"""
self.word_replacements: Dict[str, str] = get_known_capitalizations()

def clean(self, label, *args, **kwargs):
"""Overrides cleanup_label by adding word_replacements"""
return cleanup_label(
label, *args, **kwargs, word_replacements=self.word_replacements)
3 changes: 1 addition & 2 deletions omim2obo/parsers/omim_txt_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,9 @@
# https://www.ebi.ac.uk/ols/ontologies/ro/properties?iri=http://purl.obolibrary.org/obo/RO_0003304
'4': RO['0003304'],
}

MORBIDMAP_PHENOTYPE_MAPPING_KEY_INVERSE_PREDICATES = {
RO['0004013']: RO['0004003'],
}
}


def convert_txt_to_tsv(file_name: str):
Expand Down

0 comments on commit f5e4794

Please sign in to comment.