Skip to content

Commit

Permalink
Merge hot fix for polish into main (#96)
Browse files Browse the repository at this point in the history
* Changed PyPI version badge

For the next release, the PyPI version badge now comes from 'shields.io' instead of 'badge.fury.io'.

* Changed colour for refineGEMs version badge

* Adjusted handling of BioCyc identifiers in polish_annotations #95 #58

* Added requirement for importlib_resources=5.13.0 to Pipfile

* Added code to cope with missing sub-database prefixes for BioCyc identifiers #95

* Changed NaN identifier handling #95

* Fixed issue III: None prefix identifier pairs in invalid_curies.tsv #95

* Adjusted files with version for release 1.2.2
  • Loading branch information
GwennyGit authored Aug 16, 2023
1 parent c80e5f0 commit 23e307f
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 37 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ cobra = "==0.22.0"
biopython = "==1.79"
bioregistry = "==0.10.1"
bioservices = "==1.7.11"
importlib_resources = "==5.13.0"
memote = "==0.13.0"
pandas = "==1.2.4"
numpy = "==1.20.3"
Expand Down
2 changes: 1 addition & 1 deletion Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
![GitHub Pipenv locked dependency version](https://img.shields.io/github/pipenv/locked/dependency-version/draeger-lab/refinegems/refinegems)
![GitHub Pipenv locked dependency version](https://img.shields.io/github/pipenv/locked/dependency-version/draeger-lab/refinegems/refinegems?label=refineGEMs&color=B4A069)
![GitHub Pipenv locked Python version](https://img.shields.io/github/pipenv/locked/python-version/draeger-lab/refinegems)
[![Documentation Status](https://readthedocs.org/projects/refinegems/badge/?version=latest)](https://refinegems.readthedocs.io/en/latest/?badge=latest)
![GitHub last commit (branch)](https://img.shields.io/github/last-commit/draeger-lab/refinegems/main)
![Repo Size](https://img.shields.io/github/repo-size/draeger-lab/refinegems)
[![PyPI version](https://badge.fury.io/py/refineGEMs.svg)](https://badge.fury.io/py/refineGEMs)
![PyPI version](https://img.shields.io/pypi/v/refinegems?label=PyPI%20package&color=neongreen)
![PyPI - Format](https://img.shields.io/pypi/format/refinegems)
[![PyPI downloads](https://img.shields.io/pypi/dm/refinegems.svg)](https://pypistats.org/packages/refinegems)
[![DOI](https://zenodo.org/badge/359867657.svg)](https://zenodo.org/badge/latestdoi/359867657)
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
author = 'Famke Bäuerle and Gwendolyn O. Gusak'

# The full version, including alpha/beta/rc tags
release = '1.2.1'
release = '1.2.2'


# -- General configuration ---------------------------------------------------
Expand Down
133 changes: 101 additions & 32 deletions refinegems/polish.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
from datetime import date

__author__ = "Famke Baeuerle and Gwendolyn O. Gusak"


#------------------------------------------------ Constant variables --------------------------------------------------#
# Sub-database prefixes accepted in front of a BioCyc identifier ('<PREFIX>:<id>').
# get_set_of_curies compares the part before the first ':' of a BioCyc identifier
# against this list and prepends 'META:' when it is not one of these prefixes.
# NOTE(review): presumably these are the BioCyc "Tier 1" databases - confirm the list is complete.
BIOCYC_TIER1_DATABASES_PREFIXES = ['META', 'ECO', 'ECOLI', 'HUMAN']


#----------- Functions to add URIs from the entity IDs to the annotation field for metabolites & reactions ------------#
Expand Down Expand Up @@ -547,7 +551,7 @@ def cv_ncbiprotein(gene_list, email, protein_fasta: str, lab_strain: bool=False)
logging.warning(f'The following {len(genes_missing_annotation)} genes have no annotation, name & label (locus tag): {genes_missing_annotation}')


#------------------- Functions to change the CURIE pattern/CVTerm qualifier & qualifier type --------------------------#
#------------------- Functions to change the CURIE pattern/CVTerm qualifier & qualifier type --------------------------#
def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[str]], list[str]]:
"""| Gets a list of URIs
| & maps the database prefixes to their respective identifier sets
Expand Down Expand Up @@ -575,14 +579,50 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
curie = manager.parse_curie(extracted_curie) # Contains valid db prefix to identifiers pairs
curie = list(curie) # Turn tuple into list to allow item assignment

if not curie[0]: # Need to do own parsing if prefix is not valid
if curie[0]: # Prefix is valid but to have same result for same databases need to do a bit of own parsing
if re.fullmatch('^biocyc$', curie[0], re.IGNORECASE): # Check for biocyc to also add metacyc if possible
# Always add META if BioCyc sub-database prefixes are missing
curie = curie if curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else [curie[0], f'META:{curie[1]}']

if 'META' in curie[1]:
if is_valid_identifier(*curie): # Get the valid BioCyc identifier & Add to dictionary
prefix, identifier = normalize_parsed_curie(*curie)

if not curie_dict or (prefix not in curie_dict):
curie_dict[prefix] = SortedSet()
curie_dict[prefix].add(identifier)
else:
invalid_curies.append(f'{curie[0]}:{curie[1]}')

# Add the MetaCyc identifier additionally
curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE):
curie[0] = 'metacyc.reaction'
else:
curie[0] = 'metacyc.compound'
elif 'metacyc.' in curie[0]:
if is_valid_identifier(*curie): # Get the valid MetaCyc identifier & Add to dictionary
prefix, identifier = normalize_parsed_curie(*curie)

if not curie_dict or (prefix not in curie_dict):
curie_dict[prefix] = SortedSet()
curie_dict[prefix].add(identifier)
else:
invalid_curies.append(f'{curie[0]}:{curie[1]}')

# Add the BioCyc identifier additionally
curie = ['biocyc', f'META:{curie[1]}'] # Metacyc identifier comes after 'META:' in biocyc identifier
elif re.fullmatch('^brenda$', curie[0], re.IGNORECASE): # Brenda & EC code is the same
curie[0] = 'eccode'

elif not curie[0]: # Need to do own parsing if prefix is not valid
# Get CURIEs irrespective of pattern
if '/' in extracted_curie:
extracted_curie = extracted_curie.split('/')

# Check for NaN identifiers
if re.fullmatch('^nan$', extracted_curie[0], re.IGNORECASE) or re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE):
# Only return strings where the database prefix is 'NaN' but a possible identifier could be contained
if re.fullmatch('^nan$', extracted_curie[0], re.IGNORECASE) and not re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE):
invalid_curies.append(f'{extracted_curie[0]}:{extracted_curie[1]}')
continue
Expand All @@ -595,34 +635,45 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
curie = (wrong_prefix[0], f'{wrong_prefix[1]}/{"/".join(extracted_curie[1:len(extracted_curie)])}')
elif re.fullmatch('^brenda$', extracted_curie[0], re.IGNORECASE): # Brenda & EC code is the same
curie = ('eccode', extracted_curie[1])
elif re.fullmatch('^biocyc$', extracted_curie[0], re.IGNORECASE) or ('metacyc.' in extracted_curie[0]): # Check for bio- & metacyc
elif re.fullmatch('^biocyc$', extracted_curie[0], re.IGNORECASE): # Check for biocyc to also add metacyc if possible
# Always add META if BioCyc sub-database prefixes are missing
extracted_curie[1] = extracted_curie[1] if extracted_curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else f'META:{extracted_curie[1]}'
curie = ['biocyc', extracted_curie[1]]

if is_valid_identifier(*curie): # Get all valid identifiers
if 'META' in curie[1]:
if is_valid_identifier(*curie): # Get the valid BioCyc identifier & Add to dictionary
prefix, identifier = normalize_parsed_curie(*curie)

if not curie_dict or (prefix not in curie_dict):
curie_dict[prefix] = SortedSet()
curie_dict[prefix].add(identifier)
else:
invalid_curies.append(f'{curie[0]}:{curie[1]}')

# Add the MetaCyc identifier additionally
curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE):
curie[0] = 'metacyc.reaction'
else:
curie[0] = 'metacyc.compound'
elif 'metacyc.' in extracted_curie[0]:
curie = extracted_curie
if is_valid_identifier(*curie): # Get the valid MetaCyc identifier & Add to dictionary
prefix, identifier = normalize_parsed_curie(*curie)

if not curie_dict or (prefix not in curie_dict):
curie_dict[prefix] = SortedSet()
curie_dict[prefix].add(identifier)

else:
invalid_curies.append(f'{prefix}:{identifier}')

if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE):
curie[0] = 'metacyc.reaction'
else:
curie[0] = 'metacyc.compound'
invalid_curies.append(f'{curie[0]}:{curie[1]}')

# Add BioCyc identifier additionally
curie = ['biocyc', f'META:{curie[1]}'] # Metacyc identifier comes after 'META:' in biocyc identifier
elif re.fullmatch('^chebi$', extracted_curie[0], re.IGNORECASE):
new_curie = extracted_curie[1].split(':')

curie = (new_curie[0].lower(), new_curie[1])

# Checks for old pattern of SBO term URIs ('MIRIAM/sbo/SBO:identifier')
elif re.search('^sbo:', extracted_curie[1], re.IGNORECASE):
prefix = extracted_curie[0]
identifier = extracted_curie[1].split(':')[1]

elif re.search('^sbo:', extracted_curie[1], re.IGNORECASE): # Checks for old pattern of SBO term URIs ('MIRIAM/sbo/SBO:identifier')
curie = [extracted_curie[0], extracted_curie[1].split(':')[1]]
else:
if re.fullmatch('^brenda$', extracted_curie[0], re.IGNORECASE) or re.fullmatch('^ec-code$', extracted_curie[0], re.IGNORECASE): # Brenda equals EC code, EC code in URI = ec-code
curie[0] = 'eccode'
Expand All @@ -635,27 +686,45 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
extracted_curie = extracted_curie.split(':')

# Check for NaN identifiers
if re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE) or re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE):
if re.fullmatch('^nan$', extracted_curie[0], re.IGNORECASE) or re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE):
# Only return strings where the database prefix is 'NaN' but a possible identifier could be contained
if re.fullmatch('^nan$', extracted_curie[0], re.IGNORECASE) and not re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE):
invalid_curies.append(f'{extracted_curie[0]}:{extracted_curie[1]}')
continue
elif re.fullmatch('^biocyc$', extracted_curie[0], re.IGNORECASE): # Check for biocyc to also add metacyc if possible
# Always add META if BioCyc sub-database prefixes are missing
extracted_curie[1] = extracted_curie[1] if extracted_curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else f'META:{extracted_curie[1]}'
curie = ['biocyc', extracted_curie[1]]

if re.fullmatch('^biocyc$', extracted_curie[0], re.IGNORECASE) or ('metacyc.' in extracted_curie[0]): # Check for bio- & metacyc
curie = ['biocyc', extracted_curie[-1]]

if is_valid_identifier(*curie): # Get all valid identifiers
if 'META' in curie[1]:
if is_valid_identifier(*curie): # Get the valid BioCyc identifier & Add to dictionary
prefix, identifier = normalize_parsed_curie(*curie)

if not curie_dict or (prefix not in curie_dict):
curie_dict[prefix] = SortedSet()
curie_dict[prefix].add(identifier)
else:
invalid_curies.append(f'{curie[0]}:{curie[1]}')

# Add MetaCyc identifier additionally
curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE):
curie[0] = 'metacyc.reaction'
else:
curie[0] = 'metacyc.compound'
elif 'metacyc.' in extracted_curie[0]:
curie = extracted_curie
if is_valid_identifier(*curie): # Get the valid MetaCyc identifier & Add to dictionary
prefix, identifier = normalize_parsed_curie(*curie)

if not curie_dict or (prefix not in curie_dict):
curie_dict[prefix] = SortedSet()
curie_dict[prefix].add(identifier)

else:
invalid_curies.append(f'{prefix}:{identifier}')

if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE):
curie[0] = 'metacyc.reaction'
else:
curie[0] = 'metacyc.compound'
invalid_curies.append(f'{curie[0]}:{curie[1]}')

# Add BioCyc identifier additionally
curie = ['biocyc', f'META:{curie[1]}'] # Metacyc identifier comes after 'META:' in biocyc identifier
else:
if re.fullmatch('^brenda$', extracted_curie[0], re.IGNORECASE) or re.fullmatch('^ec-code$', extracted_curie[0], re.IGNORECASE): # Brenda equals EC code, EC code in URI = ec-code
curie[0] = 'eccode'
Expand Down Expand Up @@ -906,7 +975,7 @@ def polish_annotations(model: libModel, bioregistry: bool, new_pattern: bool, fi
f'These invalid CURIEs are saved to {curies_filename}')
invalid_curies_df = parse_dict_to_dataframe(all_entity2invalid_curies)
invalid_curies_df.columns = ['entity', 'invalid_curie']
invalid_curies_df[['prefix', 'identifier']] = invalid_curies_df.invalid_curie.str.split(':', expand = True)
invalid_curies_df[['prefix', 'identifier']] = invalid_curies_df.invalid_curie.str.split(':', n=1, expand = True) # Required for identifiers that also contain a ':'
invalid_curies_df = invalid_curies_df.drop('invalid_curie', axis=1)
invalid_curies_df.to_csv(curies_filename, sep='\t')

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
readme = readme_file.read()

setup(name='refineGEMs',
version='1.2.1',
version='1.2.2',
description='refineGEMs: a python package intended to help with the curation of genome-scale metabolic models (GEMS)',
long_description=readme,
long_description_content_type='text/markdown',
Expand Down

0 comments on commit 23e307f

Please sign in to comment.