Merge hot fix for polish into main (#96)

* Changed PyPI version badge: starting with the next release, the PyPI version badge comes from 'shields.io' instead of 'badge.fury.io'. * Changed colour of the refineGEMs version badge * Adjusted handling of BioCyc identifiers in polish_annotations #95 #58 * Added requirement importlib_resources==5.13.0 to Pipfile * Added code to cope with missing sub-database prefixes for BioCyc identifiers #95 * Changed NaN identifier handling #95 * Fixed issue III: None prefix identifier pairs in invalid_curies.tsv #95 * Adjusted files with version for release 1.2.2
draeger-lab · Aug 16, 2023 · 23e307f · 23e307f
1 parent c80e5f0
commit 23e307f
Show file tree

Hide file tree

Showing 6 changed files with 107 additions and 37 deletions.
diff --git a/Pipfile b/Pipfile
@@ -8,6 +8,7 @@ cobra = "==0.22.0"
 biopython = "==1.79"
 bioregistry = "==0.10.1"
 bioservices = "==1.7.11"
+importlib_resources = "==5.13.0"
 memote = "==0.13.0"
 pandas = "==1.2.4"
 numpy = "==1.20.3"

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/README.md b/README.md
@@ -1,10 +1,10 @@
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-![GitHub Pipenv locked dependency version](https://img.shields.io/github/pipenv/locked/dependency-version/draeger-lab/refinegems/refinegems)
+![GitHub Pipenv locked dependency version](https://img.shields.io/github/pipenv/locked/dependency-version/draeger-lab/refinegems/refinegems?label=refineGEMs&color=B4A069)
 ![GitHub Pipenv locked Python version](https://img.shields.io/github/pipenv/locked/python-version/draeger-lab/refinegems)
 [![Documentation Status](https://readthedocs.org/projects/refinegems/badge/?version=latest)](https://refinegems.readthedocs.io/en/latest/?badge=latest)
 ![GitHub last commit (branch)](https://img.shields.io/github/last-commit/draeger-lab/refinegems/main)
 ![Repo Size](https://img.shields.io/github/repo-size/draeger-lab/refinegems)
-[![PyPI version](https://badge.fury.io/py/refineGEMs.svg)](https://badge.fury.io/py/refineGEMs)
+![PyPI version](https://img.shields.io/pypi/v/refinegems?label=PyPI%20package&color=neongreen)
 ![PyPI - Format](https://img.shields.io/pypi/format/refinegems)
 [![PyPI downloads](https://img.shields.io/pypi/dm/refinegems.svg)](https://pypistats.org/packages/refinegems)
 [![DOI](https://zenodo.org/badge/359867657.svg)](https://zenodo.org/badge/latestdoi/359867657)

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -23,7 +23,7 @@
 author = 'Famke Bäuerle and Gwendolyn O. Gusak'
 
 # The full version, including alpha/beta/rc tags
-release = '1.2.1'
+release = '1.2.2'
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/refinegems/polish.py b/refinegems/polish.py
@@ -19,6 +19,10 @@
 from datetime import date
 
 __author__ = "Famke Baeuerle and Gwendolyn O. Gusak"
+
+
+#------------------------------------------------ Constant variables --------------------------------------------------#
+BIOCYC_TIER1_DATABASES_PREFIXES = ['META', 'ECO', 'ECOLI', 'HUMAN']
 
 
 #----------- Functions to add URIs from the entity IDs to the annotation field for metabolites & reactions ------------#       
@@ -547,7 +551,7 @@ def cv_ncbiprotein(gene_list, email, protein_fasta: str, lab_strain: bool=False)
         logging.warning(f'The following {len(genes_missing_annotation)} genes have no annotation, name & label (locus tag): {genes_missing_annotation}')
 
 
-#------------------- Functions to change the CURIE pattern/CVTerm qualifier & qualifier type --------------------------# 
+#------------------- Functions to change the CURIE pattern/CVTerm qualifier & qualifier type --------------------------#
 def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[str]], list[str]]:
     """| Gets a list of URIs
        | & maps the database prefixes to their respective identifier sets
@@ -575,14 +579,50 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
         curie = manager.parse_curie(extracted_curie) # Contains valid db prefix to identifiers pairs
         curie = list(curie) # Turn tuple into list to allow item assignment
 
-        if not curie[0]: # Need to do own parsing if prefix is not valid
+        if curie[0]: # Prefix is valid but to have same result for same databases need to do a bit of own parsing
+            if re.fullmatch('^biocyc$', curie[0], re.IGNORECASE):  # Check for biocyc to also add metacyc if possible
+                # Always add META if BioCyc sub-database prefixes are missing
+                curie = curie if curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else [curie[0], f'META:{curie[1]}']
+
+                if 'META' in curie[1]: 
+                    if is_valid_identifier(*curie): # Get the valid BioCyc identifier & Add to dictionary
+                        prefix, identifier = normalize_parsed_curie(*curie)
+
+                        if not curie_dict or (prefix not in curie_dict):
+                            curie_dict[prefix] = SortedSet()
+                        curie_dict[prefix].add(identifier)
+                    else:
+                        invalid_curies.append(f'{curie[0]}:{curie[1]}')
 
+                    # Add the MetaCyc identifier additionally
+                    curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
+                    if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE):
+                        curie[0] = 'metacyc.reaction'
+                    else:
+                        curie[0] = 'metacyc.compound'
+            elif 'metacyc.' in curie[0]:
+                if is_valid_identifier(*curie): # Get the valid MetaCyc identifier & Add to dictionary
+                    prefix, identifier = normalize_parsed_curie(*curie)
+
+                    if not curie_dict or (prefix not in curie_dict):
+                        curie_dict[prefix] = SortedSet()
+                    curie_dict[prefix].add(identifier)
+                else:
+                    invalid_curies.append(f'{curie[0]}:{curie[1]}')
+
+                # Add the BioCyc identifier additionally
+                curie = ['biocyc', f'META:{curie[1]}'] # Metacyc identifier comes after 'META:' in biocyc identifier
+            elif re.fullmatch('^brenda$', curie[0], re.IGNORECASE): # Brenda & EC code is the same
+                curie[0] = 'eccode'
+
+        elif not curie[0]: # Need to do own parsing if prefix is not valid
             # Get CURIEs irrespective of pattern
             if '/' in extracted_curie:
                 extracted_curie = extracted_curie.split('/')
 
                 # Check for NaN identifiers
                 if re.fullmatch('^nan$', extracted_curie[0], re.IGNORECASE) or re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE):
+                    # Only return strings where the database prefix is 'NaN' but a possible identifier could be contained
                     if re.fullmatch('^nan$', extracted_curie[0], re.IGNORECASE) and not re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE): 
                         invalid_curies.append(f'{extracted_curie[0]}:{extracted_curie[1]}')
                     continue
@@ -595,34 +635,45 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
                         curie = (wrong_prefix[0], f'{wrong_prefix[1]}/{"/".join(extracted_curie[1:len(extracted_curie)])}')
                 elif re.fullmatch('^brenda$', extracted_curie[0], re.IGNORECASE): # Brenda & EC code is the same
                     curie = ('eccode', extracted_curie[1])
-                elif re.fullmatch('^biocyc$', extracted_curie[0], re.IGNORECASE) or ('metacyc.' in extracted_curie[0]):  # Check for bio- & metacyc
+                elif re.fullmatch('^biocyc$', extracted_curie[0], re.IGNORECASE):  # Check for biocyc to also add metacyc if possible
+                    # Always add META if BioCyc sub-database prefixes are missing
+                    extracted_curie[1] = extracted_curie[1] if extracted_curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else f'META:{extracted_curie[1]}'
                     curie = ['biocyc', extracted_curie[1]]
 
-                    if is_valid_identifier(*curie): # Get all valid identifiers
+                    if 'META' in curie[1]: 
+                        if is_valid_identifier(*curie): # Get the valid BioCyc identifier & Add to dictionary
+                            prefix, identifier = normalize_parsed_curie(*curie)
+
+                            if not curie_dict or (prefix not in curie_dict):
+                                curie_dict[prefix] = SortedSet()
+                            curie_dict[prefix].add(identifier)
+                        else:
+                            invalid_curies.append(f'{curie[0]}:{curie[1]}')
+
+                        # Add the MetaCyc identifier additionally
+                        curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
+                        if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE):
+                            curie[0] = 'metacyc.reaction'
+                        else:
+                            curie[0] = 'metacyc.compound'
+                elif 'metacyc.' in extracted_curie[0]:
+                    curie = extracted_curie
+                    if is_valid_identifier(*curie): # Get the valid MetaCyc identifier & Add to dictionary
                         prefix, identifier = normalize_parsed_curie(*curie)
-                        
+
                         if not curie_dict or (prefix not in curie_dict):
                             curie_dict[prefix] = SortedSet()
                         curie_dict[prefix].add(identifier)
-
                     else:
-                        invalid_curies.append(f'{prefix}:{identifier}')
-
-                    if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE):
-                        curie[0] = 'metacyc.reaction'
-                    else:
-                        curie[0] = 'metacyc.compound'
+                        invalid_curies.append(f'{curie[0]}:{curie[1]}')
 
+                    # Add BioCyc identifier additionally
+                    curie = ['biocyc', f'META:{curie[1]}'] # Metacyc identifier comes after 'META:' in biocyc identifier
                 elif re.fullmatch('^chebi$', extracted_curie[0], re.IGNORECASE):
                     new_curie = extracted_curie[1].split(':')
-
                     curie = (new_curie[0].lower(), new_curie[1])
-
-                # Checks for old pattern of SBO term URIs ('MIRIAM/sbo/SBO:identifier')
-                elif re.search('^sbo:', extracted_curie[1], re.IGNORECASE):
-                    prefix = extracted_curie[0]
-                    identifier = extracted_curie[1].split(':')[1]
-
+                elif re.search('^sbo:', extracted_curie[1], re.IGNORECASE): # Checks for old pattern of SBO term URIs ('MIRIAM/sbo/SBO:identifier')
+                    curie = [extracted_curie[0], extracted_curie[1].split(':')[1]]
                 else:
                     if re.fullmatch('^brenda$', extracted_curie[0], re.IGNORECASE) or re.fullmatch('^ec-code$', extracted_curie[0], re.IGNORECASE): # Brenda equals EC code, EC code in URI = ec-code
                         curie[0] = 'eccode'
@@ -635,27 +686,45 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
                 extracted_curie = extracted_curie.split(':')
 
                 # Check for NaN identifiers
-                if re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE) or re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE):
+                if re.fullmatch('^nan$', extracted_curie[0], re.IGNORECASE) or re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE):
+                    # Only return strings where the database prefix is 'NaN' but a possible identifier could be contained
+                    if re.fullmatch('^nan$', extracted_curie[0], re.IGNORECASE) and not re.fullmatch('^nan$', extracted_curie[1], re.IGNORECASE): 
+                        invalid_curies.append(f'{extracted_curie[0]}:{extracted_curie[1]}')
                     continue
+                elif re.fullmatch('^biocyc$', extracted_curie[0], re.IGNORECASE):  # Check for biocyc to also add metacyc if possible
+                    # Always add META if BioCyc sub-database prefixes are missing
+                    extracted_curie[1] = extracted_curie[1] if extracted_curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else f'META:{extracted_curie[1]}'
+                    curie = ['biocyc', extracted_curie[1]]
 
-                if re.fullmatch('^biocyc$', extracted_curie[0], re.IGNORECASE) or ('metacyc.' in extracted_curie[0]):  # Check for bio- & metacyc
-                    curie = ['biocyc', extracted_curie[-1]]
-
-                    if is_valid_identifier(*curie): # Get all valid identifiers
+                    if 'META' in curie[1]:
+                        if is_valid_identifier(*curie): # Get the valid BioCyc identifier & Add to dictionary
+                            prefix, identifier = normalize_parsed_curie(*curie)
+
+                            if not curie_dict or (prefix not in curie_dict):
+                                curie_dict[prefix] = SortedSet()
+                            curie_dict[prefix].add(identifier)
+                        else:
+                            invalid_curies.append(f'{curie[0]}:{curie[1]}')
+
+                        # Add MetaCyc identifier additionally
+                        curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
+                        if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE):
+                            curie[0] = 'metacyc.reaction'
+                        else:
+                            curie[0] = 'metacyc.compound'
+                elif 'metacyc.' in extracted_curie[0]:
+                    curie = extracted_curie
+                    if is_valid_identifier(*curie): # Get the valid MetaCyc identifier & Add to dictionary
                         prefix, identifier = normalize_parsed_curie(*curie)
 
                         if not curie_dict or (prefix not in curie_dict):
                             curie_dict[prefix] = SortedSet()
                         curie_dict[prefix].add(identifier)
-
                     else:
-                        invalid_curies.append(f'{prefix}:{identifier}')
-
-                    if re.search('^rxn-|-rxn$', curie[1], re.IGNORECASE):
-                        curie[0] = 'metacyc.reaction'
-                    else:
-                        curie[0] = 'metacyc.compound'
+                        invalid_curies.append(f'{curie[0]}:{curie[1]}')
 
+                    # Add BioCyc identifier additionally
+                    curie = ['biocyc', f'META:{curie[1]}'] # Metacyc identifier comes after 'META:' in biocyc identifier
                 else:
                     if re.fullmatch('^brenda$', extracted_curie[0], re.IGNORECASE) or re.fullmatch('^ec-code$', extracted_curie[0], re.IGNORECASE): # Brenda equals EC code, EC code in URI = ec-code
                         curie[0] = 'eccode'
@@ -906,7 +975,7 @@ def polish_annotations(model: libModel, bioregistry: bool, new_pattern: bool, fi
                      f'These invalid CURIEs are saved to {curies_filename}')      
         invalid_curies_df = parse_dict_to_dataframe(all_entity2invalid_curies)
         invalid_curies_df.columns = ['entity', 'invalid_curie']
-        invalid_curies_df[['prefix', 'identifier']] = invalid_curies_df.invalid_curie.str.split(':', expand = True)
+        invalid_curies_df[['prefix', 'identifier']] = invalid_curies_df.invalid_curie.str.split(':', n=1, expand = True) # Required for identifiers that aso contain a ':'
         invalid_curies_df = invalid_curies_df.drop('invalid_curie', axis=1)
         invalid_curies_df.to_csv(curies_filename, sep='\t')
 

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
     readme = readme_file.read()
 
 setup(name='refineGEMs',
-      version='1.2.1',
+      version='1.2.2',
       description='refineGEMs: a python package intended to help with the curation of genome-scale metabolic models (GEMS)',
       long_description=readme,
       long_description_content_type='text/markdown',