Merge pull request #32 from WangHong007/master

Get protein accessions from multiple databases
bigbio · Nov 20, 2023 · 5b3e0d5 · 5b3e0d5
2 parents 10249dc + 736b6d1
commit 5b3e0d5
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 13 deletions.
diff --git a/bin/compute_ibaq.py b/bin/compute_ibaq.py
@@ -43,13 +43,17 @@ def normalize_ibaq(res: DataFrame) -> DataFrame:
     return res
 
 
-def parse_uniprot_accession(identifier: str) -> str:
+def get_accession(identifier: str) -> str:
     """
-    Parse the uniprot accession from the identifier  (e.g. sp|P12345|PROT_NAME)
-    :param identifier: Uniprot identifier
-    :return:
+    Get protein accession from the identifier (e.g. sp|P12345|PROT_NAME -> P12345)
+    :param identifier: Protein identifier
+    :return: Protein accession
     """
-    return identifier.split("|")[1]
+    identifier_lst = identifier.split("|")
+    if len(identifier_lst) == 1:
+        return identifier_lst[0]
+    else:
+        return identifier_lst[1]
 
 
 @click.command()
@@ -144,8 +148,8 @@ def get_average_nr_peptides_unique_bygroup(pdrow: Series) -> Series:
         digest = list()  # type: list[str]
         digestor.digest(AASequence().fromString(entry.sequence), digest, min_aa, max_aa)
         digestuniq = set(digest)
-        # TODO: We keep uniprot accessions rather than names.
-        protein_name = parse_uniprot_accession(entry.identifier)
+        # TODO: Try to get protein accessions from multiple databases.
+        protein_name = get_accession(entry.identifier)
         uniquepepcounts[protein_name] = len(digestuniq)
         protein_accessions.append(protein_name)
 

diff --git a/ibaq/ibaqpy_commons.py b/ibaq/ibaqpy_commons.py
@@ -341,9 +341,7 @@ def get_spectrum_prefix(reference_spectrum: str) -> str:
     return re.split(r"\.mzML|\.MZML|\.raw|\.RAW|\.d|\.wiff", reference_spectrum)[0]
 
 
-"""
-Common functions when normalizing peptide dataframe
-"""
+# Common functions when normalizing peptide dataframe
 def get_peptidoform_normalize_intensities(
     dataset: DataFrame, higher_intensity: bool = True
 ) -> DataFrame:
@@ -436,9 +434,7 @@ def best_probability_error_bestsearch_engine(probability: float) -> float:
     return 1 - probability
 
 
-"""
-Functions needed by Combiner
-"""
+# Functions needed by Combiner
 def load_sdrf(sdrf_path: str) -> DataFrame:
     """
     Load sdrf TSV as a dataframe.