Skip to content

Commit

Permalink
Consider X as a valid amino acid as per DeepGO-SE
Browse files Browse the repository at this point in the history
  • Loading branch information
aditya0by0 committed Nov 6, 2024
1 parent 33436e8 commit c6d60cd
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 17 deletions.
27 changes: 19 additions & 8 deletions chebai/preprocessing/datasets/go_uniprot.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
# Reference for this file :
# References for this file :
# Reference 1:
# Maxat Kulmanov, Mohammed Asif Khan, Robert Hoehndorf;
# DeepGO: Predicting protein functions from sequence and interactions
# using a deep ontology-aware classifier, Bioinformatics, 2017.
# https://doi.org/10.1093/bioinformatics/btx624
# Github: https://github.com/bio-ontology-research-group/deepgo

# Reference 2:
# https://www.ebi.ac.uk/GOA/downloads
# https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt
# https://www.uniprot.org/uniprotkb

# Reference 3:
# Kulmanov, M., Guzmán-Vega, F.J., Duek Roggli,
# P. et al. Protein function prediction as approximate semantic entailment. Nat Mach Intell 6, 220–228 (2024).
# https://doi.org/10.1038/s42256-024-00795-w
# https://github.com/bio-ontology-research-group/deepgo2

__all__ = [
"GOUniProtOver250",
"GOUniProtOver50",
Expand All @@ -34,6 +43,7 @@
from chebai.preprocessing import reader as dr
from chebai.preprocessing.datasets.base import _DynamicDataset

# https://github.com/bio-ontology-research-group/deepgo/blob/master/utils.py#L15
EXPERIMENTAL_EVIDENCE_CODES = {
"EXP",
"IDA",
Expand All @@ -43,6 +53,8 @@
"IEP",
"TAS",
"IC",
# New evidence codes added in the latest paper (2024); see Reference 3 at the top of this file
# https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/utils.py#L24-L26
"HTP",
"HDA",
"HMP",
Expand All @@ -51,7 +63,9 @@
}

# https://github.com/bio-ontology-research-group/deepgo/blob/d97447a05c108127fee97982fd2c57929b2cf7eb/aaindex.py#L8
AMBIGUOUS_AMINO_ACIDS = {"B", "O", "J", "U", "X", "Z", "*"}
# https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L10
# `X` is now considered a valid amino acid, as per the latest paper (2024); see Reference 3 at the top of this file
AMBIGUOUS_AMINO_ACIDS = {"B", "O", "J", "U", "Z", "*"}


class _GOUniProtDataExtractor(_DynamicDataset, ABC):
Expand Down Expand Up @@ -416,12 +430,9 @@ def _get_swiss_to_go_mapping(self) -> pd.DataFrame:
Note:
This mapping is necessary because the GO data does not include the protein sequence representation.
Quote from the DeepGo Paper:
`We select proteins with annotations having experimental evidence codes
`EXPERIMENTAL_EVIDENCE_CODES` and filter the proteins by a
maximum length of 1002, ignoring proteins with ambiguous amino acid codes
(B, O, J, U, X, Z) in their sequence.`
We select proteins with annotations having experimental evidence codes, as specified in
`EXPERIMENTAL_EVIDENCE_CODES`, and filter the proteins by a maximum length of 1002, ignoring proteins with
ambiguous amino acid codes, as specified in `AMBIGUOUS_AMINO_ACIDS`, in their sequence.
Check the link below for keyword details:
https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt
Expand Down
4 changes: 2 additions & 2 deletions chebai/preprocessing/datasets/protein_pretraining.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,15 +96,15 @@ def _download_required_data(self) -> str:
def _parse_protein_data_for_pretraining(self) -> pd.DataFrame:
"""
Parses the Swiss-Prot data and returns a DataFrame containing Swiss-Prot proteins which do not have any valid
Gene Ontology(GO) label. A valid GO label is the one which has one of the following evidence code defined in
Gene Ontology(GO) label. A valid GO label is the one which has one of the following evidence codes, as specified in
`EXPERIMENTAL_EVIDENCE_CODES`.
The DataFrame includes the following columns:
- "swiss_id": The unique identifier for each Swiss-Prot record.
- "sequence": The protein sequence.
Note:
We ignore proteins with ambiguous amino acid codes (B, O, J, U, X, Z) in their sequence.`
We ignore proteins with ambiguous amino acids, as specified in `AMBIGUOUS_AMINO_ACIDS`, in their sequence.
Returns:
pd.DataFrame: A DataFrame where each row corresponds to a Swiss-Prot record with no associated valid GO label.
Expand Down
4 changes: 3 additions & 1 deletion chebai/preprocessing/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ class ProteinDataReader(DataReader):

COLLATOR = RaggedCollator

# 20 natural amino acid notation
# 20 natural amino acid notations plus `X` (accepted as a valid symbol)
AA_LETTER = [
"A",
"R",
Expand All @@ -370,6 +370,8 @@ class ProteinDataReader(DataReader):
"W",
"Y",
"V",
# https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L3-L5
"X", # Consider valid in latest paper year 2024 Reference number 3 in go_uniprot.py
]

def name(self) -> str:
Expand Down
13 changes: 7 additions & 6 deletions tests/unit/mock_data/ontology_mock_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -658,18 +658,19 @@ def get_UniProt_raw_data() -> str:
- **Swiss_Prot_1**: A valid protein with three valid GO classes and one invalid GO class.
- **Swiss_Prot_2**: Another valid protein with two valid GO classes and one invalid.
- **Swiss_Prot_3**: Contains valid GO classes but has a sequence length > 1002.
- **Swiss_Prot_4**: Has valid GO classes but contains an invalid amino acid, 'X'.
- **Swiss_Prot_4**: Has valid GO classes but contains an invalid amino acid, 'B'.
- **Swiss_Prot_5**: Has a sequence but no GO classes associated.
- **Swiss_Prot_6**: Has GO classes without any associated evidence codes.
- **Swiss_Prot_7**: Has a GO class with an invalid evidence code.
- **Swiss_Prot_8**: Has a sequence length > 1002 and has only invalid GO class.
- **Swiss_Prot_9**: Has no GO classes but contains an invalid amino acid, 'X', in its sequence.
- **Swiss_Prot_9**: Has no GO classes but contains an invalid amino acid, 'B', in its sequence.
- **Swiss_Prot_10**: Has a valid GO class but lacks a sequence.
- **Swiss_Prot_11**: Has only Invalid GO class but lacks a sequence.
Note:
A valid GO label is the one which has one of the following evidence code defined in
`EXPERIMENTAL_EVIDENCE_CODES`.
A valid GO label is one which has one of the evidence codes specified in
go_uniprot.py->`EXPERIMENTAL_EVIDENCE_CODES`.
Invalid amino acids are specified in go_uniprot.py->`AMBIGUOUS_AMINO_ACIDS`.
Returns:
str: The raw UniProt data in string format.
Expand Down Expand Up @@ -715,7 +716,7 @@ def get_UniProt_raw_data() -> str:
"DR GO; GO:0000005; P:regulation of viral transcription; IEA:InterPro.\n"
"DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n"
"SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n"
" XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
" BAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
"//\n"
# Below protein with sequence string but has no GO class
"ID Swiss_Prot_5 Reviewed; 60 AA.\n"
Expand Down Expand Up @@ -749,7 +750,7 @@ def get_UniProt_raw_data() -> str:
"ID Swiss_Prot_9 Reviewed; 60 AA.\n"
"AC Q6GZX4;\n"
"SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n"
" XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
" BAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
"//\n"
# Below protein with a `valid` associated GO class but without sequence string
"ID Swiss_Prot_10 Reviewed; 60 AA.\n"
Expand Down

0 comments on commit c6d60cd

Please sign in to comment.