diff --git a/docs/nbs/tutorial_HLA_prediction.rst b/docs/nbs/tutorial_HLA_prediction.rst deleted file mode 100644 index 544073bf..00000000 --- a/docs/nbs/tutorial_HLA_prediction.rst +++ /dev/null @@ -1,5 +0,0 @@ -Tutorial: HLA prediction -========================== - -Check `HLA1_Classifier.ipynb `_ -in `PeptDeep-HLA `_ repo. diff --git a/docs/notebooks.rst b/docs/notebooks.rst index 1ba96c8a..701040d6 100644 --- a/docs/notebooks.rst +++ b/docs/notebooks.rst @@ -6,10 +6,10 @@ Tutorials and notebooks about how to use AlphaPeptDeep .. toctree:: :maxdepth: 1 + tutorials/tutorial_immunopeptidomics nbs/tutorial_models_from_scratch nbs/tutorial_speclib_from_fasta nbs/alphapeptdeep_hdf_to_tsv - nbs/tutorial_HLA_prediction nbs/tutorial_model_manager nbs/tutorial_building_rt_model nbs/tutorial_building_ccs_model diff --git a/docs/tutorials/example.fasta b/docs/tutorials/example.fasta new file mode 100644 index 00000000..5619e28a --- /dev/null +++ b/docs/tutorials/example.fasta @@ -0,0 +1,9 @@ +>tr|A0A024R161|A0A024R161_HUMAN Guanine nucleotide-binding protein subunit gamma OS=Homo sapiens GN=DNAJC25-GNG10 PE=3 SV=1 +MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCGTRDCYEVLGVSRSA +GKAEIARAYRQLARRYHPDRYRPQPGDEGPGRTPQSAEEAFLLVATAYETLKVSQAAAEL +QQYCMQNACKDALLVGVPAGSNPFREPRSCALL +>tr|A0A024RAP8|A0A024RAP8_HUMAN HCG2009644, isoform CRA_b OS=Homo sapiens GN=KLRC4-KLRK1 PE=4 SV=1 +MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKCRENASPFFFCCFIA +VAMGIRFIIMVTIWSAVFLNSLFNQEVQIPLTESYCGPCPKNWICYKNNCYQFFDESKNW +YESQASCMSQNASLLKVYSKEDQDLLKLVKSYHWMGLVHIPTNGSWQWEDGSILSPNLLT +IIEMQKGDCALYASSFKGYIENCSTPNTYICMQRTV diff --git a/docs/tutorials/tutorial_immunopeptidomics.ipynb b/docs/tutorials/tutorial_immunopeptidomics.ipynb new file mode 100644 index 00000000..eb536a8a --- /dev/null +++ b/docs/tutorials/tutorial_immunopeptidomics.ipynb @@ -0,0 +1,3758 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using peptdeep for MHC class I immunopeptidomics" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook introduces how to generate spectral libraries for immunopeptidomics analysis from a list of protein sequences. This entails several steps:\n", + "\n", + "1. unspecific digestion of protein sequences\n", + "2. selection of peptide sequences used for library prediction by peptdeep-hla prediction\n", + " 2.1 using the pretrained model\n", + " 2.2 using an improved model by including a transfer learning step\n", + "3. spectral library prediction\n", + "4. matching the peptides back to the proteins (this can be done before or after library prediction or search) \n", + "\n", + "\n", + "\n", + "Note that the pydivsufsort package is not installed by peptdeep by default. Install it with:\n", + "```\n", + "pip install \"peptdeep[development,hla]\"\n", + "```\n", + "\n", + "Or install within jupyter notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -q pydivsufsort" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Unspecific digestion in alphabase" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The unspecific digestion workflow uses the longest common prefix (LCP) algorithm, which is based on the suffix array data structure and has been proven to be very efficient for unspecific digestion [https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-577]. 
Here we use `pydivsufsort`, a Python wrapper of the high-performance C library libdivsufsort [https://github.com/y-256/libdivsufsort], to facilitate LCP-based digestion.\n", + "\n", + "This means that the digestion is performed on a single sequence of strings and retrieves both the peptide sequence as well as the start and stop indices of the peptide within the complete sequence. Therefore, unspecific digestion in alphabase involves two steps:\n", + "\n", + "1. concatenation of protein sequences into a single sequence\n", + "2. unspecific digestion\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.1 Concatenate protein sequences into a single sequence\n", + "\n", + "The protein sequences are concatenated into a single sequence. The sequences are separated by a sentinel character, in this case '$', so that no peptides spanning two proteins are formed. Note that the first and last sentinel characters are crucial as well.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
protein_idfull_namegene_namegene_orgdescriptionsequencenAA
tr|A0A024R161|A0A024R161_HUMANA0A024R161tr|A0A024R161|A0A024R161_HUMANDNAJC25-GNG10A0A024R161_HUMANtr|A0A024R161|A0A024R161_HUMAN Guanine nucleot...MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG...153
tr|A0A024RAP8|A0A024RAP8_HUMANA0A024RAP8tr|A0A024RAP8|A0A024RAP8_HUMANKLRC4-KLRK1A0A024RAP8_HUMANtr|A0A024RAP8|A0A024RAP8_HUMAN HCG2009644, iso...MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKC...216
\n", + "
" + ], + "text/plain": [ + " protein_id full_name \\\n", + "tr|A0A024R161|A0A024R161_HUMAN A0A024R161 tr|A0A024R161|A0A024R161_HUMAN \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN A0A024RAP8 tr|A0A024RAP8|A0A024RAP8_HUMAN \n", + "\n", + " gene_name gene_org \\\n", + "tr|A0A024R161|A0A024R161_HUMAN DNAJC25-GNG10 A0A024R161_HUMAN \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN KLRC4-KLRK1 A0A024RAP8_HUMAN \n", + "\n", + " description \\\n", + "tr|A0A024R161|A0A024R161_HUMAN tr|A0A024R161|A0A024R161_HUMAN Guanine nucleot... \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN tr|A0A024RAP8|A0A024RAP8_HUMAN HCG2009644, iso... \n", + "\n", + " sequence \\\n", + "tr|A0A024R161|A0A024R161_HUMAN MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG... \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKC... \n", + "\n", + " nAA \n", + "tr|A0A024R161|A0A024R161_HUMAN 153 \n", + "tr|A0A024RAP8|A0A024RAP8_HUMAN 216 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from peptdeep.hla.hla_utils import load_prot_df\n", + "fasta_path = \"example.fasta\"\n", + "protein_df = load_prot_df(fasta_path)\n", + "protein_df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'$MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCGTRDCYEVLGVSRSAGKAEIARAYRQLARRYHPDRYRPQPGDEGPGRTPQSAEEAFLLVATAYETLKVSQAAAELQQYCMQNACKDALLVGVPAGSNPFREPRSCALL$MGWIRGRRSRHSWEMSEFHNYNLDLKKSDFSTRWQKQRCPVVKSKCRENASPFFFCCFIAVAMGIRFIIMVTIWSAVFLNSLFNQEVQIPLTESYCGPCPKNWICYKNNCYQFFDESKNWYESQASCMSQNASLLKVYSKEDQDLLKLVKSYHWMGLVHIPTNGSWQWEDGSILSPNLLTIIEMQKGDCALYASSFKGYIENCSTPNTYICMQRTV$'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from peptdeep.hla.hla_utils import cat_proteins\n", + "cat_sequence = cat_proteins(protein_df[\"sequence\"])\n", + "cat_sequence" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"#### 1.2 Unspecific digestion\n", + "\n", + "Use `alphabase.protein.lcp_digest.get_substring_indices` to get all non-redundant non-specific peptide sequences from the concatenated protein sequence. The digested peptide sequences are stored in a dataframe based on their start and stop indices in the concatenated protein sequence string. To save RAM, the `peptdeep.hla` module works on start and stop indices instead of on peptide sequences directly. This reduces RAM usage about 8-fold for HLA-I peptides (length from 8 to 14, demonstrated below). For a large protein sequence database, there will be millions of unspecific peptides, so working with strings is not feasible for a complete human fasta because of the extremely large RAM requirement (~ 70M unspecific sequences from the reviewed swissprot fasta require ~ 4-5 GB RAM already).\n", + "\n", + "Using the get_substring_indices function we extract the start and stop indices of all peptide sequences between 8 and 14 aa (min_len, max_len) from the concatenated protein sequences. All peptide sequences are unique, as guaranteed by the LCP algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_pos
019
1110
2111
3112
4113
.........
2438361370
2439361371
2440362370
2441362371
2442363371
\n", + "

2443 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " start_pos stop_pos\n", + "0 1 9\n", + "1 1 10\n", + "2 1 11\n", + "3 1 12\n", + "4 1 13\n", + "... ... ...\n", + "2438 361 370\n", + "2439 361 371\n", + "2440 362 370\n", + "2441 362 371\n", + "2442 363 371\n", + "\n", + "[2443 rows x 2 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from alphabase.protein.lcp_digest import get_substring_indices\n", + "import pandas as pd\n", + "import sys\n", + "\n", + "start_idxes, stop_idxes = get_substring_indices(\n", + " cat_sequence, min_len=8, max_len=14, stop_char=\"$\"\n", + ")\n", + "digest_pos_df = pd.DataFrame({\n", + " \"start_pos\": start_idxes,\n", + " \"stop_pos\": stop_idxes,\n", + "})\n", + "digest_pos_df" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "RAM_use_idxes = sys.getsizeof(digest_pos_df)*1e-6" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The unspecific peptide sequences can be located by the `start_pos` and `stop_pos`." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_possequence
019MGAPLLSP
1110MGAPLLSPG
2111MGAPLLSPGW
3112MGAPLLSPGWG
4113MGAPLLSPGWGA
............
2438361370NTYICMQRT
2439361371NTYICMQRTV
2440362370TYICMQRT
2441362371TYICMQRTV
2442363371YICMQRTV
\n", + "

2443 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " start_pos stop_pos sequence\n", + "0 1 9 MGAPLLSP\n", + "1 1 10 MGAPLLSPG\n", + "2 1 11 MGAPLLSPGW\n", + "3 1 12 MGAPLLSPGWG\n", + "4 1 13 MGAPLLSPGWGA\n", + "... ... ... ...\n", + "2438 361 370 NTYICMQRT\n", + "2439 361 371 NTYICMQRTV\n", + "2440 362 370 TYICMQRT\n", + "2441 362 371 TYICMQRTV\n", + "2442 363 371 YICMQRTV\n", + "\n", + "[2443 rows x 3 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "digest_pos_df[\"sequence\"] = digest_pos_df[\n", + " [\"start_pos\",\"stop_pos\"]\n", + "].apply(lambda x: cat_sequence[slice(*x)], axis=1)\n", + "digest_pos_df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "RAM_use_seqs = sys.getsizeof(digest_pos_df[\"sequence\"])*1e-6" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'seq RAM = 0.16623 Mb, idxes RAM = 0.01971, ratio = 8.43475'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "f\"seq RAM = {RAM_use_seqs:.5f} Mb, idxes RAM = {RAM_use_idxes:.5f}, ratio = {RAM_use_seqs/RAM_use_idxes:.5f}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Selection of peptide sequences used for library prediction\n", + "The digest_pos_df contains all unspecifically digested peptide sequences between 8 and 14 aa that can be generated from the concatenated protein sequences. This list is reduced using an HLA1_Binding_Classifier from peptdeep.hla.hla_class1. Two different model architectures are available, an LSTM model (HLA_Class_I_LSTM) and a BERT model (HLA_Class_I_BERT). A pretrained model is only available for the LSTM model architecture.\n", + "The HLA1_Binding_Classifier can be used with a pretrained model, tuned with existing peptide data or trained from scratch. 
Training of a new model should be considered carefully and will not be covered in this tutorial.\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Selection of peptide sequence candidates without transfer learning\n", + "\n", + "Selection of peptide sequences for library prediction using the pretrained model can be done in a few steps. First, the Classifier model needs to be initialized and the pretrained model is loaded. Next, we can use any kind of dataframe containing peptide sequences to predict how likely they are to be HLA peptides, the only requirement being that the column containing the peptides is called 'sequence'.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_possequencenAAHLA_prob_pred
019MGAPLLSP80.239477
1145153REPRSCAL80.061692
2146154EPRSCALL80.137313
3155163MGWIRGRR80.056462
4156164GWIRGRRS80.001298
..................
2438112126KVSQAAAELQQYCM140.243115
2439317331NGSWQWEDGSILSP140.021114
24407993DRYRPQPGDEGPGR140.060634
2441113127VSQAAAELQQYCMQ140.355900
2442190204KQRCPVVKSKCREN140.000362
\n", + "

2443 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " start_pos stop_pos sequence nAA HLA_prob_pred\n", + "0 1 9 MGAPLLSP 8 0.239477\n", + "1 145 153 REPRSCAL 8 0.061692\n", + "2 146 154 EPRSCALL 8 0.137313\n", + "3 155 163 MGWIRGRR 8 0.056462\n", + "4 156 164 GWIRGRRS 8 0.001298\n", + "... ... ... ... ... ...\n", + "2438 112 126 KVSQAAAELQQYCM 14 0.243115\n", + "2439 317 331 NGSWQWEDGSILSP 14 0.021114\n", + "2440 79 93 DRYRPQPGDEGPGR 14 0.060634\n", + "2441 113 127 VSQAAAELQQYCMQ 14 0.355900\n", + "2442 190 204 KQRCPVVKSKCREN 14 0.000362\n", + "\n", + "[2443 rows x 5 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from peptdeep.hla.hla_class1 import HLA1_Binding_Classifier\n", + "\n", + "model = HLA1_Binding_Classifier()\n", + "model.load_pretrained_hla_model()\n", + "manual_prediction = model.predict(digest_pos_df)\n", + "manual_prediction\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we can filter the list based on the HLA_prob_pred. The higher the probability, the more likely it is for the peptide sequence to be present in an immunopeptidomics sample. It is not recommended to use a cut-off below 0.7 as this inflates the spectral library. Instead, more conservative (higher) cut-offs are recommended. " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_possequencenAAHLA_prob_pred
17168176EMSEFHNY80.793702
24130138KDALLVGV80.817415
31137145VPAGSNPF80.751329
37170178SEFHNYNL80.940019
67181189KSDFSTRW80.895964
..................
231895109QSAEEAFLLVATAY140.969541
2378329343SPNLLTIIEMQKGD140.756001
2382519LLSPGWGAGAAGRR140.733784
2408110124TLKVSQAAAELQQY140.891976
2419620LSPGWGAGAAGRRW140.842583
\n", + "

148 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " start_pos stop_pos sequence nAA HLA_prob_pred\n", + "17 168 176 EMSEFHNY 8 0.793702\n", + "24 130 138 KDALLVGV 8 0.817415\n", + "31 137 145 VPAGSNPF 8 0.751329\n", + "37 170 178 SEFHNYNL 8 0.940019\n", + "67 181 189 KSDFSTRW 8 0.895964\n", + "... ... ... ... ... ...\n", + "2318 95 109 QSAEEAFLLVATAY 14 0.969541\n", + "2378 329 343 SPNLLTIIEMQKGD 14 0.756001\n", + "2382 5 19 LLSPGWGAGAAGRR 14 0.733784\n", + "2408 110 124 TLKVSQAAAELQQY 14 0.891976\n", + "2419 6 20 LSPGWGAGAAGRRW 14 0.842583\n", + "\n", + "[148 rows x 5 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "manual_prediction[manual_prediction['HLA_prob_pred'] > 0.7]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As described above, directly using the sequences for classification can be memory-intensive for large lists of sequences. Therefore, the manual concatenation, unspecific digestion, prediction and filtering are only suggested for small sets of proteins or for the integration of selected sequences (e.g. mutations, nuORFs etc.). This limitation can be circumvented by directly predicting and filtering from a fasta using model.predict_from_proteins(). This executes the concatenation, unspecific digestion, prediction and filtering automatically in batches. As a result, the whole process is more efficient and can be performed without specialized computing infrastructure." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:01<00:00, 1.27s/it]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_posnAAHLA_prob_predsequence
016817680.793702EMSEFHNY
113013880.817415KDALLVGV
213714580.751329VPAGSNPF
317017880.940019SEFHNYNL
418118980.895964KSDFSTRW
..................
14395109140.969541QSAEEAFLLVATAY
144329343140.756001SPNLLTIIEMQKGD
145519140.733784LLSPGWGAGAAGRR
146110124140.891976TLKVSQAAAELQQY
147620140.842583LSPGWGAGAAGRRW
\n", + "

148 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " start_pos stop_pos nAA HLA_prob_pred sequence\n", + "0 168 176 8 0.793702 EMSEFHNY\n", + "1 130 138 8 0.817415 KDALLVGV\n", + "2 137 145 8 0.751329 VPAGSNPF\n", + "3 170 178 8 0.940019 SEFHNYNL\n", + "4 181 189 8 0.895964 KSDFSTRW\n", + ".. ... ... ... ... ...\n", + "143 95 109 14 0.969541 QSAEEAFLLVATAY\n", + "144 329 343 14 0.756001 SPNLLTIIEMQKGD\n", + "145 5 19 14 0.733784 LLSPGWGAGAAGRR\n", + "146 110 124 14 0.891976 TLKVSQAAAELQQY\n", + "147 6 20 14 0.842583 LSPGWGAGAAGRRW\n", + "\n", + "[148 rows x 5 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sequence_df = model.predict_from_proteins(protein_df, prob_threshold=0.7)\n", + "sequence_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Selection of peptide sequence candidates with transfer learning\n", + "\n", + "To perform transfer learning we need a list of peptide sequences we expect to be present in our sample. These peptides can be retrieved from several different sources like DDA or directDIA search results. It is recommended to use at the very least 1000 sequences for transfer learning. The more sequences available the better the transfer learning step works. The model performance can be assessed after transfer learning and should be assessed before prediction. \n", + "\n", + "First, the Classifier model needs to be initialized and the pretrained model is loaded. Next, a protein dataframe is added, in this example the previously loaded fasta file. The protein dataframe is used by the Classifier internally to draw negative training data during model training and testing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "model = HLA1_Binding_Classifier()\n", + "model.load_pretrained_hla_model()\n", + "model.load_proteins(fasta_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we load the peptide sequences we use for transfer learning and split them into a training and a testing dataset. This step is very important to assess the model performance after transfer learning. Here, we use the digest_pos_df generated above. As these are not immunopeptides, but a list of unspecifically digested proteins, the model performance will not improve, but the principles remain the same. \n", + "TODO (@Feng): should we include an example file so that the model is actually improved, or just use this? " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1954, 489)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seq_df = digest_pos_df.sample(frac=0.2)\n", + "train_seq_df = digest_pos_df.drop(index=test_seq_df.index)\n", + "len(train_seq_df), len(test_seq_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we train the model using the training sequence dataframe. In this example we use 10 training epochs; in a real experiment more should be used. Good starting points are 40 epochs for a training dataset of around 10000 sequences or 100 epochs for a training dataset of around 1000 sequences. For a real experiment, `warmup_epoch` can be increased to 10. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-23 14:22:06> Training with fixed sequence length: 0\n", + "[Training] Epoch=1, lr=4e-05, loss=1.39779794216156\n", + "[Training] Epoch=2, lr=6e-05, loss=1.0070140702383858\n", + "[Training] Epoch=3, lr=8e-05, loss=0.7982760497501918\n", + "[Training] Epoch=4, lr=0.0001, loss=0.7397338407380241\n", + "[Training] Epoch=5, lr=0.0001, loss=0.7099559647696358\n", + "[Training] Epoch=6, lr=9.045084971874738e-05, loss=0.7016251683235168\n", + "[Training] Epoch=7, lr=6.545084971874738e-05, loss=0.6965694086892265\n", + "[Training] Epoch=8, lr=3.4549150281252636e-05, loss=0.697939566203526\n", + "[Training] Epoch=9, lr=9.549150281252633e-06, loss=0.6959438664572579\n", + "[Training] Epoch=10, lr=1.0000000000000002e-14, loss=0.6928229417119708\n" + ] + } + ], + "source": [ + "model.train(train_seq_df,\n", + " epoch=10, warmup_epoch=5, \n", + " verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can assess the model performance after transfer learning using the model.test() function on the training and testing data. This can also be done before transfer learning to assess how well the model fits the available data already. The test assesses the precision, recall and false positive rate of the model at different probability cut-offs. As a rule of thumb, a false positive rate above 7% (TODO @Feng: adjust if a lower/higher value is appropriate) is not recommended because the peptide list gets disproportionately larger, leading to fewer IDs during the search. In case of a high false positive rate, the probability cut-off at which the peptides are predicted should be increased. " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HLA_prob_predprecisionrecallfalse_positive
00.50.5114340.5951890.568577
10.60.4166670.0179120.025077
20.70.3333330.0005120.001024
30.8NaN0.0000000.000000
40.9NaN0.0000000.000000
\n", + "
" + ], + "text/plain": [ + " HLA_prob_pred precision recall false_positive\n", + "0 0.5 0.511434 0.595189 0.568577\n", + "1 0.6 0.416667 0.017912 0.025077\n", + "2 0.7 0.333333 0.000512 0.001024\n", + "3 0.8 NaN 0.000000 0.000000\n", + "4 0.9 NaN 0.000000 0.000000" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.test(train_seq_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HLA_prob_predprecisionrecallfalse_positive
00.50.4501920.4805730.586912
10.60.4705880.0163600.018405
20.7NaN0.0000000.000000
30.8NaN0.0000000.000000
40.9NaN0.0000000.000000
\n", + "
" + ], + "text/plain": [ + " HLA_prob_pred precision recall false_positive\n", + "0 0.5 0.450192 0.480573 0.586912\n", + "1 0.6 0.470588 0.016360 0.018405\n", + "2 0.7 NaN 0.000000 0.000000\n", + "3 0.8 NaN 0.000000 0.000000\n", + "4 0.9 NaN 0.000000 0.000000" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.test(test_seq_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After transferlearning and testing the new model, peptides can be predicted as with the pretrained model. " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:01<00:00, 1.32s/it]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_posnAAHLA_prob_predsequence
017017880.711809SEFHNYNL
1627080.627015KAEIARAY
210611480.628822TAYETLKV
329930780.605544LLKLVKSY
434635480.646759YASSFKGY
525826680.624555ICYKNNCY
629430390.610476KEDQDLLKL
729830790.645020DLLKLVKSY
823524490.629079SLFNQEVQI
925726690.623247WICYKNNCY
1026727690.611738FFDESKNWY
11172690.605875RRWWMLLAP
1232733690.616737ILSPNLLTI
13748390.611590RRYHPDRYR
14344354100.662783ALYASSFKGY
15232242100.651600FLNSLFNQEV
16221231100.617175FIIMVTIWSA
17222232100.600623IIMVTIWSAV
187484100.614895RRYHPDRYRP
19221232110.608950FIIMVTIWSAV
20353364110.613787YIENCSTPNTY
217485110.605368RRYHPDRYRPQ
22112124120.612270KVSQAAAELQQY
234254120.607715GLYCGTRDCYEV
24351363120.616891KGYIENCSTPNT
257486120.602210RRYHPDRYRPQP
268699130.644656GDEGPGRTPQSAE
27351364130.603497KGYIENCSTPNTY
287386130.622453ARRYHPDRYRPQP
297487130.611441RRYHPDRYRPQPG
30334347130.604354TIIEMQKGDCALY
31141154130.601309SNPFREPRSCALL
323245130.622797LVRPAGALVEGLY
33130143130.604786KDALLVGVPAGSN
34333347140.613545LTIIEMQKGDCALY
356074140.607648AGKAEIARAYRQLA
368599140.606241PGDEGPGRTPQSAE
37229243140.606759SAVFLNSLFNQEVQ
3886100140.622891GDEGPGRTPQSAEE
39167181140.611953WEMSEFHNYNLDLK
40117131140.619257AAELQQYCMQNACK
417387140.608767ARRYHPDRYRPQPG
42329343140.600299SPNLLTIIEMQKGD
\n", + "
" + ], + "text/plain": [ + " start_pos stop_pos nAA HLA_prob_pred sequence\n", + "0 170 178 8 0.711809 SEFHNYNL\n", + "1 62 70 8 0.627015 KAEIARAY\n", + "2 106 114 8 0.628822 TAYETLKV\n", + "3 299 307 8 0.605544 LLKLVKSY\n", + "4 346 354 8 0.646759 YASSFKGY\n", + "5 258 266 8 0.624555 ICYKNNCY\n", + "6 294 303 9 0.610476 KEDQDLLKL\n", + "7 298 307 9 0.645020 DLLKLVKSY\n", + "8 235 244 9 0.629079 SLFNQEVQI\n", + "9 257 266 9 0.623247 WICYKNNCY\n", + "10 267 276 9 0.611738 FFDESKNWY\n", + "11 17 26 9 0.605875 RRWWMLLAP\n", + "12 327 336 9 0.616737 ILSPNLLTI\n", + "13 74 83 9 0.611590 RRYHPDRYR\n", + "14 344 354 10 0.662783 ALYASSFKGY\n", + "15 232 242 10 0.651600 FLNSLFNQEV\n", + "16 221 231 10 0.617175 FIIMVTIWSA\n", + "17 222 232 10 0.600623 IIMVTIWSAV\n", + "18 74 84 10 0.614895 RRYHPDRYRP\n", + "19 221 232 11 0.608950 FIIMVTIWSAV\n", + "20 353 364 11 0.613787 YIENCSTPNTY\n", + "21 74 85 11 0.605368 RRYHPDRYRPQ\n", + "22 112 124 12 0.612270 KVSQAAAELQQY\n", + "23 42 54 12 0.607715 GLYCGTRDCYEV\n", + "24 351 363 12 0.616891 KGYIENCSTPNT\n", + "25 74 86 12 0.602210 RRYHPDRYRPQP\n", + "26 86 99 13 0.644656 GDEGPGRTPQSAE\n", + "27 351 364 13 0.603497 KGYIENCSTPNTY\n", + "28 73 86 13 0.622453 ARRYHPDRYRPQP\n", + "29 74 87 13 0.611441 RRYHPDRYRPQPG\n", + "30 334 347 13 0.604354 TIIEMQKGDCALY\n", + "31 141 154 13 0.601309 SNPFREPRSCALL\n", + "32 32 45 13 0.622797 LVRPAGALVEGLY\n", + "33 130 143 13 0.604786 KDALLVGVPAGSN\n", + "34 333 347 14 0.613545 LTIIEMQKGDCALY\n", + "35 60 74 14 0.607648 AGKAEIARAYRQLA\n", + "36 85 99 14 0.606241 PGDEGPGRTPQSAE\n", + "37 229 243 14 0.606759 SAVFLNSLFNQEVQ\n", + "38 86 100 14 0.622891 GDEGPGRTPQSAEE\n", + "39 167 181 14 0.611953 WEMSEFHNYNLDLK\n", + "40 117 131 14 0.619257 AAELQQYCMQNACK\n", + "41 73 87 14 0.608767 ARRYHPDRYRPQPG\n", + "42 329 343 14 0.600299 SPNLLTIIEMQKGD" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict_from_proteins(fasta_path, 
prob_threshold=0.6)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Spectral library prediction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now the spectral library for the filtered peptide list can be predicted using PredictSpecLibFasta. First, one needs to select the models for rt/ccs/ms2 prediction using the ModelManager. One can select from a set of pretrained models or load externally trained models. Here we load the 'HLA' model (at the moment this still loads the generic model, but in the future this is supposed to be replaced by an HLA-specific internal model). " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from peptdeep.spec_lib.predict_lib import ModelManager\n", + "from peptdeep.protein.fasta import PredictSpecLibFasta\n", + "\n", + "model_mgr = ModelManager()\n", + "model_mgr.load_installed_models(model_type='HLA')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the next step, the PredictSpecLibFasta is initialized using the preloaded model. The presettings here are selected for the prediction of tryptic libraries so some parameters need to be adjusted, in particular precursor_charge_min, precursor_charge_max. By default Carbamidomethylation is set as a fixed modification (fix_mods) and Acetylation and Oxidation are set as variable modifications (var_mods). Those can be removed by passing an empty list, as shown below for the fixed modifications. \n", + "\n", + "Of note, PredictSpecLibFasta can also be used to predict a library from a fasta file. Therefore, one can also set the protease (default trypsin) and the minimum and maximum peptide length (7 to 35). We don't need to change those parameters here, as we won't make use of the digestion functions but rather provide an already digested sequence table. 
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "speclib = PredictSpecLibFasta(model_manager=model_mgr,\n", + " precursor_charge_min=1,\n", + " precursor_charge_max=3,\n", + " fix_mods=[])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To reduce the size of the dataframe and predicted library we give each peptide sequence a unique protein identifier (number). This enables the use of search engines that rely on protein information (such as AlphaDIA) but one needs to keep in mind to remove filtering steps based on how many peptides per protein are identified during data analysis. Alternatively, the proteins that the peptide sequences may originate from can be inferred using `alphabase.protein.fasta.annotate_precursor_df()` (demonstrated below)." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posstop_posnAAHLA_prob_predsequenceprotein_idprotein_idxesfull_namegene_orggene_nameis_prot_ntermis_prot_cterm
016817680.793702EMSEFHNY00000FalseFalse
113013880.817415KDALLVGV11111FalseFalse
213714580.751329VPAGSNPF22222FalseFalse
317017880.940019SEFHNYNL33333FalseFalse
418118980.895964KSDFSTRW44444FalseFalse
.......................................
14395109140.969541QSAEEAFLLVATAY143143143143143FalseFalse
144329343140.756001SPNLLTIIEMQKGD144144144144144FalseFalse
145519140.733784LLSPGWGAGAAGRR145145145145145FalseFalse
146110124140.891976TLKVSQAAAELQQY146146146146146FalseFalse
147620140.842583LSPGWGAGAAGRRW147147147147147FalseFalse
\n", + "

148 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " start_pos stop_pos nAA HLA_prob_pred sequence protein_id \\\n", + "0 168 176 8 0.793702 EMSEFHNY 0 \n", + "1 130 138 8 0.817415 KDALLVGV 1 \n", + "2 137 145 8 0.751329 VPAGSNPF 2 \n", + "3 170 178 8 0.940019 SEFHNYNL 3 \n", + "4 181 189 8 0.895964 KSDFSTRW 4 \n", + ".. ... ... ... ... ... ... \n", + "143 95 109 14 0.969541 QSAEEAFLLVATAY 143 \n", + "144 329 343 14 0.756001 SPNLLTIIEMQKGD 144 \n", + "145 5 19 14 0.733784 LLSPGWGAGAAGRR 145 \n", + "146 110 124 14 0.891976 TLKVSQAAAELQQY 146 \n", + "147 6 20 14 0.842583 LSPGWGAGAAGRRW 147 \n", + "\n", + " protein_idxes full_name gene_org gene_name is_prot_nterm is_prot_cterm \n", + "0 0 0 0 0 False False \n", + "1 1 1 1 1 False False \n", + "2 2 2 2 2 False False \n", + "3 3 3 3 3 False False \n", + "4 4 4 4 4 False False \n", + ".. ... ... ... ... ... ... \n", + "143 143 143 143 143 False False \n", + "144 144 144 144 144 False False \n", + "145 145 145 145 145 False False \n", + "146 146 146 146 146 False False \n", + "147 147 147 147 147 False False \n", + "\n", + "[148 rows x 12 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sequence_df['protein_id'] = [str(i) for i in range(len(sequence_df))]\n", + "sequence_df['protein_idxes'] = sequence_df.protein_id.astype(\"U\")\n", + "sequence_df['full_name'] = sequence_df['protein_id'] \n", + "sequence_df['gene_org'] = sequence_df['protein_id'] \n", + "sequence_df['gene_name'] = sequence_df['protein_id']\n", + "sequence_df[\"is_prot_nterm\"] = False\n", + "sequence_df[\"is_prot_cterm\"] = False\n", + "sequence_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The sequence dataframe contains all the relevant information to be passed to the protein_df and the precursor_df." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idnAAfull_namegene_orggene_name
0EMSEFHNY08000
1KDALLVGV18111
2VPAGSNPF28222
3SEFHNYNL38333
4KSDFSTRW48444
.....................
143QSAEEAFLLVATAY14314143143143
144SPNLLTIIEMQKGD14414144144144
145LLSPGWGAGAAGRR14514145145145
146TLKVSQAAAELQQY14614146146146
147LSPGWGAGAAGRRW14714147147147
\n", + "

148 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_id nAA full_name gene_org gene_name\n", + "0 EMSEFHNY 0 8 0 0 0\n", + "1 KDALLVGV 1 8 1 1 1\n", + "2 VPAGSNPF 2 8 2 2 2\n", + "3 SEFHNYNL 3 8 3 3 3\n", + "4 KSDFSTRW 4 8 4 4 4\n", + ".. ... ... ... ... ... ...\n", + "143 QSAEEAFLLVATAY 143 14 143 143 143\n", + "144 SPNLLTIIEMQKGD 144 14 144 144 144\n", + "145 LLSPGWGAGAAGRR 145 14 145 145 145\n", + "146 TLKVSQAAAELQQY 146 14 146 146 146\n", + "147 LSPGWGAGAAGRRW 147 14 147 147 147\n", + "\n", + "[148 rows x 6 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.protein_df = sequence_df[\n", + " [\"sequence\",\"protein_id\",\"nAA\", 'full_name', 'gene_org', 'gene_name']\n", + "].copy()\n", + "speclib.protein_df" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idxesstart_posstop_posnAAHLA_prob_predis_prot_ntermis_prot_cterm
0EMSEFHNY016817680.793702FalseFalse
1KDALLVGV113013880.817415FalseFalse
2VPAGSNPF213714580.751329FalseFalse
3SEFHNYNL317017880.940019FalseFalse
4KSDFSTRW418118980.895964FalseFalse
...........................
143QSAEEAFLLVATAY14395109140.969541FalseFalse
144SPNLLTIIEMQKGD144329343140.756001FalseFalse
145LLSPGWGAGAAGRR145519140.733784FalseFalse
146TLKVSQAAAELQQY146110124140.891976FalseFalse
147LSPGWGAGAAGRRW147620140.842583FalseFalse
\n", + "

148 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_idxes start_pos stop_pos nAA HLA_prob_pred \\\n", + "0 EMSEFHNY 0 168 176 8 0.793702 \n", + "1 KDALLVGV 1 130 138 8 0.817415 \n", + "2 VPAGSNPF 2 137 145 8 0.751329 \n", + "3 SEFHNYNL 3 170 178 8 0.940019 \n", + "4 KSDFSTRW 4 181 189 8 0.895964 \n", + ".. ... ... ... ... ... ... \n", + "143 QSAEEAFLLVATAY 143 95 109 14 0.969541 \n", + "144 SPNLLTIIEMQKGD 144 329 343 14 0.756001 \n", + "145 LLSPGWGAGAAGRR 145 5 19 14 0.733784 \n", + "146 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "147 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "\n", + " is_prot_nterm is_prot_cterm \n", + "0 False False \n", + "1 False False \n", + "2 False False \n", + "3 False False \n", + "4 False False \n", + ".. ... ... \n", + "143 False False \n", + "144 False False \n", + "145 False False \n", + "146 False False \n", + "147 False False \n", + "\n", + "[148 rows x 8 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.precursor_df = sequence_df[\n", + " [\"sequence\",\"protein_idxes\",\"start_pos\",\"stop_pos\",\n", + " \"nAA\",\"HLA_prob_pred\", 'is_prot_nterm', 'is_prot_cterm']\n", + "].copy()\n", + "speclib.precursor_df" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idxesstart_posstop_posnAAHLA_prob_predis_prot_ntermis_prot_cterm
0EMSEFHNY016817680.793702FalseFalse
1KDALLVGV113013880.817415FalseFalse
2VPAGSNPF213714580.751329FalseFalse
3SEFHNYNL317017880.940019FalseFalse
4KSDFSTRW418118980.895964FalseFalse
...........................
143QSAEEAFLLVATAY14395109140.969541FalseFalse
144SPNLLTIIEMQKGD144329343140.756001FalseFalse
145LLSPGWGAGAAGRR145519140.733784FalseFalse
146TLKVSQAAAELQQY146110124140.891976FalseFalse
147LSPGWGAGAAGRRW147620140.842583FalseFalse
\n", + "

148 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_idxes start_pos stop_pos nAA HLA_prob_pred \\\n", + "0 EMSEFHNY 0 168 176 8 0.793702 \n", + "1 KDALLVGV 1 130 138 8 0.817415 \n", + "2 VPAGSNPF 2 137 145 8 0.751329 \n", + "3 SEFHNYNL 3 170 178 8 0.940019 \n", + "4 KSDFSTRW 4 181 189 8 0.895964 \n", + ".. ... ... ... ... ... ... \n", + "143 QSAEEAFLLVATAY 143 95 109 14 0.969541 \n", + "144 SPNLLTIIEMQKGD 144 329 343 14 0.756001 \n", + "145 LLSPGWGAGAAGRR 145 5 19 14 0.733784 \n", + "146 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "147 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "\n", + " is_prot_nterm is_prot_cterm \n", + "0 False False \n", + "1 False False \n", + "2 False False \n", + "3 False False \n", + "4 False False \n", + ".. ... ... \n", + "143 False False \n", + "144 False False \n", + "145 False False \n", + "146 False False \n", + "147 False False \n", + "\n", + "[148 rows x 8 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.precursor_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, the modifications and charges can be added to the peptide dataframe using add_modifications and add_charge. This creates a unique entry for every combination of charge and modification for all the sequences in the precursor dataframe. " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idxesstart_posstop_posnAAHLA_prob_predis_prot_ntermis_prot_ctermmodsmod_sitescharge
0EMSEFHNY016817680.793702FalseFalseOxidation@M21
1EMSEFHNY016817680.793702FalseFalseOxidation@M22
2EMSEFHNY016817680.793702FalseFalseOxidation@M23
3EMSEFHNY016817680.793702FalseFalse1
4EMSEFHNY016817680.793702FalseFalse2
....................................
493TLKVSQAAAELQQY146110124140.891976FalseFalse2
494TLKVSQAAAELQQY146110124140.891976FalseFalse3
495LSPGWGAGAAGRRW147620140.842583FalseFalse1
496LSPGWGAGAAGRRW147620140.842583FalseFalse2
497LSPGWGAGAAGRRW147620140.842583FalseFalse3
\n", + "

498 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_idxes start_pos stop_pos nAA HLA_prob_pred \\\n", + "0 EMSEFHNY 0 168 176 8 0.793702 \n", + "1 EMSEFHNY 0 168 176 8 0.793702 \n", + "2 EMSEFHNY 0 168 176 8 0.793702 \n", + "3 EMSEFHNY 0 168 176 8 0.793702 \n", + "4 EMSEFHNY 0 168 176 8 0.793702 \n", + ".. ... ... ... ... ... ... \n", + "493 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "494 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "495 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "496 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "497 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "\n", + " is_prot_nterm is_prot_cterm mods mod_sites charge \n", + "0 False False Oxidation@M 2 1 \n", + "1 False False Oxidation@M 2 2 \n", + "2 False False Oxidation@M 2 3 \n", + "3 False False 1 \n", + "4 False False 2 \n", + ".. ... ... ... ... ... \n", + "493 False False 2 \n", + "494 False False 3 \n", + "495 False False 1 \n", + "496 False False 2 \n", + "497 False False 3 \n", + "\n", + "[498 rows x 11 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.add_modifications()\n", + "speclib.add_charge()\n", + "speclib.precursor_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now ccs, rt and ms2 can be predicted for each entry" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-23 14:22:43> Predicting RT/IM/MS2 for 400 precursors ...\n", + "2024-07-23 14:22:43> Predicting RT ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 27.54it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-23 14:22:43> Predicting mobility ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "100%|██████████| 7/7 [00:00<00:00, 50.06it/s]" + ] + }, + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "2024-07-23 14:22:44> Predicting MS2 ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "100%|██████████| 7/7 [00:00<00:00, 23.73it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-23 14:22:44> End predicting RT/IM/MS2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "speclib.predict_all()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "iRTs can be added using translate_rt_to_irt_pred. This is not necessary for search engines like DIA-NN or AlphaDIA but required for Spectronaut." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predict RT for 11 iRT precursors.\n", + "Linear regression of `rt_pred` to `irt`:\n", + " R_square R slope intercept test_num\n", + "0 0.99007 0.995022 152.235639 -39.232164 11\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequenceprotein_idxesstart_posstop_posnAAHLA_prob_predis_prot_ntermis_prot_ctermmodsmod_sites...precursor_mzrt_predrt_norm_predccs_predmobility_prednceinstrumentfrag_start_idxfrag_stop_idxirt_pred
0EMSEFHNY016817680.793702FalseFalseOxidation@M2...1072.4040370.1896500.189650254.1958921.25314030.0Lumos07-10.360738
1EMSEFHNY016817680.793702FalseFalseOxidation@M2...536.7056570.1896500.189650337.3285830.83149430.0Lumos714-10.360738
2EMSEFHNY016817680.793702FalseFalse...1056.4091230.2892610.289261255.1036991.25737330.0Lumos14214.803681
3EMSEFHNY016817680.793702FalseFalse...528.7082000.2892610.289261337.4446410.83162130.0Lumos21284.803681
4KDALLVGV113013880.817415FalseFalse...814.5032800.4337910.433791256.6152041.26000130.0Lumos283526.806270
..................................................................
395TLKVSQAAAELQQY146110124140.891976FalseFalse...775.4146620.4895450.489545429.3609011.06251430.0Lumos3810382335.294030
396TLKVSQAAAELQQY146110124140.891976FalseFalse...517.2788670.4895450.489545463.2310490.76422530.0Lumos3823383635.294030
397LSPGWGAGAAGRRW147620140.842583FalseFalse...1441.7447420.3777430.377743289.2009891.43037830.0Lumos3836384918.273780
398LSPGWGAGAAGRRW147620140.842583FalseFalse...721.3760090.3777430.377743404.6336981.00065930.0Lumos3849386218.273780
399LSPGWGAGAAGRRW147620140.842583FalseFalse...481.2530980.3777430.377743476.6557010.78585130.0Lumos3862387518.273780
\n", + "

400 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " sequence protein_idxes start_pos stop_pos nAA HLA_prob_pred \\\n", + "0 EMSEFHNY 0 168 176 8 0.793702 \n", + "1 EMSEFHNY 0 168 176 8 0.793702 \n", + "2 EMSEFHNY 0 168 176 8 0.793702 \n", + "3 EMSEFHNY 0 168 176 8 0.793702 \n", + "4 KDALLVGV 1 130 138 8 0.817415 \n", + ".. ... ... ... ... ... ... \n", + "395 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "396 TLKVSQAAAELQQY 146 110 124 14 0.891976 \n", + "397 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "398 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "399 LSPGWGAGAAGRRW 147 6 20 14 0.842583 \n", + "\n", + " is_prot_nterm is_prot_cterm mods mod_sites ... precursor_mz \\\n", + "0 False False Oxidation@M 2 ... 1072.404037 \n", + "1 False False Oxidation@M 2 ... 536.705657 \n", + "2 False False ... 1056.409123 \n", + "3 False False ... 528.708200 \n", + "4 False False ... 814.503280 \n", + ".. ... ... ... ... ... ... \n", + "395 False False ... 775.414662 \n", + "396 False False ... 517.278867 \n", + "397 False False ... 1441.744742 \n", + "398 False False ... 721.376009 \n", + "399 False False ... 481.253098 \n", + "\n", + " rt_pred rt_norm_pred ccs_pred mobility_pred nce instrument \\\n", + "0 0.189650 0.189650 254.195892 1.253140 30.0 Lumos \n", + "1 0.189650 0.189650 337.328583 0.831494 30.0 Lumos \n", + "2 0.289261 0.289261 255.103699 1.257373 30.0 Lumos \n", + "3 0.289261 0.289261 337.444641 0.831621 30.0 Lumos \n", + "4 0.433791 0.433791 256.615204 1.260001 30.0 Lumos \n", + ".. ... ... ... ... ... ... 
\n", + "395 0.489545 0.489545 429.360901 1.062514 30.0 Lumos \n", + "396 0.489545 0.489545 463.231049 0.764225 30.0 Lumos \n", + "397 0.377743 0.377743 289.200989 1.430378 30.0 Lumos \n", + "398 0.377743 0.377743 404.633698 1.000659 30.0 Lumos \n", + "399 0.377743 0.377743 476.655701 0.785851 30.0 Lumos \n", + "\n", + " frag_start_idx frag_stop_idx irt_pred \n", + "0 0 7 -10.360738 \n", + "1 7 14 -10.360738 \n", + "2 14 21 4.803681 \n", + "3 21 28 4.803681 \n", + "4 28 35 26.806270 \n", + ".. ... ... ... \n", + "395 3810 3823 35.294030 \n", + "396 3823 3836 35.294030 \n", + "397 3836 3849 18.273780 \n", + "398 3849 3862 18.273780 \n", + "399 3862 3875 18.273780 \n", + "\n", + "[400 rows x 21 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speclib.translate_rt_to_irt_pred()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, the predicted library can be exported in an hdf format (AlphaDIA) or translated to a tsv. The tsv translation can be very time consuming. Before the spectral library can be translated, the gene and protein column need to be mapped from the protein_df into the precursor_df. " + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# hdf_path = \"D:\\Software\\FASTA\\Human\\speclib_example.hdf\"\n", + "# tsv_path = \"D:\\Software\\FASTA\\Human\\speclib_example.tsv\"\n", + "# speclib.save_hdf(hdf_path) # save as hdf speclib" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "from peptdeep.spec_lib.translate import translate_to_tsv\n", + "speclib.append_protein_name()\n", + "# translate_to_tsv(speclib=speclib, tsv = tsv_path) # save as tsv speclib" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Matching peptides back to proteins" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The peptide sequences can be matched back to proteins using annotate_precursor_df, requiring a 'sequence' column and a protein_df like the previously loaded fasta file. This can be done with the sequence output of any search engine or before the library is generated. " + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 2/2 [00:00<00:00, 7639.90it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
start_posstop_posnAAHLA_prob_predsequenceprotein_idprotein_idxesfull_namegene_orggene_nameis_prot_ntermis_prot_ctermgenesproteinscardinality
016817680.793702EMSEFHNY00000FalseFalseA0A024RAP8_HUMANA0A024RAP81
113013880.817415KDALLVGV11111FalseFalseA0A024R161_HUMANA0A024R1611
213714580.751329VPAGSNPF22222FalseFalseA0A024R161_HUMANA0A024R1611
317017880.940019SEFHNYNL33333FalseFalseA0A024RAP8_HUMANA0A024RAP81
418118980.895964KSDFSTRW44444FalseFalseA0A024RAP8_HUMANA0A024RAP81
................................................
14395109140.969541QSAEEAFLLVATAY143143143143143FalseFalseA0A024R161_HUMANA0A024R1611
144329343140.756001SPNLLTIIEMQKGD144144144144144FalseFalseA0A024RAP8_HUMANA0A024RAP81
145519140.733784LLSPGWGAGAAGRR145145145145145FalseFalseA0A024R161_HUMANA0A024R1611
146110124140.891976TLKVSQAAAELQQY146146146146146FalseFalseA0A024R161_HUMANA0A024R1611
147620140.842583LSPGWGAGAAGRRW147147147147147FalseFalseA0A024R161_HUMANA0A024R1611
\n", + "

148 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " start_pos stop_pos nAA HLA_prob_pred sequence protein_id \\\n", + "0 168 176 8 0.793702 EMSEFHNY 0 \n", + "1 130 138 8 0.817415 KDALLVGV 1 \n", + "2 137 145 8 0.751329 VPAGSNPF 2 \n", + "3 170 178 8 0.940019 SEFHNYNL 3 \n", + "4 181 189 8 0.895964 KSDFSTRW 4 \n", + ".. ... ... ... ... ... ... \n", + "143 95 109 14 0.969541 QSAEEAFLLVATAY 143 \n", + "144 329 343 14 0.756001 SPNLLTIIEMQKGD 144 \n", + "145 5 19 14 0.733784 LLSPGWGAGAAGRR 145 \n", + "146 110 124 14 0.891976 TLKVSQAAAELQQY 146 \n", + "147 6 20 14 0.842583 LSPGWGAGAAGRRW 147 \n", + "\n", + " protein_idxes full_name gene_org gene_name is_prot_nterm is_prot_cterm \\\n", + "0 0 0 0 0 False False \n", + "1 1 1 1 1 False False \n", + "2 2 2 2 2 False False \n", + "3 3 3 3 3 False False \n", + "4 4 4 4 4 False False \n", + ".. ... ... ... ... ... ... \n", + "143 143 143 143 143 False False \n", + "144 144 144 144 144 False False \n", + "145 145 145 145 145 False False \n", + "146 146 146 146 146 False False \n", + "147 147 147 147 147 False False \n", + "\n", + " genes proteins cardinality \n", + "0 A0A024RAP8_HUMAN A0A024RAP8 1 \n", + "1 A0A024R161_HUMAN A0A024R161 1 \n", + "2 A0A024R161_HUMAN A0A024R161 1 \n", + "3 A0A024RAP8_HUMAN A0A024RAP8 1 \n", + "4 A0A024RAP8_HUMAN A0A024RAP8 1 \n", + ".. ... ... ... 
\n", + "143 A0A024R161_HUMAN A0A024R161 1 \n", + "144 A0A024RAP8_HUMAN A0A024RAP8 1 \n", + "145 A0A024R161_HUMAN A0A024R161 1 \n", + "146 A0A024R161_HUMAN A0A024R161 1 \n", + "147 A0A024R161_HUMAN A0A024R161 1 \n", + "\n", + "[148 rows x 15 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from alphabase.protein.fasta import annotate_precursor_df\n", + "inferred_sequence_df = annotate_precursor_df(sequence_df, protein_df)\n", + "inferred_sequence_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs_tests/hla/hla_class1.ipynb b/nbs_tests/hla/hla_class1.ipynb index d0fa0eb3..f4bcd7ae 100644 --- a/nbs_tests/hla/hla_class1.ipynb +++ b/nbs_tests/hla/hla_class1.ipynb @@ -33,10 +33,11 @@ "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + "2024-07-02 17:16:12> Downloading https://github.com/MannLabs/alphapeptdeep/releases/download/pre-trained-models/hla_model.zip ...\n", + "2024-07-02 17:16:14> The pretrained models had been downloaded in C:\\Users\\wahle/peptdeep\\pretrained_models\\hla_model.zip\n" ] } ], @@ -78,7 +79,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 14.32it/s]\n" + "100%|██████████| 1/1 [00:00<00:00, 7.46it/s]\n" ] }, { @@ -321,7 +322,7 @@ "name": "python", "nbconvert_exporter": "python", 
"pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.8.8" } }, "nbformat": 4, diff --git a/nbs_tests/mass_spec/mass_calibration.ipynb b/nbs_tests/mass_spec/mass_calibration.ipynb index 6ee0cbe6..16140e9f 100644 --- a/nbs_tests/mass_spec/mass_calibration.ipynb +++ b/nbs_tests/mass_spec/mass_calibration.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -18,16 +18,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + } + ], "source": [ "from peptdeep.mass_spec.mass_calibration import *" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -112,7 +129,7 @@ "7 0.0 1.0" ] }, - "execution_count": null, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -146,6 +163,18 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" } }, "nbformat": 4, diff --git a/nbs_tests/mass_spec/match.ipynb b/nbs_tests/mass_spec/match.ipynb index 3a029140..cce9bee6 100644 --- a/nbs_tests/mass_spec/match.ipynb +++ b/nbs_tests/mass_spec/match.ipynb @@ -16,6 +16,15 @@ "# Match" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash 
in Mac Arm" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -377,6 +386,10 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.9" } }, "nbformat": 4, diff --git a/nbs_tests/mass_spec/ms_reader.ipynb b/nbs_tests/mass_spec/ms_reader.ipynb index 4ea3bd81..7064c883 100644 --- a/nbs_tests/mass_spec/ms_reader.ipynb +++ b/nbs_tests/mass_spec/ms_reader.ipynb @@ -16,6 +16,15 @@ "# MS Reader" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/nbs_tests/model/ccs.ipynb b/nbs_tests/model/ccs.ipynb index a9f808fa..4e84ebb5 100644 --- a/nbs_tests/model/ccs.ipynb +++ b/nbs_tests/model/ccs.ipynb @@ -327,7 +327,7 @@ "repeat = 10\n", "precursor_df = pd.DataFrame({\n", " 'sequence': ['AGHCEWQMKYR']*repeat,\n", - " 'mods': ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat,\n", + " 'mods': ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat,\n", " 'mod_sites': ['0;4;8']*repeat,\n", " 'nAA': [11]*repeat,\n", " 'charge': [2]*repeat,\n", diff --git a/nbs_tests/model/featurize.ipynb b/nbs_tests/model/featurize.ipynb index 7eafdd2b..8adb958b 100644 --- a/nbs_tests/model/featurize.ipynb +++ b/nbs_tests/model/featurize.ipynb @@ -66,10 +66,10 @@ "outputs": [], "source": [ "#| hide\n", - "x = parse_mod_feature(5, ['Acetyl@Protein N-term','Phospho@S','Oxidation@M'], [0,-1,1])\n", + "x = parse_mod_feature(5, ['Acetyl@Protein_N-term','Phospho@S','Oxidation@M'], [0,-1,1])\n", "assert x.shape == (7, mod_feature_size)\n", "assert np.all(x[1,:]==MOD_TO_FEATURE['Oxidation@M'])\n", - "assert np.all(x[0,:]==MOD_TO_FEATURE['Acetyl@Protein N-term'])\n", + "assert np.all(x[0,:]==MOD_TO_FEATURE['Acetyl@Protein_N-term'])\n", "assert np.all(x[-1,:]==MOD_TO_FEATURE['Phospho@S'])\n", "assert 
np.all(x[(2,3,4,5),:]==0)" ] diff --git a/nbs_tests/model/ms2.ipynb b/nbs_tests/model/ms2.ipynb index 9fe774e6..c820dfba 100644 --- a/nbs_tests/model/ms2.ipynb +++ b/nbs_tests/model/ms2.ipynb @@ -396,7 +396,7 @@ "repeat = 10\n", "precursor_df = pd.DataFrame({\n", " 'sequence': ['AGHCEWQMKYR']*repeat,\n", - " 'mods': ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat,\n", + " 'mods': ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat,\n", " 'mod_sites': ['0;4;8']*repeat,\n", " 'nAA': [11]*repeat,\n", " 'nce': [20]*repeat,\n", diff --git a/nbs_tests/model/rt.ipynb b/nbs_tests/model/rt.ipynb index 9bf8803e..ed952b40 100644 --- a/nbs_tests/model/rt.ipynb +++ b/nbs_tests/model/rt.ipynb @@ -135,7 +135,7 @@ "def create_test_dataframe_with_identical_rows(nrows = 10):\n", " precursor_df = pd.DataFrame({\n", " 'sequence': ['AGHCEWQMKYR']*nrows,\n", - " 'mods': ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*nrows,\n", + " 'mods': ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*nrows,\n", " 'mod_sites': ['0;4;8']*nrows,\n", " 'nAA': [11]*nrows,\n", " 'rt_norm': [0.6]*nrows\n", diff --git a/nbs_tests/pipeline_api.ipynb b/nbs_tests/pipeline_api.ipynb index 4d81db6f..a5678902 100644 --- a/nbs_tests/pipeline_api.ipynb +++ b/nbs_tests/pipeline_api.ipynb @@ -37,6 +37,15 @@ "The refined models will be saved in the path pointed by \"PEPTDEEP_HOME\" in `peptdeep.settings.global_settings`." 
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/nbs_tests/protein/fasta.ipynb b/nbs_tests/protein/fasta.ipynb index 1256ad16..3c8af3a0 100644 --- a/nbs_tests/protein/fasta.ipynb +++ b/nbs_tests/protein/fasta.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -18,7 +18,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -35,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -197,7 +206,7 @@ "8 False 20 " ] }, - "execution_count": null, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -224,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -276,7 +285,7 @@ "1 yy gene FGHIJKLMNOPQR" ] }, - "execution_count": null, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -287,7 +296,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -469,7 +478,7 @@ "8 False 20 xx " ] }, - "execution_count": null, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -482,7 +491,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -514,7 +523,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -584,7 +593,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein 
N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 7\n", " xx\n", @@ -597,7 +606,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 7\n", " xx\n", @@ -675,7 +684,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 12\n", " xx\n", @@ -714,7 +723,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 13\n", " xx\n", @@ -727,7 +736,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 13\n", " xx\n", @@ -766,7 +775,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;8\n", " 13\n", " xx;yy\n", @@ -779,7 +788,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 13\n", " xx;yy\n", @@ -844,7 +853,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;14;3\n", " 19\n", " xx\n", @@ -857,7 +866,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 19\n", " xx\n", @@ -922,7 +931,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 20\n", " xx\n", @@ -935,7 +944,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;15;4\n", " 20\n", " xx\n", @@ -948,7 +957,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein 
N-term;Oxidation@M;Oxidation@M;...\n", + " Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;...\n", " 0;1;15;4\n", " 20\n", " xx\n", @@ -961,7 +970,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 20\n", " xx\n", @@ -1009,36 +1018,36 @@ " is_prot_cterm mods \\\n", "0 False Oxidation@M;Carbamidomethyl@C \n", "1 False Carbamidomethyl@C \n", - "2 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "3 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "2 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "3 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "4 True Oxidation@M \n", "5 True \n", "6 True Oxidation@M \n", "7 True \n", "8 False Carbamidomethyl@C \n", - "9 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "9 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "10 False Oxidation@M;Carbamidomethyl@C \n", "11 False Carbamidomethyl@C \n", - "12 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "13 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "12 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "13 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "14 True Oxidation@M \n", "15 True \n", - "16 True Acetyl@Protein N-term;Oxidation@M \n", - "17 True Acetyl@Protein N-term \n", + "16 True Acetyl@Protein_N-term;Oxidation@M \n", + "17 True Acetyl@Protein_N-term \n", "18 True Oxidation@M \n", "19 True \n", "20 False Oxidation@M;Carbamidomethyl@C \n", "21 False Carbamidomethyl@C \n", - "22 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "23 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "22 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... 
\n", + "23 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "24 False Oxidation@M;Carbamidomethyl@C \n", "25 False Oxidation@M;Carbamidomethyl@C \n", "26 False Oxidation@M;Oxidation@M;Carbamidomethyl@C \n", "27 False Carbamidomethyl@C \n", - "28 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "29 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "30 False Acetyl@Protein N-term;Oxidation@M;Oxidation@M;... \n", - "31 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "28 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "29 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "30 False Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;... \n", + "31 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "\n", " mod_sites nAA proteins genes \n", "0 1;4 7 xx \n", @@ -1075,7 +1084,7 @@ "31 0;4 20 xx " ] }, - "execution_count": null, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1087,7 +1096,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -1103,14 +1112,14 @@ " else:\n", " assert 'Carbamidomethyl@C' not in mods\n", " # test Acetyl@Protein N-term\n", - " if 'Acetyl@Protein N-term' in mods:\n", + " if 'Acetyl@Protein_N-term' in mods:\n", " assert _lib.precursor_df.is_prot_nterm[i]\n", " assert '0' in sites\n", " if '0' in mods:\n", " assert _lib.precursor_df.is_prot_nterm[i]\n", - " assert 'Acetyl@Protein N-term' in mods\n", + " assert 'Acetyl@Protein_N-term' in mods\n", " if not _lib.precursor_df.is_prot_nterm[i]:\n", - " assert 'Acetyl@Protein N-term' not in mods\n", + " assert 'Acetyl@Protein_N-term' not in mods\n", " # test Oxidation@M\n", " if 'Oxidation@M' in mods:\n", " assert 'M' in seq\n", @@ -1133,7 +1142,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -1203,7 +1212,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein 
N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 7\n", " xx\n", @@ -1216,7 +1225,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 7\n", " xx\n", @@ -1346,7 +1355,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 12\n", " xx\n", @@ -1385,7 +1394,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 13\n", " xx\n", @@ -1398,7 +1407,7 @@ " 1\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 13\n", " xx\n", @@ -1437,7 +1446,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;8\n", " 13\n", " xx;yy\n", @@ -1450,7 +1459,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 13\n", " xx;yy\n", @@ -1567,7 +1576,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;14;3\n", " 19\n", " xx\n", @@ -1580,7 +1589,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 19\n", " xx\n", @@ -1645,7 +1654,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 20\n", " xx\n", @@ -1658,7 +1667,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;15;4\n", " 20\n", " xx\n", @@ -1671,7 +1680,7 @@ " 2\n", " True\n", " False\n", - " 
Acetyl@Protein N-term;Oxidation@M;Oxidation@M;...\n", + " Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;...\n", " 0;1;15;4\n", " 20\n", " xx\n", @@ -1684,7 +1693,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 20\n", " xx\n", @@ -1740,8 +1749,8 @@ " is_prot_cterm mods \\\n", "0 False Oxidation@M;Carbamidomethyl@C \n", "1 False Carbamidomethyl@C \n", - "2 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "3 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "2 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "3 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "4 True Oxidation@M \n", "5 True \n", "6 True Oxidation@M;Phospho@S \n", @@ -1751,15 +1760,15 @@ "10 True Phospho@T \n", "11 True \n", "12 False Carbamidomethyl@C \n", - "13 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "13 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "14 False Oxidation@M;Carbamidomethyl@C \n", "15 False Carbamidomethyl@C \n", - "16 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "17 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "16 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "17 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "18 True Oxidation@M \n", "19 True \n", - "20 True Acetyl@Protein N-term;Oxidation@M \n", - "21 True Acetyl@Protein N-term \n", + "20 True Acetyl@Protein_N-term;Oxidation@M \n", + "21 True Acetyl@Protein_N-term \n", "22 True Oxidation@M;Phospho@S \n", "23 True Oxidation@M;Phospho@T \n", "24 True Oxidation@M \n", @@ -1768,16 +1777,16 @@ "27 True \n", "28 False Oxidation@M;Carbamidomethyl@C \n", "29 False Carbamidomethyl@C \n", - "30 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "31 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "30 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... 
\n", + "31 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "32 False Oxidation@M;Carbamidomethyl@C \n", "33 False Oxidation@M;Carbamidomethyl@C \n", "34 False Oxidation@M;Oxidation@M;Carbamidomethyl@C \n", "35 False Carbamidomethyl@C \n", - "36 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "37 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "38 False Acetyl@Protein N-term;Oxidation@M;Oxidation@M;... \n", - "39 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "36 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "37 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "38 False Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;... \n", + "39 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "\n", " mod_sites nAA proteins genes \n", "0 1;4 7 xx \n", @@ -1822,7 +1831,7 @@ "39 0;4 20 xx " ] }, - "execution_count": null, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1836,7 +1845,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -1909,7 +1918,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4\n", " 7\n", " xx\n", @@ -1923,7 +1932,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;4\n", " 7\n", " xx\n", @@ -1965,7 +1974,7 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any N-t...\n", + " Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any_N-t...\n", " 4;0;7;13\n", " 20\n", " xx\n", @@ -1979,7 +1988,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;4;7;13\n", " 20\n", " xx\n", @@ -1993,7 +2002,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 
Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;15;4;7;13\n", " 20\n", " xx\n", @@ -2007,7 +2016,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Oxidation@M;...\n", + " Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;...\n", " 0;1;15;4;7;13\n", " 20\n", " xx\n", @@ -2021,7 +2030,7 @@ " 2\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C;Dimeth...\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C;Dimeth...\n", " 0;4;7;13\n", " 20\n", " xx\n", @@ -2050,15 +2059,15 @@ " is_prot_cterm mods \\\n", "0 False Oxidation@M;Carbamidomethyl@C \n", "1 False Carbamidomethyl@C \n", - "2 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "3 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "2 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "3 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "4 True Oxidation@M \n", ".. ... ... \n", - "115 False Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any N-t... \n", - "116 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "117 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", - "118 False Acetyl@Protein N-term;Oxidation@M;Oxidation@M;... \n", - "119 False Acetyl@Protein N-term;Carbamidomethyl@C;Dimeth... \n", + "115 False Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any_N-t... \n", + "116 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "117 False Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... \n", + "118 False Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;... \n", + "119 False Acetyl@Protein_N-term;Carbamidomethyl@C;Dimeth... 
\n", "\n", " mod_sites nAA proteins genes labeling_channel \n", "0 1;4 7 xx none \n", @@ -2076,7 +2085,7 @@ "[120 rows x 11 columns]" ] }, - "execution_count": null, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -2085,15 +2094,15 @@ "#| hide\n", "_lib.add_peptide_labeling({\n", " 'none': [], # not labelled for reference\n", - " 'light': ['Dimethyl@Any N-term','Dimethyl@K'],\n", - " 'heavy': ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'],\n", + " 'light': ['Dimethyl@Any_N-term','Dimethyl@K'],\n", + " 'heavy': ['Dimethyl:2H(6)13C(2)@Any_N-term','Dimethyl:2H(6)13C(2)@K'],\n", "})\n", "_lib.precursor_df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -2102,7 +2111,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -2229,7 +2238,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;2\n", " 8\n", " 0\n", @@ -2325,7 +2334,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;6\n", " 8\n", " 1\n", @@ -2397,7 +2406,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;1;3\n", " 9\n", " 0\n", @@ -2421,7 +2430,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;3\n", " 9\n", " 0\n", @@ -2493,7 +2502,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...\n", " 0;8;6\n", " 9\n", " 1\n", @@ -2517,7 +2526,7 @@ " 0\n", " True\n", " False\n", - " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " Acetyl@Protein_N-term;Carbamidomethyl@C\n", " 0;6\n", " 9\n", " 
1\n", @@ -2637,7 +2646,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;7\n", " 11\n", " 0\n", @@ -2661,7 +2670,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;7\n", " 11\n", " 0\n", @@ -2685,7 +2694,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 11\n", " 0\n", @@ -2709,7 +2718,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 11\n", " 0\n", @@ -2791,8 +2800,8 @@ " 0\n", " 0.352144\n", " 0.352144\n", - " 402.555023\n", - " 0.994806\n", + " 402.554993\n", + " 0.994805\n", " 30.0\n", " Lumos\n", " 220\n", @@ -2815,7 +2824,7 @@ " 0\n", " 0.352144\n", " 0.352144\n", - " 482.206787\n", + " 482.206757\n", " 0.794435\n", " 30.0\n", " Lumos\n", @@ -2829,7 +2838,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;4\n", " 11\n", " 1\n", @@ -2839,7 +2848,7 @@ " 0\n", " 0.406691\n", " 0.406691\n", - " 414.260437\n", + " 414.260406\n", " 1.024166\n", " 30.0\n", " Lumos\n", @@ -2853,7 +2862,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term;Oxidation@M\n", + " Acetyl@Protein_N-term;Oxidation@M\n", " 0;4\n", " 11\n", " 1\n", @@ -2863,7 +2872,7 @@ " 0\n", " 0.406691\n", " 0.406691\n", - " 470.269653\n", + " 470.269684\n", " 0.775096\n", " 30.0\n", " Lumos\n", @@ -2877,7 +2886,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 11\n", " 1\n", @@ -2901,7 +2910,7 @@ " 1\n", " True\n", " True\n", - " Acetyl@Protein N-term\n", + " Acetyl@Protein_N-term\n", " 0\n", " 11\n", " 1\n", @@ -2911,7 +2920,7 @@ " 0\n", " 0.462864\n", " 0.462864\n", - " 469.226685\n", + " 469.226715\n", " 0.773290\n", " 30.0\n", " Lumos\n", @@ -3162,35 +3171,35 @@ "0 Oxidation@M 2 8 0 \n", "1 8 0 \n", "2 Carbamidomethyl@C 2 
8 0 \n", - "3 Acetyl@Protein N-term;Carbamidomethyl@C 0;2 8 0 \n", + "3 Acetyl@Protein_N-term;Carbamidomethyl@C 0;2 8 0 \n", "4 Oxidation@M 6 8 1 \n", "5 8 1 \n", "6 Carbamidomethyl@C 6 8 1 \n", - "7 Acetyl@Protein N-term;Carbamidomethyl@C 0;6 8 1 \n", + "7 Acetyl@Protein_N-term;Carbamidomethyl@C 0;6 8 1 \n", "8 Oxidation@M;Carbamidomethyl@C 1;3 9 0 \n", "9 Carbamidomethyl@C 3 9 0 \n", - "10 Acetyl@Protein N-term;Oxidation@M;Carbamidomet... 0;1;3 9 0 \n", - "11 Acetyl@Protein N-term;Carbamidomethyl@C 0;3 9 0 \n", + "10 Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... 0;1;3 9 0 \n", + "11 Acetyl@Protein_N-term;Carbamidomethyl@C 0;3 9 0 \n", "12 Oxidation@M;Carbamidomethyl@C 8;6 9 1 \n", "13 Carbamidomethyl@C 6 9 1 \n", - "14 Acetyl@Protein N-term;Oxidation@M;Carbamidomet... 0;8;6 9 1 \n", - "15 Acetyl@Protein N-term;Carbamidomethyl@C 0;6 9 1 \n", + "14 Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... 0;8;6 9 1 \n", + "15 Acetyl@Protein_N-term;Carbamidomethyl@C 0;6 9 1 \n", "16 Oxidation@M 7 11 0 \n", "17 Oxidation@M 7 11 0 \n", "18 11 0 \n", "19 11 0 \n", - "20 Acetyl@Protein N-term;Oxidation@M 0;7 11 0 \n", - "21 Acetyl@Protein N-term;Oxidation@M 0;7 11 0 \n", - "22 Acetyl@Protein N-term 0 11 0 \n", - "23 Acetyl@Protein N-term 0 11 0 \n", + "20 Acetyl@Protein_N-term;Oxidation@M 0;7 11 0 \n", + "21 Acetyl@Protein_N-term;Oxidation@M 0;7 11 0 \n", + "22 Acetyl@Protein_N-term 0 11 0 \n", + "23 Acetyl@Protein_N-term 0 11 0 \n", "24 Oxidation@M 4 11 1 \n", "25 Oxidation@M 4 11 1 \n", "26 11 1 \n", "27 11 1 \n", - "28 Acetyl@Protein N-term;Oxidation@M 0;4 11 1 \n", - "29 Acetyl@Protein N-term;Oxidation@M 0;4 11 1 \n", - "30 Acetyl@Protein N-term 0 11 1 \n", - "31 Acetyl@Protein N-term 0 11 1 \n", + "28 Acetyl@Protein_N-term;Oxidation@M 0;4 11 1 \n", + "29 Acetyl@Protein_N-term;Oxidation@M 0;4 11 1 \n", + "30 Acetyl@Protein_N-term 0 11 1 \n", + "31 Acetyl@Protein_N-term 0 11 1 \n", "32 Oxidation@M 6 13 1 \n", "33 Oxidation@M 6 13 1 \n", "34 13 1 \n", @@ -3269,12 +3278,12 
@@ "23 468.311920 0.771782 30.0 Lumos 190 200 \n", "24 400.909912 0.990859 30.0 Lumos 200 210 \n", "25 478.989624 0.789230 30.0 Lumos 210 220 \n", - "26 402.555023 0.994806 30.0 Lumos 220 230 \n", - "27 482.206787 0.794435 30.0 Lumos 230 240 \n", - "28 414.260437 1.024166 30.0 Lumos 240 250 \n", - "29 470.269653 0.775096 30.0 Lumos 250 260 \n", + "26 402.554993 0.994805 30.0 Lumos 220 230 \n", + "27 482.206757 0.794435 30.0 Lumos 230 240 \n", + "28 414.260406 1.024166 30.0 Lumos 240 250 \n", + "29 470.269684 0.775096 30.0 Lumos 250 260 \n", "30 417.726074 1.032617 30.0 Lumos 260 270 \n", - "31 469.226685 0.773290 30.0 Lumos 270 280 \n", + "31 469.226715 0.773290 30.0 Lumos 270 280 \n", "32 421.076538 1.041983 30.0 Lumos 280 292 \n", "33 490.627533 0.809400 30.0 Lumos 292 304 \n", "34 423.214233 1.047176 30.0 Lumos 304 316 \n", @@ -3287,7 +3296,7 @@ "[40 rows x 26 columns]" ] }, - "execution_count": null, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -3326,7 +3335,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -3381,7 +3390,7 @@ " 1\n", " False\n", " True\n", - " Oxidation@M;Dimethyl@Any N-term\n", + " Oxidation@M;Dimethyl@Any_N-term\n", " 2;0\n", " 8\n", " 0\n", @@ -3391,7 +3400,7 @@ " 0\n", " 0.242660\n", " 0.242660\n", - " 345.390839\n", + " 345.390869\n", " 0.850135\n", " 30.0\n", " Lumos\n", @@ -3405,7 +3414,7 @@ " 1\n", " False\n", " True\n", - " Dimethyl:2H(6)13C(2)@Any N-term\n", + " Dimethyl:2H(6)13C(2)@Any_N-term\n", " 0\n", " 8\n", " 0\n", @@ -3429,7 +3438,7 @@ " 1\n", " False\n", " True\n", - " Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term\n", + " Oxidation@M;Dimethyl:2H(6)13C(2)@Any_N-term\n", " 2;0\n", " 8\n", " 0\n", @@ -3453,7 +3462,7 @@ " 1\n", " False\n", " True\n", - " Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term\n", + " Oxidation@M;Dimethyl:2H(6)13C(2)@Any_N-term\n", " 6;0\n", " 8\n", " 1\n", @@ -3463,7 +3472,7 @@ " 2\n", " 0.040846\n", " 
0.040846\n", - " 319.400330\n", + " 319.400391\n", " 0.786163\n", " 30.0\n", " Lumos\n", @@ -3477,7 +3486,7 @@ " 1\n", " False\n", " True\n", - " Dimethyl:2H(6)13C(2)@Any N-term\n", + " Dimethyl:2H(6)13C(2)@Any_N-term\n", " 0\n", " 8\n", " 1\n", @@ -3525,7 +3534,7 @@ " 2\n", " False\n", " True\n", - " Dimethyl@Any N-term;Dimethyl@K\n", + " Dimethyl@Any_N-term;Dimethyl@K\n", " 0;8\n", " 13\n", " 1\n", @@ -3535,8 +3544,8 @@ " 0\n", " 0.620949\n", " 0.620949\n", - " 430.461273\n", - " 1.065108\n", + " 430.461243\n", + " 1.065107\n", " 30.0\n", " Lumos\n", " 692\n", @@ -3549,7 +3558,7 @@ " 2\n", " False\n", " True\n", - " Oxidation@M;Dimethyl@Any N-term;Dimethyl@K\n", + " Oxidation@M;Dimethyl@Any_N-term;Dimethyl@K\n", " 6;0;8\n", " 13\n", " 1\n", @@ -3559,7 +3568,7 @@ " 0\n", " 0.468698\n", " 0.468698\n", - " 482.796692\n", + " 482.796661\n", " 0.796481\n", " 30.0\n", " Lumos\n", @@ -3573,7 +3582,7 @@ " 2\n", " False\n", " True\n", - " Oxidation@M;Dimethyl@Any N-term;Dimethyl@K\n", + " Oxidation@M;Dimethyl@Any_N-term;Dimethyl@K\n", " 6;0;8\n", " 13\n", " 1\n", @@ -3583,7 +3592,7 @@ " 0\n", " 0.468698\n", " 0.468698\n", - " 428.150757\n", + " 428.150787\n", " 1.059489\n", " 30.0\n", " Lumos\n", @@ -3597,7 +3606,7 @@ " 2\n", " False\n", " True\n", - " Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)...\n", + " Dimethyl:2H(6)13C(2)@Any_N-term;Dimethyl:2H(6)...\n", " 0;5\n", " 13\n", " 0\n", @@ -3621,7 +3630,7 @@ " 2\n", " False\n", " True\n", - " Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)...\n", + " Dimethyl:2H(6)13C(2)@Any_N-term;Dimethyl:2H(6)...\n", " 0;5\n", " 13\n", " 0\n", @@ -3658,17 +3667,17 @@ "79 FGHIKLMNPQRST 0 2 False True \n", "\n", " mods mod_sites nAA decoy \\\n", - "0 Oxidation@M;Dimethyl@Any N-term 2;0 8 0 \n", - "1 Dimethyl:2H(6)13C(2)@Any N-term 0 8 0 \n", - "2 Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term 2;0 8 0 \n", - "3 Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term 6;0 8 1 \n", - "4 Dimethyl:2H(6)13C(2)@Any N-term 0 8 1 \n", + "0 
Oxidation@M;Dimethyl@Any_N-term 2;0 8 0 \n", + "1 Dimethyl:2H(6)13C(2)@Any_N-term 0 8 0 \n", + "2 Oxidation@M;Dimethyl:2H(6)13C(2)@Any_N-term 2;0 8 0 \n", + "3 Oxidation@M;Dimethyl:2H(6)13C(2)@Any_N-term 6;0 8 1 \n", + "4 Dimethyl:2H(6)13C(2)@Any_N-term 0 8 1 \n", ".. ... ... ... ... \n", - "75 Dimethyl@Any N-term;Dimethyl@K 0;8 13 1 \n", - "76 Oxidation@M;Dimethyl@Any N-term;Dimethyl@K 6;0;8 13 1 \n", - "77 Oxidation@M;Dimethyl@Any N-term;Dimethyl@K 6;0;8 13 1 \n", - "78 Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)... 0;5 13 0 \n", - "79 Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)... 0;5 13 0 \n", + "75 Dimethyl@Any_N-term;Dimethyl@K 0;8 13 1 \n", + "76 Oxidation@M;Dimethyl@Any_N-term;Dimethyl@K 6;0;8 13 1 \n", + "77 Oxidation@M;Dimethyl@Any_N-term;Dimethyl@K 6;0;8 13 1 \n", + "78 Dimethyl:2H(6)13C(2)@Any_N-term;Dimethyl:2H(6)... 0;5 13 0 \n", + "79 Dimethyl:2H(6)13C(2)@Any_N-term;Dimethyl:2H(6)... 0;5 13 0 \n", "\n", " charge ... i_5 mono_isotope_idx rt_pred rt_norm_pred \\\n", "0 2 ... 0.001352 0 0.242660 0.242660 \n", @@ -3684,22 +3693,22 @@ "79 3 ... 0.058123 2 0.206957 0.206957 \n", "\n", " ccs_pred mobility_pred nce instrument frag_start_idx frag_stop_idx \n", - "0 345.390839 0.850135 30.0 Lumos 0 7 \n", + "0 345.390869 0.850135 30.0 Lumos 0 7 \n", "1 313.133270 0.770554 30.0 Lumos 7 14 \n", "2 314.302277 0.773615 30.0 Lumos 14 21 \n", - "3 319.400330 0.786163 30.0 Lumos 21 28 \n", + "3 319.400391 0.786163 30.0 Lumos 21 28 \n", "4 320.333069 0.788271 30.0 Lumos 28 35 \n", ".. ... ... ... ... ... ... 
\n", - "75 430.461273 1.065108 30.0 Lumos 692 704 \n", - "76 482.796692 0.796481 30.0 Lumos 704 716 \n", - "77 428.150757 1.059489 30.0 Lumos 716 728 \n", + "75 430.461243 1.065107 30.0 Lumos 692 704 \n", + "76 482.796661 0.796481 30.0 Lumos 704 716 \n", + "77 428.150787 1.059489 30.0 Lumos 716 728 \n", "78 412.858307 1.021552 30.0 Lumos 728 740 \n", "79 478.660187 0.789583 30.0 Lumos 740 752 \n", "\n", "[80 rows x 27 columns]" ] }, - "execution_count": null, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -3707,8 +3716,8 @@ "source": [ "_lib.import_and_process_protein_dict(protein_dict)\n", "_lib.add_peptide_labeling({\n", - " 'light': ['Dimethyl@Any N-term','Dimethyl@K'],\n", - " 'heavy': ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'],\n", + " 'light': ['Dimethyl@Any_N-term','Dimethyl@K'],\n", + " 'heavy': ['Dimethyl:2H(6)13C(2)@Any_N-term','Dimethyl:2H(6)13C(2)@K'],\n", "})\n", "_lib.predict_all()\n", "assert (_lib.precursor_df.decoy==1).any()\n", @@ -3732,6 +3741,18 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" } }, "nbformat": 4, diff --git a/nbs_tests/psm_frag_reader/maxquant_frag_reader.ipynb b/nbs_tests/psm_frag_reader/maxquant_frag_reader.ipynb index 80dda52d..a43bcad3 100644 --- a/nbs_tests/psm_frag_reader/maxquant_frag_reader.ipynb +++ b/nbs_tests/psm_frag_reader/maxquant_frag_reader.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -27,16 +27,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", 
+ "text": [ + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + } + ], "source": [ "from peptdeep.psm_frag_reader.maxquant_frag_reader import *" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -48,9 +56,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " lambda x: parse_phos_probs(x[0], x[1], prob), axis=1\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0. 0.34720501 0.54503546 0.14126802 0.17500845 0.1020231\n", + " 0.04637072 0. 0. 0.01899846 0. ]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0.02471942 0.41737406 0.67116171 1. 0.37160414 0.59517672\n", + " 0.54813229 0. 
0.0606665 0.03838788 0.03735192]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0. 0.04495926 0.0213509 0.02114326 0.01335259 0.\n", + " 0. 0. 0. 0. 0. ]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0. 0. 0.51698907 0.87869409 0.14043304 0.1052603\n", + " 0.19786873 0. 0. 0. 0. 0.\n", + " 0. ]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n", + "/Users/wenfengzeng/workspace/peptdeep/peptdeep/psm_frag_reader/maxquant_frag_reader.py:141: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0. 0. 0. 0.54449196 0.2230503 0.\n", + " 0.30967216 0. 0. 0. 0. 0.\n", + " 0. 
]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.\n", + " self._fragment_intensity_df.iloc[start:end, :] = intens\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 18\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m mq_reader\u001b[38;5;241m.\u001b[39mpsm_df\u001b[38;5;241m.\u001b[39mmods\u001b[38;5;241m.\u001b[39mvalues[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m mq_reader\u001b[38;5;241m.\u001b[39mpsm_df\u001b[38;5;241m.\u001b[39mmod_sites\u001b[38;5;241m.\u001b[39mvalues[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 18\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m mq_reader\u001b[38;5;241m.\u001b[39mpsm_df\u001b[38;5;241m.\u001b[39mmods\u001b[38;5;241m.\u001b[39mvalues[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAcetyl@Protein N-term\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m mq_reader\u001b[38;5;241m.\u001b[39mpsm_df\u001b[38;5;241m.\u001b[39mmod_sites\u001b[38;5;241m.\u001b[39mvalues[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m0\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 20\u001b[0m seq \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAAAGPSNSSSGTSTPR\u001b[39m\u001b[38;5;124m'\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], "source": [ "#| hide\n", "mq_str = '''Raw file\tScan number\tScan 
index\tSequence\tLength\tMissed cleavages\tModifications\tModified sequence\tPhospho (STY) Probabilities\tPhospho (STY) Score Diffs\tAcetyl (Protein N-term)\tPhospho (STY)\tProteins\tGene Names\tProtein Names\tCharge\tFragmentation\tMass analyzer\tType\tScan event number\tIsotope index\tm/z\tMass\tMass Error [ppm]\tSimple Mass Error [ppm]\tRetention time\tPEP\tScore\tDelta score\tScore diff\tLocalization prob\tCombinatorics\tPIF\tFraction of total spectrum\tBase peak fraction\tPrecursor Full ScanNumber\tPrecursor Intensity\tPrecursor Apex Fraction\tPrecursor Apex Offset\tPrecursor Apex Offset Time\tDiagnostic peak Phospho (STY) Y\tMatches\tIntensities\tMass Deviations [Da]\tMass Deviations [ppm]\tMasses\tNumber of Matches\tIntensity coverage\tPeak coverage\tNeutral loss level\tETD identification type\tReverse\tAll scores\tAll sequences\tAll modified sequences\tid\tProtein group IDs\tPeptide ID\tMod. peptide ID\tEvidence ID\tPhospho (STY) site IDs\n", @@ -69,7 +114,7 @@ "assert 'frag_stop_idx' in mq_reader.psm_df.columns\n", "assert mq_reader.psm_df.mods.values[0] == ''\n", "assert mq_reader.psm_df.mod_sites.values[0] == ''\n", - "assert mq_reader.psm_df.mods.values[1] == 'Acetyl@Protein N-term'\n", + "assert mq_reader.psm_df.mods.values[1] in ('Acetyl@Protein_N-term', 'Acetyl@Protein N-term')\n", "assert mq_reader.psm_df.mod_sites.values[1] == '0'\n", "seq = 'AAAGPSNSSSGTSTPR'\n", "frag_types = raw_df[raw_df['Sequence']==seq]['Matches'].values[0].split(';')\n", @@ -496,6 +541,18 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" } }, "nbformat": 4, diff --git a/nbs_tests/spec_lib/library_factory.ipynb b/nbs_tests/spec_lib/library_factory.ipynb index 9e96cffe..bcf70221 100644 --- 
a/nbs_tests/spec_lib/library_factory.ipynb +++ b/nbs_tests/spec_lib/library_factory.ipynb @@ -23,6 +23,15 @@ "Factory classes to predict libraries from different sources (input file format)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/nbs_tests/spec_lib/predict_lib.ipynb b/nbs_tests/spec_lib/predict_lib.ipynb index 7fa38264..55faffad 100644 --- a/nbs_tests/spec_lib/predict_lib.ipynb +++ b/nbs_tests/spec_lib/predict_lib.ipynb @@ -33,6 +33,15 @@ "\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch # noqa: 401, to prevent crash in Mac Arm" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/nbs_tests/spec_lib/test_translate_tsv.ipynb b/nbs_tests/spec_lib/test_translate_tsv.ipynb index 514e2c9b..b9658a39 100644 --- a/nbs_tests/spec_lib/test_translate_tsv.ipynb +++ b/nbs_tests/spec_lib/test_translate_tsv.ipynb @@ -138,7 +138,7 @@ "charged_frag_types = ['b_z1','y_z1','y_modloss_z1']\n", "precursor_df = pd.DataFrame({\n", " 'sequence': ['ASGHCEWMKYR']*repeat+['ASGHCEWMAAR'],\n", - " 'mods': ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat+[''],\n", + " 'mods': ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat+[''],\n", " 'mod_sites': ['0;4;8']*repeat+[''],\n", " 'nAA': 11,\n", " 'NCE': 20,\n", diff --git a/peptdeep/hla/hla_class1.py b/peptdeep/hla/hla_class1.py index 3a9f5789..f093d53b 100644 --- a/peptdeep/hla/hla_class1.py +++ b/peptdeep/hla/hla_class1.py @@ -6,7 +6,7 @@ from typing import Union import peptdeep.model.building_block as building_block -from peptdeep.model.model_interface import ModelInterface +from peptdeep.model.model_interface import ModelInterface, append_nAA_column_if_missing from peptdeep.model.featurize import get_ascii_indices from 
peptdeep.pretrained_models import pretrain_dir, download_models, global_settings @@ -380,6 +380,42 @@ def predict_from_proteins( peptide_df["sequence"] = get_seq_series(peptide_df, self._cat_protein_sequence) return peptide_df + def _concat_neg_df(self, precursor_df, column_to_train="HLA"): + precursor_df = append_nAA_column_if_missing(precursor_df) + precursor_df[column_to_train] = 1 + df_list = [precursor_df] + for nAA, group_df in precursor_df.groupby("nAA"): + rnd_seqs = get_random_sequences( + self.protein_df, n=len(group_df), pep_len=nAA + ) + df_list.append( + pd.DataFrame({"sequence": rnd_seqs, "nAA": nAA, column_to_train: 0}) + ) + return pd.concat(df_list).reset_index(drop=True) + + def test(self, precursor_df): + df = self._concat_neg_df(precursor_df) + self.predict(df) + prob_list = [] + precision_list = [] + recall_list = [] + fp_list = [] + for prob in [0.5, 0.6, 0.7, 0.8, 0.9]: + prob_list.append(prob) + precision_list.append(df[df.HLA_prob_pred > prob].HLA.mean()) + recall_list.append(df[df.HLA_prob_pred > prob].HLA.sum() / len(df) * 2) + fp_list.append( + 1 - (1 - df[df.HLA_prob_pred < prob].HLA).sum() / len(df) * 2 + ) + return pd.DataFrame( + dict( + HLA_prob_pred=prob_list, + precision=precision_list, + recall=recall_list, + false_positive=fp_list, + ) + ) + def _download_pretrained_hla_model(self): download_models(url=self._model_url, target_path=self._model_zip) diff --git a/peptdeep/hla/hla_utils.py b/peptdeep/hla/hla_utils.py index d74d36b0..ae29f31f 100644 --- a/peptdeep/hla/hla_utils.py +++ b/peptdeep/hla/hla_utils.py @@ -95,12 +95,12 @@ def nonspecific_digest_cat_proteins( pd.DataFrame A dataframe sorted by `nAA` with three columns: `start_pos`: the start index of the peptide in cat_protein - `end_pos`: the stop/end index of the peptide in cat_protein + `stop_pos`: the stop/end index of the peptide in cat_protein `nAA`: the number of amino acids (peptide length). 
""" pos_starts, pos_ends = get_substring_indices(cat_sequence, min_len, max_len) - digest_df = pd.DataFrame(dict(start_pos=pos_starts, end_pos=pos_ends)) - digest_df["nAA"] = digest_df.end_pos - digest_df.start_pos + digest_df = pd.DataFrame(dict(start_pos=pos_starts, stop_pos=pos_ends)) + digest_df["nAA"] = digest_df.stop_pos - digest_df.start_pos digest_df.sort_values("nAA", inplace=True) digest_df.reset_index(inplace=True, drop=True) return digest_df @@ -170,7 +170,7 @@ def get_seq_series(idxes_df: pd.DataFrame, cat_prot: str) -> pd.Series: pd.Series pd.Series with sub-sequences (peptide sequences). """ - return idxes_df[["start_pos", "end_pos"]].apply( + return idxes_df[["start_pos", "stop_pos"]].apply( lambda x: cat_prot[slice(*x)], axis=1 ) diff --git a/peptdeep/model/model_interface.py b/peptdeep/model/model_interface.py index 993373ec..864e3370 100644 --- a/peptdeep/model/model_interface.py +++ b/peptdeep/model/model_interface.py @@ -79,19 +79,17 @@ def __init__( optimizer, num_warmup_steps, num_training_steps, num_cycles, last_epoch ) - def step(self, epoch: int, loss: float): + def step(self, epoch: int = None, loss=None): """ Get the learning rate for the next epoch. Parameters ---------- - epoch : int + epoch : int (Deprecated) The current epoch number. - loss : float - The loss value of the current epoch. 
""" - return self.lambda_lr.step(epoch) + return self.lambda_lr.step() def get_last_lr(self) -> float: """ diff --git a/peptdeep/protein/fasta.py b/peptdeep/protein/fasta.py index 160a2c72..ecf92fb8 100644 --- a/peptdeep/protein/fasta.py +++ b/peptdeep/protein/fasta.py @@ -21,7 +21,7 @@ def __init__( precursor_charge_max: int = 4, precursor_mz_min: float = 400.0, precursor_mz_max: float = 1800.0, - var_mods: list = ["Acetyl@Protein N-term", "Oxidation@M"], + var_mods: list = ["Acetyl@Protein_N-term", "Oxidation@M"], min_var_mod_num: int = 0, max_var_mod_num: int = 2, fix_mods: list = ["Carbamidomethyl@C"], diff --git a/tests/run_tests.sh b/tests/run_tests.sh index 6edc016c..c383dfc9 100644 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -1,2 +1,6 @@ -INCLUDED_NBS=$(find ../nbs_tests -name "*.ipynb") -python -m pytest --nbmake $(echo $INCLUDED_NBS) +TEST_NBS=$(find ../nbs_tests -name "*.ipynb") +TUTORIAL_NBS=$(find ../docs/tutorials -name "*.ipynb") + +ALL_NBS=$(echo $TEST_NBS$'\n'$TUTORIAL_NBS) + +python -m pytest --nbmake $(echo $ALL_NBS)