diff --git a/assess_swathlib.pl b/assess_swathlib.pl new file mode 100644 index 0000000..5b659fa --- /dev/null +++ b/assess_swathlib.pl @@ -0,0 +1,2091 @@ +#!/usr/bin/perl +# +# script to assess ion libraries (DIALib-QC) +# +=for comment + +Copyright (C) 2020 Moritz Lab, Institute for Systems Biology + +DIALib-QC.1.0 program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +=cut + + +# Import libraries +use strict; +use Data::Dumper; +use File::Basename; +use Getopt::Long; +use Text::ParseWords; + +my $t0 = time(); +print STDERR "Starting run\n"; + +# Test for presence of optional (non-core) regression module +my $has_regression = 1; +eval "use Statistics::Regression; 1" or $has_regression = 0; +if ( !$has_regression ) { + print STDERR "Unable to load module Statistics::Regression, skipping RT correlation analysis\n"; +} + +# Global vars +my %mpep2mz; # Global peptide/fragment mz hash +my %mpep2encoded; # Global modified_peptide to encoded modpep hash +my %opts = read_options(); # Command-line options +my %problem_assays; +my $kmods = get_known_mods(); # Known modification encodings + +# Variables for SWATH bin analysis +my %swaths; +my %swadefs; +my @swaths; + +# Constants +use constant WATER_MASS => 18.010565; +use constant H_MASS => 1.00727646688; + +if ( $opts{swath_file} ) { + open SWA, $opts{swath_file}; + while ( my $line = ) { + chomp $line; + my @line = split( /\s+/, $line ); + next unless $line[0] =~ /\d/; + next unless $line[1] =~ /\d/; + $swadefs{$line[0]} = $line[1]; + } + close SWA; +} +if ( $opts{swath_file} ) { + @swaths = ( qw( swa_defined swa_missing swa_conflict swa_ok swa_conflict_assay swa_5 swa_25 ) ); +} + +my %stats = ( mc => 0 ); # Hash of output values +# Initialize stats hash +for my $key ( qw( totpeps libpeps speps mc libprots sprots mprots precursor_ok precursor_bad fragment_ok fragment_bad fragment_na ), @swaths ) { + $stats{$key} = 0; +} + +# ppeps is a peptide/protein mapping file from reference DB; pr are proteotypic +my %ppeps = ( prots => {}, peps => {}, pr => {} ); + +# read in opional peptide digest file +if ( $opts{peptide_file} ) { + print STDERR "File $opts{peptide_file} does not exist\n" if ! -e $opts{peptide_file}; + + open PEPS, $opts{peptide_file}|| die; + while ( my $line = ) { + chomp $line; + my @line = split( /\t/, $line ); + $ppeps{peps}->{$line[1]}++; + if ( $line[0] !~ /,/ ) { + $ppeps{pr}->{$line[0]}++; + } + for my $prot ( split( /,/, $line[0] ) ) { + $ppeps{prots}->{$prot}++; + } + } + close PEPS; +} +$stats{libpeps} = scalar( keys( %{$ppeps{peps}} ) ); +$stats{libprots} = scalar( keys( %{$ppeps{prots}} ) ); +$stats{libprprots} = scalar( keys( %{$ppeps{pr}} ) ); # proteins with at least one proteotypic peptide + +my $libname = basename( $opts{ion_library} ); +$libname =~ s/\.tsv$//; +$libname =~ s/\.csv$//; + +# Open and read ion library file +open ILIB, $opts{ion_library}|| die; +my $head = 1; +my $type; + +my $dta_dir = $libname . '_dta'; +$dta_dir =~ s/\./_/g; +my $write_data = $opts{write} || 0; +if ( $write_data ) { + if ( -e "$libname.mgf" ) { + print "MGF file exists, exiting\n"; + exit; + } + open MGF, ">$libname.mgf" || die; +} + +my $curr; +my %dta = ( frg => {}, pre => {z => '', mz => ''} ); +my $pepkey; + +# hash of all accessions recorded in ion library +my %prots; + +my %peps = ( sing => {}, + mult => {}, + pre_z => {}, # precursor charge + frg_z => {}, # fragment charge + frg_n => {}, # fragment number + frg_s => {}, # tally fragment series + frg_i => { cnt => 0, inten => 0 }, # Summed intensity of top 5 + ); + +my %rt = ( mseqs => {}, iRT => {} ); +my $irt_cnt = 0; +my %is_irt; +for my $irt ( qw( ADVTPADFSEWSK DGLDAASYYAPVR GAGSSEPVTGLDAK GTFIIDPAAVIR GTFIIDPGGVIR LFLQFGAQGSPFLK LGGNEQVTR TPVISGGPYEYR TPVITGAPYEYR VEATFGVDESNAK YILAGVENSK ) ) { + $is_irt{$irt}++; +} + +# Hash of recognized neutral losses to their masses. +my %losses = ( 'H2O' => 18.010565, + '1(+C+H4+O+S)' => 64.107989, + 'NH3' => 17.026549, + 'noloss' => 0 ); + +my %decoy = ( mixed => 0, decoy => 0, fwd => 0); +my %extrema = ( precursor_min => 10000, fragment_min => 10000, precursor_max => 0, fragment_max => 0 ); + +if ( $opts{print_mass_error} ) { + my $me = $opts{output_file} . ".mass_error"; + if ( -e $me ) { + print STDERR "mass error file $me exists, cannot overwrite\n"; + exit; + } + open ME, ">$me"; +} + + +my %cmap; # Map of column headings to number (nth column) +my %idx; # Map of target information to column index +my %massmap; +my %precursorstats; +my %ion_cnt; +my %tmax; +my $osw_altmod = 0; +my $is_CSV; +my $quote_char = ''; +while ( my $line = ) { + chomp $line; + my @line = parse_line( $line ); + if ( !defined $is_CSV ) { + if ( scalar( @line ) < 5 ) { + @line = parse_csv( $line ); + if ( scalar( @line ) < 5 ) { + print STDERR "Unknown file format, must be either TSV or CSV\n"; + exit 256; + } else { + $is_CSV++; + print STDERR "CSV format detected\n"; + if ( $line =~ /^(["']).*["']\s*$/ ) { + $quote_char = $1; + print STDERR "Quote character is $quote_char\n"; + } + } + } else { + $is_CSV = 0; + } + } + + if ( $head ) { + my $idx = 0; + for my $col ( @line ) { + $cmap{$col} = $idx++; + } + if ( !$line[1] ) { + print STDERR "Error: $ARGV[0] format invalid \n"; + } elsif ( $line =~ /Q1/ && $line =~ /modification_sequence/ ) { + $type = 'pv'; + print STDERR "Peakview library detected\n"; + } elsif ( $line =~ /PrecursorMz/ ) { + if ( $line =~ /transition_group_id/ ) { + print STDERR "OpenSWATH library detected\n"; + $type = 'os'; + } elsif ( $line =~ /iRT Value/ || $line =~ /ReferenceRun/ ) { + print STDERR "Spectronaut library detected\n"; + $type = 'sn'; + } elsif ( $line =~ /FragmentLossType/ ) { + print STDERR "Prosit/Fusion library format format detected\n"; + $type = 'pf'; # 'Generic' + } else { + print STDERR "Error: $ARGV[0] format unknown\n"; + } + if ( $type eq 'os' ) { + if ( !defined $cmap{FullUniModPeptideName} ) { + $cmap{FullUniModPeptideName} = $cmap{FullPeptideName}; + } + if ( !defined $cmap{PrecursorCharge} ) { + $cmap{PrecursorCharge} = $cmap{Charge}; + } + if ( !defined $cmap{Tr_recalibrated} ) { + $cmap{Tr_recalibrated} = $cmap{RetentionTime}; + } + die "Can't find peptide name" if !defined $cmap{FullUniModPeptideName}; + } + } else { + print STDERR "Error: $ARGV[0] format invalid \n"; + } + exit unless $type; + $head = 0; + + # Modifed to fetch indexes because of perl default array indexing could hide issues. + # find_index gets up to 3 args: arrayref of possible headings, checked in order, common + # name of item being sought (for debug messages), and (opt) whether it is OK to have null + $idx{frg_z} = find_index( [qw(frg_z FragmentCharge ProductCharge)], 'Fragment ion charge' ); + $idx{ion_s} = find_index( ['frg_type','FragmentType','FragmentIonType'], 'Ion series' ); + $idx{ion_n} = find_index( ['frg_nr','FragmentNumber','FragmentSeriesNumber'], 'Ion series number' ); + $idx{mseq} = find_index( ['modification_sequence','FullUniModPeptideName', 'PeptideModifiedSequence', 'ModifiedPeptide','IntModifiedPeptide'], 'Modified peptide sequence' ); + $idx{seq} = find_index( ['stripped_sequence','PeptideSequence', 'StrippedPeptide'], 'Stripped peptide sequence' ); + $idx{precursor} = find_index( ['Q1','PrecursorMz'], 'Precursor m/z' ); + $idx{fragment} = find_index( ['Q3','ProductMz', 'FragmentMz'], 'Fragment m/z' ); + $idx{rt} = find_index( ['RT_detected','Tr_recalibrated', 'iRT Value', 'iRT'], 'Retention time' ); + $idx{pre_z} = find_index( [qw(prec_z PrecursorCharge)], 'Precursor ion charge' ); + # Missing value OK for these three + $idx{protstr} = find_index( ['uniprot_id','protein_name','ProteinName','Protein Name','ProteinId', 'UniprotID', 'UniProtIds' ],'Protein name(s)',1 ); + $idx{decoy} = find_index( ['decoy'], 'Decoy',1 ); + $idx{lossType} = find_index( [qw( FragmentLossType ) ], 'Fragment neutral loss', 1 ); + next; + } + + # OS decoy skipping column triage + if ( $opts{skip_decoys} ) { + if ( $type eq 'os' ) { + next if !defined $idx{decoy}; + if ( $line[$idx{decoy}] eq 'TRUE' || !$line[$idx{decoy}] ) { + next; + } + } + } + + ## Extract assay-related data based on library type + + # Precursor charge + my $pre_z = $line[$idx{pre_z}]; + $peps{pre_z}->{$pre_z}++; + + # Fragment charge + my $frg_z = $line[$idx{frg_z}]; + $peps{frg_z}->{$frg_z}++; + + # Ion series + my $ion_s = $line[$idx{ion_s}]; + $peps{frg_s}->{$ion_s}++; + + # Ion series number + my $ion_n = $line[$idx{ion_n}]; + $peps{frg_n}->{$ion_n}++; + + # Modified sequence + my $mseq = $line[$idx{mseq}]; + $mseq =~ s/^_//g; + $mseq =~ s/_$//g; + + # Stripped sequence + my $seq = $line[$idx{seq}]; + + # Retention time + my $rt = $line[$idx{rt}]; + + # Fragment m/z + my $fragment = $line[$idx{fragment}]; + + #Precursor m/z + my $precursor = $line[$idx{precursor}]; + $precursorstats{$precursor}++; + + # Protein string (may be multiple). + my $protstr = $line[$idx{protstr}]; + if ( !defined $idx{protstr} ) { + $protstr = 'Default'; + } + + my $lossType = ( defined $idx{lossType} ) ? $line[$idx{lossType}] : ''; + if ( $lossType && !defined( $losses{$lossType} ) ) { + print STDERR "unknown loss type $lossType specified, known values include:\n"; + for my $ls ( sort( keys( %losses ) ) ) { + print STDERR join( "\t", $ls, $losses{$ls} ) . "\n"; + } + exit 144; + } + + if ( $opts{debug} ) { + print STDERR "Dumping parsed values"; + print STDERR qq~ +protstr: $protstr +precursor: $precursor +fragment: $fragment +rt: $rt +seq: $seq +mseq: $mseq +pre_z: $pre_z +frg_s: $ion_s +frg_n: $ion_n +frg_z: $frg_z + +~; + + print STDERR "Dumping column mappings\n"; + for my $f ( sort( keys( %cmap ) ) ) { + print STDERR join( "\t", $cmap{$f}, $f, $line[$cmap{$f}] ) . "\n"; + } + exit; + } + + + # Mod seq + charge + $pepkey = $mseq . '_' . $pre_z; + $ion_cnt{$pepkey}++; + + if ( $type ne 'pv' && !$osw_altmod ) { + $osw_altmod++ if $mseq =~ /\[\d+\]/; + } + + # Do some clean-up on protein string (defined above) + $protstr =~ s/\"//g; + $protstr =~ s/^\d+\/*//; + $protstr ||= 'DEFAULT'; + if ( $type eq 'os' ) { + $protstr =~ s/;/,/g; + } + + my @prot; + if ( $protstr =~ /,/ ) { + @prot = split( /,/, $protstr, -1 ); + } else { + @prot = split( /\//, $protstr, -1 ); + } + +# fragment, Q3 plus intensity +# my $fragkey = "$line[1] $line[5]"; +# my $fragkey = ( $type eq 'pv' ) ? "$line[1] $line[5]" : "$line[$cmap{ProductMz}] $line[$cmap{}]"; +# my $ion_s = ( $type eq 'pv' ) ? "$line[9]" : "$line[15]"; + + $peps{t6_frg_s} ||= {}; + $peps{t6_frg_s}->{$ion_s}++ unless $ion_cnt{$pepkey} > 6; + + if ( $fragment > $precursor ) { + $stats{fragment_above_precursor}++; + } + $stats{fragment}++; + + my $swakey = $pepkey; + if ( $opts{swath_file} ) { + if ( !$swaths{$swakey} ) { + $swaths{$swakey} = []; + for my $min ( keys( %swadefs ) ) { + if ( $precursor >= $min && $precursor <= $swadefs{$min} ) { + push @{$swaths{$swakey}}, { min => $min, max => $swadefs{$min} }; + } + } + if ( !scalar( @{$swaths{$swakey}} ) ) { + $stats{swa_missing}++; + $problem_assays{$pepkey}++; + next; + } else { + $stats{swa_defined}++ + } + } + my $is_bad = 0; + for my $swa ( @{$swaths{$swakey}} ) { + if ( $fragment >= $swa->{min} && $fragment <= $swa->{max} ) { + print STDERR "$fragment ($precursor) in CONFLICT (suppressing further warnings)\n" if $opts{verbose} && !$stats{swa_conflict}; # Complain, but just once + $stats{swa_conflict}++; + $problem_assays{$pepkey}++; + $is_bad++; + } else { + $stats{swa_ok}++; + } + } + $stats{swa_conflict_assay} ||= {}; + $stats{swa_conflict_assay}->{$precursor}++ if $is_bad; +# $stats{swa_conflict_precursor}++ if $is_bad; + if ( abs( $precursor - $fragment ) <= 5 ) { + $stats{swa_5}++; + } elsif ( abs( $precursor - $fragment ) <= 25 ) { + $stats{swa_25}++; + } + } + +# Some hand-edited libraries might have blank lines... + next if $line =~ /^\s*$/; + + if ( !defined $fragment || !defined $precursor ) { + print STDERR "Unable to find prec a/o fragment ($precursor and $fragment) from $line\n"; + die; + } + + $extrema{precursor_max} = $precursor if $precursor > $extrema{precursor_max}; + $extrema{fragment_max} = $fragment if $fragment > $extrema{fragment_max}; + $extrema{precursor_min} = $precursor if $precursor < $extrema{precursor_min}; + $extrema{fragment_min} = $fragment if $fragment < $extrema{fragment_min}; + + + if ( $curr && $curr ne $pepkey ) { + + my $fcnt = 0; + for my $mz ( sort { $a <=> $b }( keys( %{$dta{frg}} ) ) ) { + $peps{frg_i}->{cnt}++; + $peps{frg_i}->{inten} += $dta{frg}->{$mz}; + last if $fcnt++ >= 5; + } + + if ( $write_data ) { + my $mw = sprintf( "%0.4f", $dta{pre}->{mz}/$dta{pre}->{z} ); + print MGF qq~BEGIN IONS +PEPMASS=$mw +CHARGE=$dta{pre}->{z} +TITLE=$pepkey +~; + for my $mz ( sort { $a <=> $b }( keys( %{$dta{frg}} ) ) ) { + print MGF join( "\t", $mz, $dta{frg}->{$mz} ) . "\n"; + } + print MGF "END IONS\n"; + } + if ( scalar(keys(%{$dta{frg}})) > 5 ) { + $stats{frg_cnt_ok}++; + } else { + $stats{frg_cnt_short}++; + } + %dta = ( frg => {}, pre => {z => '', mz => ''} ); + } + $curr = $pepkey; + $dta{pre}->{mz} = $precursor; + $dta{pre}->{z} = $pre_z; + my $mz = sprintf( "%0.4f", $line[1]); + my $int = $line[5]; + $dta{frg}->{$mz} = $int; + + $tmax{$pepkey} ||= { idx => 1, + max => 0, + max_idx => 1 }; + if ( $int > $tmax{$pepkey}->{max} ) { + $tmax{$pepkey}->{max} = $int; + $tmax{$pepkey}->{max_idx} = $tmax{$pepkey}->{idx}; + } + $tmax{$pepkey}->{idx}++; + + if ( $opts{assess_massdiff} ) { + my $masskey = sprintf( "%0.4f", $precursor ); + $massmap{$pepkey} ||= {}; + my $skip = 0; + my ( $pseq, $pchg ) = split( /_/, $pepkey ); + my $eseq = encode_modifications( $pseq ); +# print STDERR "$pseq becomes $eseq \n" if $pseq ne $eseq; + my %ion2mz; + if ( $massmap{$pepkey}->{$masskey} ) { + if ( !$massmap{$pepkey}->{$masskey}->{precursor_ok} ) { + $problem_assays{$pepkey}++; + +# Misleading because known fragments labeled as na - revisit if performance becomes an issue +# # Issue with this precursor already documented, no use checking fragment +# $skip++ unless $opts{correct_mz}; + } + } else { + + my $pmz = get_peptide_mass( $eseq, $pchg, 1 ); + my $ions = generate_ions( $eseq ); + + for ( my $fchg = 1; $fchg <= $pchg; $fchg++ ) { + for my $yion ( keys( %{$ions->{y}} ) ) { + my $ion_key = 'y' . $yion . '_' . $fchg; + $ion2mz{$ion_key} = get_peptide_mass( $ions->{y}->{$yion}, $fchg ); + } + for my $bion ( keys( %{$ions->{b}} ) ) { + my $ion_key = 'b' . $bion . '_' . $fchg; + $ion2mz{$ion_key} = get_peptide_mass( $ions->{b}->{$bion}, $fchg ) - WATER_MASS/$fchg; + } + } + + my $delta = abs( $precursor - $pmz ); + if ( $delta ) { + my $dawn = $delta * 200000; + if ( $dawn > $precursor - 0 ) { # More than 5ppm different + $massmap{$pepkey}->{$masskey}->{precursor_ok} = 0; + $problem_assays{$pepkey}++; + if ( $opts{print_mass_error} ) { + print ME join( "\t", "Precursor", $pepkey, $eseq, $pchg, $precursor, $pmz, $delta, $dawn ) . "\n"; + } + print STDERR "$pepkey has precursor error with $delta from $pmz and $precursor\n" if $opts{verbose}; + $stats{precursor_bad}++; + +# Misleading because known fragments labeled as na - revisit if performance becomes an issue +# # Issue with this precursor already documented, no use checking fragment +# $skip++ unless $opts{correct_mz}; + } else { + $massmap{$pepkey}->{$masskey}->{precursor_ok} = 1; + $stats{precursor_ok}++; + } + } else { + $massmap{$pepkey}->{$masskey}->{precursor_ok} = 1; + $stats{precursor_ok}++; + } + } + unless ( $skip ) { + if ( !defined $massmap{$pepkey}->{$masskey}->{peaks} ) { + $massmap{$pepkey}->{$masskey}->{peaks} = \%ion2mz; + } + } + my $peak_key = $ion_s . $ion_n . '_' . $frg_z; +# if ( $massmap{$pepkey}->{$masskey}->{peaks}->{$ion_s . $ion_n . '_' . $frg_z} ) { + if ( $massmap{$pepkey}->{$masskey}->{peaks}->{$peak_key} ) { + my $tmz = $massmap{$pepkey}->{$masskey}->{peaks}->{$peak_key}; + if ( $lossType ) { + $tmz -= $losses{$lossType}/$frg_z; + } + my $tdelta = abs( $tmz - $fragment ); + $stats{delta_cnt}++; + $stats{delta_sum} += $tdelta; + if ( $tdelta ) { + if ( $tdelta * 1000000 > $fragment ) { # More than 1ppm different + $stats{fragment_bad}++; + $problem_assays{$pepkey}++; + if ( $opts{print_mass_error} ) { + print ME join( "\t", "Fragment", $pepkey, $ion_s . $ion_n . '_' . $frg_z, $tmz, $fragment, $tdelta, $lossType ) . "\n"; + } + } else { + $stats{fragment_ok}++; + } + } else { + $stats{fragment_ok}++; + } + } else { + print STDERR "Can't find $peak_key for $pepkey and $masskey\n"; +# die Dumper( $massmap{$pepkey}->{$masskey}->{peaks} ); + + $stats{fragment_na}++; + } + } + + + # Cache trans per pep, pep per prot + my $mult = ( scalar( @prot ) > 1 ) ? 'mult' : 'sing'; + + my $fwd; + my $decoy; + for my $prot ( @prot ) { + $prots{$prot} ||= { sing => {}, mult => {}, stripped => {} }; + $prots{$prot}->{$mult}->{$pepkey}++; + $prots{$prot}->{stripped}->{$seq}++; + if ( $prot =~ /DECOY/ ) { + $decoy++; + } elsif ( $opts{alt_decoy} && $prot =~ /$opts{alt_decoy}/ ) { + $decoy++; + } else { + $fwd++; + } + } + if ( $decoy ) { + if ( $fwd ) { + $decoy{mixed}++; + } else { + $decoy{decoy}++; + } + } else { + $decoy{fwd}++; + } + + $peps{$mult}->{$pepkey}++; + if ( !$ion_s || ($ion_s !~ /y/i && $ion_s !~ /b/i ) ) { + $peps{ion_s}->{$pepkey}++; + } + + if ( !$ion_n || $ion_n < 3 ) { + $peps{ion_n}->{$pepkey}++; + } + +# 0 PrecursorMz [ 778.4129855 ] +# 1 ProductMz [ 498.26707303 ] +# 2 Tr_recalibrated [ 57.3 ] +# 3 transition_name [ 6_AAAAAAAAAAAAAAAASAGGK_2 ] +# 4 CE [ -1 ] +# 5 LibraryIntensity [ 7324.6 ] +# 6 transition_group_id [ 1_AAAAAAAAAAAAAAAASAGGK_2 ] +# 7 decoy [ 0 ] +# 8 PeptideSequence [ AAAAAAAAAAAAAAAASAGGK ] +# 9 ProteinName [ 1/P0CG40 ] +# 10 Annotation [ b7/-0.009,b14^2/-0.009,m10:16/-0.009 ] +# 11 FullUniModPeptideName [ AAAAAAAAAAAAAAAASAGGK ] +# 12 PrecursorCharge [ 2 ] +# 13 GroupLabel [ light ] +# 14 UniprotID [ 1/P0CG40 ] +# 15 FragmentType [ b ] +# 16 FragmentCharge [ 1 ] +# 17 FragmentSeriesNumber [ 7 ] + + + if ( $is_irt{$seq} ) { + $rt{irt}->{$seq}++; + $irt_cnt++; + } + $rt{mseqs}->{$mseq} ||= {}; + $rt{mseqs}->{$mseq}->{$pre_z} ||= $rt; +} +for my $ext ( keys( %extrema ) ) { + $extrema{$ext} = sprintf( "%0.1f", $extrema{$ext} ); +} + +my $tmax_sum; +my $tmax_cnt; +for my $key ( keys( %tmax ) ) { + $tmax_cnt++; + $tmax_sum += $tmax{$key}->{max_idx}; +} +my $tmax_avg = sprintf( "%0.1f", $tmax_sum/$tmax_cnt ); + +# Finish with last accumulated pepion +close ILIB; + +if ( $opts{print_mass_error} ) { + close ME; +} + +# RT data calculation +my $n_irt = scalar( keys( %{$rt{irt}} ) ); + +my @rt_all; +my @rt_two; +my @rt_three; +my %pepions; +for my $mseq ( keys( %{$rt{mseqs}} ) ) { + for my $ch ( keys( %{$rt{mseqs}->{$mseq}} ) ) { + $pepions{$mseq . '_' . $ch}++; + push @rt_all, $rt{mseqs}->{$mseq}->{$ch}; + } + next unless $rt{mseqs}->{$mseq}->{2} && $rt{mseqs}->{$mseq}->{3}; + push @rt_two, $rt{mseqs}->{$mseq}->{2}; + push @rt_three, $rt{mseqs}->{$mseq}->{3}; +} + +@rt_all = sort { $a <=> $b } ( @rt_all ); +my $rt_min = $rt_all[0]; +my $rt_max = $rt_all[$#rt_all]; +my $rt_med = $rt_all[int($#rt_all/2)]; + +my $rsq = 'na'; +my $rt_five = 'na'; +if ( scalar( @rt_two ) < 8 ) { + print STDERR "Too few points: ". scalar( @rt_two ) . " to run regression \n"; +} else { + if ( $opts{rt_stats} ) { + my $rt = $opts{output_file} . ".RT"; + if ( -e $rt ) { + print STDERR "RT stats file $rt exists, cannot overwrite\n"; + exit; + } + print STDERR "Printing +2/+3 pairs to $rt\n"; + open RT, ">$rt"; + } + + my $reg; + if ( $has_regression ) { + $reg = Statistics::Regression->new( 'Y', [ 'Z', 'X' ] ); + } + my $imperfect = 0; + my $ok = 0; + my $ok_delta = 5; + for ( my $i = 0; $i <= $#rt_two; $i++ ) { + print RT join( "\t", $rt_two[$i], $rt_three[$i] ) . "\n" if $opts{rt_stats}; + + if ( $has_regression ) { + $reg->include( $rt_two[$i], [ 1.0, $rt_three[$i] ] ); + } + $imperfect++ unless $rt_two[$i] == $rt_three[$i]; + $ok++ if abs( $rt_two[$i] - $rt_three[$i] ) <= $ok_delta; + } + close RT if $opts{rt_stats}; + $rsq = 1.000; + if ( !$has_regression ) { + $rsq = 'NoStats'; + print STDERR "Unable to run regression without Statistics::Regression\n" if $opts{verbose}; + } elsif ( $imperfect ) { + print STDERR "Running regression with " . scalar( @rt_two ) . " Total points\n" if $opts{verbose}; + $rsq = sprintf( "%0.3f", $reg->rsq() ); + } else { + print STDERR "Skipping regression with " . scalar( @rt_two ) . " identical values\n" if $opts{verbose}; + } + if ( $rt_max < 600 ) { + print STDERR "Assuming rt in minutes due to $rt_max\n"; + } else { + print STDERR "Assuming rt in seconds\n"; + $ok_delta *= 60; + } + $rt_five = sprintf( "%0.1f", ($ok/scalar(@rt_two))*100 ); +} + +my $dtot; +for my $dtype ( qw( fwd decoy mixed ) ) { + $dtot += $decoy{$dtype}; +} +my $decoy_pct = ( $decoy{decoy} ) ? sprintf( "%0.1f", 100*($decoy{decoy}/$dtot)) : 0; +my $mixed_pct = ( $decoy{mixed} ) ? sprintf( "%0.1f", 100*($decoy{mixed}/$dtot)) : 0; +my $fwd_pct = ( $decoy{fwd} ) ? sprintf( "%0.1f", 100*($decoy{fwd}/$dtot)) : 0; + + +my $fcnt = 0; +for my $mz ( sort { $a <=> $b }( keys( %{$dta{frg}} ) ) ) { + $peps{frg_i}->{cnt}++; + $peps{frg_i}->{inten} += $dta{frg}->{$mz}; + last if $fcnt++ >= 5; +} + +if ( $write_data ) { + my $mw = sprintf( "%0.4f", $dta{pre}->{mz} ); + print MGF qq~BEGIN IONS +PEPMASS=$mw +CHARGE=$dta{pre}->{z} +TITLE=$pepkey +~; + for my $mz ( sort { $a <=> $b }( keys( %{$dta{frg}} ) ) ) { + print MGF join( "\t", $mz, $dta{frg}->{$mz} ) . "\n"; + } + print MGF "END IONS\n"; + close MGF; +} + + +$peps{stripped_len} = {}; +my %stripped; +for my $type ( qw( sing mult ) ) { + for my $pep ( keys( %{$peps{$type}} ) ) { + $stats{$type}->{pepcnt}++; + my $tpep = $pep; + $tpep =~ s/\[[^]]+\]//g; + $tpep =~ s/\([^)]+\)//g; + $tpep =~ s/\d//g; + $tpep =~ s/_//g; + $tpep =~ s/-//g; + if ( $tpep !~ /^\w+$/ ) { + die "Issue with $pep: stripped value is $tpep\n"; + } +# $stats{ 'str_' . $type }; + $stripped{$tpep}++; + my $tlen = length( $tpep ); + $peps{stripped_len}->{$tlen}++; + $stats{$type}->{len} += $tlen; + $stats{$type}->{frags} += $peps{$type}->{$pep}; + } +} + +# Which ppeps were seeen? +my %seen = ( prots => {}, mprots => {}, peps => {}, mpeps => {} ); +for my $sp ( keys( %stripped ) ) { + if ( $ppeps{peps}->{$sp} ) { + $seen{peps}->{$sp}++; + } else { + $seen{mpeps}->{$sp}++; + } + if ( $sp =~ /[KR][^P]/ ) { + $stats{mc}++; + } +} + +#for my $p ( sort( keys( %prots ) ) ) { +# my $pcnt = scalar( keys( %{$prots{$p}->{stripped}} ) ); +# print join( "\t", $p, $pcnt ) . "\n"; +#} +#exit; + +# How many prots were seen +my %pepsperprot; +my @protcnts; +my $prottot; +my %protstats; +for my $prot ( keys( %prots ) ) { + $pepsperprot{$prot} = scalar( keys( %{$prots{$prot}->{mult}} ) ); + $pepsperprot{$prot} += scalar( keys( %{$prots{$prot}->{sing}} ) ); + if ( $pepsperprot{$prot} < 0 ) { + print STDERR "$prot\n"; +# print STDERR Dumper( $prots{$prot} ); +# exit; + } + $prottot += $pepsperprot{$prot}; + push @protcnts, $pepsperprot{$prot}; + $protstats{$pepsperprot{$prot}}++; + if ( $ppeps{prots}->{$prot} ) { + $seen{prots}->{$prot}++; + } elsif ( $prot =~ /RT-Cal/ ) { + $seen{prots}->{$prot}++; + } else { + $seen{mprots}->{$prot}++; + } +} + + +my @sorted = sort( { $a <=> $b } @protcnts ); +my $half = int( $#protcnts/2 + 1 ); +my $med = $sorted[$half]; +my $mean = sprintf( "%0.1f", $prottot/(1+$#protcnts) ); +my $devs; +for ( my $i = 0; $i <= $#protcnts; $i++ ) { + $devs += ( $protcnts[$i] - $mean )**2; +} +my $stddev = sprintf( "%0.1f", ($devs/(1+$#protcnts))**0.5 ); + +my $overprots = 0; +for my $cnt ( sort { $b <=> $a } keys( %protstats ) ) { + last if $cnt < ( $mean + ( 3 * $stddev ) ); + $overprots += $protstats{$cnt}; +} + + + + +$stats{totpeps} = scalar( keys( %stripped ) ); +$stats{sprots} = scalar( keys( %{$seen{prots}} ) ); +$stats{mprots} = scalar( keys( %{$seen{mprots}} ) ); +if ( $stats{libprots} < 1 ) { + $stats{sprots} = 'N/A'; + $stats{mprots} = 'N/A'; +} + +if ( $opts{print_ux} ) { + my $ux = $opts{output_file} . ".UX_prots"; + if ( -e $ux ) { + print STDERR "Unexplained proteins file $ux exists, cannot overwrite\n"; + exit; + } + print STDERR "Printing 'unexplained' proteins to $ux\n"; + open UX, ">$ux"; + for my $p ( sort( keys( %{$seen{mprots}} ) ) ) { + print UX "$p\n"; + } + close UX; +} + + +#$stats{allprots} = $stats{sprots} + $stats{mprots}; +$stats{allprots} = scalar( keys( %prots ) ); + +#print join( "\n", sort( keys( %{$seen{mprots}} ) ) ) . "\n"; +$stats{speps} = scalar( keys( %{$seen{peps}} ) ); +$stats{mpeps} = scalar( keys( %{$seen{mpeps}} ) ); + + +# print join( "\t", qw( Library Pepions Ptyp Mult Ptyp_perc Mult_perc alt_ser low_nr alen afrg afrglen Tot_peps MC_peps Lib_peps Seen_peps Lib_prots Seen_prots UX_prots ) ) . "\n"; +# print join( "\t", qw( Library Pepions Ptyp Mult alt_ser low_nr alen afrg afrglen Ptyp_perc Mult_perc alt_ser_perc low_nr_perc ) ) . "\n"; + + $stats{sing}->{pepcnt} ||= 0; + $stats{mult}->{pepcnt} ||= 0; + $stats{sing}->{len} ||= 0; + $stats{mult}->{len} ||= 0; + my $pepcnt = $stats{sing}->{pepcnt} + $stats{mult}->{pepcnt}; + for my $type ( qw( sing mult ) ) { + $stats{$type}->{perc} = sprintf( "%0.2f", 100 * $stats{$type}->{pepcnt} / $pepcnt ); + } + $stats{mult}->{frags} ||= 0; + $stats{sing}->{frags} ||= 0; + my $tot_len = $stats{sing}->{len} + $stats{mult}->{len}; + my $tot_frags = $stats{sing}->{frags} + $stats{mult}->{frags}; + + my $alen = sprintf( "%0.1f", $tot_len/$pepcnt ); + my $afrg = sprintf( "%0.2f", $tot_frags/$pepcnt ); + my $afrglen = sprintf( "%0.2f", $afrg/$alen ); + + + # depricate s_cnt (alt series), instead do y and b seriest cnt/perc +# my $s_cnt = scalar(keys(%{$peps{ion_s}} ) ); +# my $s_perc = ( $s_cnt ) ? '0.00' : sprintf( "%0.2f", 100*($s_cnt/$pepcnt)); + my $y_cnt = $peps{frg_s}->{y} || 0; + my $b_cnt = $peps{frg_s}->{b} || 0; + my $y_perc = ( $tot_frags ) ? sprintf( "%0.1f", ($y_cnt/$tot_frags)*100 ) : 0.0; + my $b_perc = ( $tot_frags ) ? sprintf( "%0.1f", ($b_cnt/$tot_frags)*100 ) : 0.0; + + my $top_y = $peps{t6_frg_s}->{y} || 0; + my $top_b = $peps{t6_frg_s}->{b} || 0; + my $T6_y_perc = ( $top_y || $top_b ) ? sprintf( "%0.1f", (100*$top_y/($top_y+$top_b)) ) : 0.0; + + my $ainten = ( $peps{frg_i}->{cnt} ) ? sprintf( "%0.1f", $peps{frg_i}->{inten}/$peps{frg_i}->{cnt} ) : '0.0'; + + my $tot_pre = 0; + for my $z ( keys( %{$peps{pre_z}} ) ) { + next if $z eq 'peps'; + $tot_pre += $peps{pre_z}->{$z}; + } + my $pre_2 = ( $tot_pre && $peps{pre_z}->{2} ) ? sprintf( "%0.1f", 100*($peps{pre_z}->{2}/$tot_pre) ) : '0.0'; + my $pre_3 = ( $tot_pre && $peps{pre_z}->{3} ) ? sprintf( "%0.1f", 100*($peps{pre_z}->{3}/$tot_pre) ) : '0.0'; + + # The number of missing or < 3 frg numbers + my $n_cnt = scalar(keys(%{$peps{ion_n}})); + + my $short_perc = ( $stats{frg_cnt_short} ) ? sprintf( "%0.1f", ($stats{frg_cnt_short}/($stats{frg_cnt_short}+$stats{frg_cnt_ok})*100 )) : 0; + + my %mods; + my %mcnt; + my %mseen; + + + for my $mpep ( keys( %{$peps{sing}}), keys(%{$peps{mult}} ) ) { + + # Only count one charge state + $mpep =~ s/_\d+$//; + next if $mseen{$mpep}++; + $mcnt{all}++; + my $regex = '(\w?\[[^\]]+\])'; + if ( $mpep =~ /UniMod/ ) { + $regex = '(\w?\(UniMod:\d+\))'; + } else { + next unless $mpep =~ /\[/; + } + my $has_mods = 0; + for my $m ( $mpep =~ /$regex/ig ) { + $mods{$m}++; + $has_mods++; + } + $mcnt{mod}++ if ( $has_mods ); + } + my $mcnt = 0; + $peps{mods} = { none => $mcnt{all} - $mcnt{mod}, any => $mcnt{mod} }; + for my $mod ( sort( keys( %mods ) ) ) { + $mcnt += $mods{$mod}; + $peps{mods}->{$mod} = $mods{$mod}; + } + my $mpct = sprintf( "%0.1f", (($mcnt{mod}/$mcnt{all})*100) ); + print STDERR "Saw $mcnt{mod} peptides with modifications, $mcnt total mods in those peptides, and $mcnt{all} distinct modified peptides (includes non-mod)\n"; + + my $tot_fragment = $stats{fragment}; + my $fragment_above_pct = ( $stats{fragment_above_precursor} ) ? sprintf( "%0.3f", $stats{fragment_above_precursor}/$stats{fragment}) : 0; + + my @checks; + if ( $opts{assess_massdiff} ) { + @checks = ( qw( precursor_ok precursor_bad fragment_ok fragment_bad fragment_na fragment_avg_mdiff ) ); + } + + + my $problem_assays = scalar( keys( %problem_assays ) ); + my $n_perc = ( $n_cnt ) ? '0.00' : sprintf( "%0.2f", 100*($n_cnt/$pepcnt)); + my $pepion_cnt = scalar(keys(%pepions)) || 0; + my @headings = ( qw( library + format + pepions + fragments + ptp_percent + shared_percent + shared_pepions + peptides + mod_peps + mod_percent + total_mods + chg_2 + chg_3 + precursor_min + precursor_max + fragment_min + fragment_max + avg_len + avg_num_frags + avg_frag_len + short_perc + fragment_above_precursor + y_perc + b_perc + t6_y_perc + avg_intensity + rt_min + rt_max + rt_med + rt_rsq + rt_five + n_irt + irt_cnt + low_nr + db_peps + seen_peps + mc_peps + db_prots + lib_prots + seen_prots + ux_prots + decoy_pct + mixed_pct + fwd_pct + med_ppp + mean_ppp + stddev_ppp + 3_sigma_ppp + max_intensity_idx + ), @checks, @swaths, 'problem_assays' ); + + + my @checkvals = (); + if ( $opts{assess_massdiff} ) { + $stats{fragment_avg_mdiff} = ( $stats{delta_cnt} ) ? sprintf( "%0.4f", $stats{delta_sum}/$stats{delta_cnt}) : 0; + for my $k ( @checks ) { + push @checkvals, $stats{$k} || 0; + } + } + + my @swathvals = (); + if ( $opts{swath_file} ) { + for my $skey ( @swaths ) { + if ( $skey eq 'swa_conflict_assay' ) { + push @swathvals, scalar(keys(%{$stats{$skey}} ) ); + } else { + push @swathvals, $stats{$skey}; + } + } + } + + $rt_min = sprintf( "%0.1f", $rt_min ); + $rt_max = sprintf( "%0.1f", $rt_max ); + $rt_med = sprintf( "%0.1f", $rt_med ); + + my %type2sw = ( os => 'OpenSWATH', + pv => 'Peakview', + sn => 'Spectronaut', + pf => 'Prosit/Fusion' ); + + my @out_values = ( + $libname, + $type2sw{$type}, + $pepion_cnt, + $tot_fragment, + $stats{sing}->{perc}, + $stats{mult}->{perc}, + $stats{mult}->{pepcnt}, + $stats{totpeps}, + $mcnt{all}, + $mpct, + $mcnt, + $pre_2, + $pre_3, + $extrema{precursor_min}, + $extrema{precursor_max}, + $extrema{fragment_min}, + $extrema{fragment_max}, + $alen, + $afrg, + $afrglen, + $short_perc, + $fragment_above_pct, + $y_perc, + $b_perc, + $T6_y_perc, + $ainten, + $rt_min, + $rt_max, + $rt_med, + $rsq, + $rt_five, + $n_irt, + $irt_cnt, + $n_cnt, + $stats{libpeps}, + $stats{speps}, + $stats{mc}, + $stats{libprots}, + $stats{allprots}, + $stats{sprots}, + $stats{mprots}, + $decoy_pct, + $mixed_pct, + $fwd_pct, + $med, + $mean, + $stddev, + $overprots, +# die "over is $overprots med is $med, mean is $mean, and stddev is $stddev\n"; + $tmax_avg, + @checkvals, + @swathvals, + $problem_assays + ); + + my $defs = get_coldefs(); + my @defs; + for my $head ( @headings ) { + push @defs, $defs->{$head} || ''; + } + +# if ( !$opts{redirect} && !($opts{filter_assays} || $opts{correct_mz} ) ) { + if ( !$opts{redirect} ) { + open OUT, ">$opts{output_file}.QC"; + print STDERR "Opening $opts{output_file}.QC for writing!\n"; + if ( $opts{invert_output} ) { + for ( my $i = 0; $i <= $#headings; $i++ ) { + print OUT join( "\t", $headings[$i], $out_values[$i] ); + if ( $opts{coldefs} ) { + print OUT "\t$defs[$i]"; + } + print OUT "\n"; + } + } else { + print OUT join( "\t", @headings ) . "\n"; + print OUT join( "\t", @out_values ) . "\n"; + if ( $opts{coldefs} ) { + print OUT join( "\t", @defs ) . "\n"; + } + } + close OUT; +# } elsif ( !($opts{filter_assays} || $opts{correct_mz} ) ) { + } else { + if ( $opts{invert_output} ) { + for ( my $i = 0; $i <= $#headings; $i++ ) { + print join( "\t", $headings[$i], $out_values[$i] ); + if ( $opts{coldefs} ) { + print "\t$defs[$i]"; + } + print "\n"; + } + } else { + print join( "\t", @headings ) . "\n"; + print join( "\t", @out_values ) . "\n"; + if ( $opts{coldefs} ) { + print join( "\t", @defs ) . "\n"; + } + } + } + + if ( !$problem_assays ) { + print STDERR "No problem assays found!\n"; + } else { + print STDERR "$problem_assays problem assays found!\n"; + } + + # User has requested a clean library + if ( $opts{filter_assays} && $problem_assays ) { + + my $clean = $opts{ion_library} . ".clean"; + my $problem = $opts{ion_library} . ".problem"; + if ( -e $problem ) { + print STDERR "problem library $problem exists, cannot overwrite\n"; + } elsif ( -e $clean ) { + print STDERR "clean library $clean exists, cannot overwrite\n"; + } else { + open ILIB, $opts{ion_library}|| die; + + print STDERR "Printing problem assays to $problem\n"; + print STDERR "Printing good assays to $clean\n"; + open CLEANLIB, ">$clean" || die; + open PROBLEMLIB, ">$problem" || die; + my $head = 1; + while ( my $line = ) { + if ( $head ) { + print CLEANLIB $line; + print PROBLEMLIB $line; + $head = 0; + next; + } + chomp $line; +# my @line = split( /\t/, $line, -1 ); + my @line = parse_line( $line ); + + my $pre_z = $line[$idx{pre_z}]; + my $mseq = $line[$idx{mseq}]; + + my $pepkey = $mseq . '_' . $pre_z; + + if ( $problem_assays{$pepkey} ) { + print PROBLEMLIB "$line\n"; + } else { + print CLEANLIB "$line\n"; + } + } + close ILIB; + close CLEANLIB; + close PROBLEMLIB; + } + } + + # User has requested an mz-corrected library + if ( $opts{correct_mz} ) { + + if ( $stats{fragment_na} ) { + print STDERR "Some ($stats{fragment_na}) unknown fragment types were found, unable to do mz correction\n"; + exit; + } + unless ( $stats{fragment_bad} || $stats{precursor_bad} ) { + print STDERR "No m/z discrepancy with either precursor or fragment ions was found, cntl-d to stop run (10 seconds)"; + sleep 10; + } + + my $mz_corr = $opts{ion_library} . ".mz_corrected"; + if ( -e $mz_corr ) { + print STDERR "m/z corrected library $mz_corr exists, cannot overwrite\n"; + } else { + open ILIB, $opts{ion_library}|| die; + + print STDERR "Printing mz-adjusted assays to $mz_corr\n"; + open MZLIB, ">$mz_corr" || die; + my $head = 1; + my %mz_stats; + while ( my $line = ) { + if ( $head ) { + print MZLIB $line; + $head = 0; + next; + } + chomp $line; +# my @line = split( /\t/, $line, -1 ); + my @line = parse_line( $line ); + + # Pull out needed info + my $precursor = sprintf( "%0.4f", $line[$idx{precursor}] ); + my $fragment = sprintf( "%0.4f", $line[$idx{fragment}] ); + my $pre_z = $line[$idx{pre_z}]; + my $frg_z = $line[$idx{frg_z}]; + my $ion_s = $line[$idx{ion_s}]; + my $ion_n = $line[$idx{ion_n}]; + my $mseq = $line[$idx{mseq}]; + $mseq =~ s/^_//g; + $mseq =~ s/_$//g; + my $ion_key = $ion_s . $ion_n . '_' . $frg_z; + my $lossType = ( defined $idx{lossType} ) ? $line[$idx{lossType}] : ''; + + my $pepkey = $mseq . '_' . $pre_z; + my $eseq = encode_modifications( $mseq ); + my $pmz = get_peptide_mass( $eseq, $pre_z ); + my $imz = $massmap{$pepkey}->{$precursor}->{peaks}->{$ion_key}; + + # Adjust for neutral losses (if any) + $imz -= $losses{$lossType}/$frg_z if ( $lossType ); + +# print STDERR qq~ PRE: $precursor PRZ: $pre_z FRG: $fragment FRZ: $frg_z MSQ: $mseq PKY: $pepkey INS: $ion_s INN: $ion_n INK: $ion_key pMZ: $pmz fMZ: $imz ~; + + if ( !$pmz ) { + die "Unable to calculate precursor m/z from $line\n"; + } elsif ( !$imz ) { + die "Unable to calculate fragment ion m/z from $line\n"; + } + if ( $pmz != $precursor ) { + $mz_stats{prec_changed}++; + } else { + $mz_stats{prec_ok}++; + } + if ( $imz != $fragment ) { + $mz_stats{frag_changed}++; + } else { + $mz_stats{frag_ok}++; + } + + # Calc and assign fragment m/z + $line[$idx{precursor}] = $pmz; + $line[$idx{fragment}] = $imz; + + if ( $is_CSV ) { + print MZLIB $quote_char . join( $quote_char . ',' . $quote_char, @line ) . $quote_char . "\n"; + } else { + print MZLIB join( "\t", @line ) . "\n"; + } + } + close ILIB; + close MZLIB; + for my $s ( sort( keys( %mz_stats ) ) ) { + print STDERR "$s\t$mz_stats{$s}\n"; + } + } + } + + my $t1 = time(); + my $runtime = $t1 - $t0; + print STDERR "Finished run in $runtime seconds\n"; + exit unless $opts{full_stats}; + + my $fullstats = $opts{output_file} . ".fullstats"; + if ( -e $fullstats ) { + print STDERR "full_stats file $fullstats exists, cannot overwrite\n"; + exit; + } + print STDERR "Printing full stats to $fullstats\n"; + + open FULL, ">$fullstats"; + for my $key ( qw( frg_s mods ) ) { + my $line = "$key:"; + my $sep = ''; + for my $val ( sort ( keys( %{$peps{$key}} ) ) ) { + my $item = "$sep$val=$peps{$key}->{$val}"; + $item =~ s/:/_/g; + $line .= $item; + $sep = ','; + } + print FULL "$line\n"; + } + + + + for my $key ( qw( pre_z frg_z frg_n stripped_len ) ) { + my $line = "$key:"; + my $sep = ''; + my @keys = sort { $a <=> $b } keys( %{$peps{$key}} ); + my $min = $keys[0]; + my $max = $keys[$#keys]; + for ( my $idx = $min; $idx <= $max; $idx++ ) { + if ( $opts{zero_pad} ) { + $peps{$key}->{$idx} ||= 0; + } else { + next unless defined $peps{$key}->{$idx}; + } + $line .= "$sep$idx=$peps{$key}->{$idx}"; + $sep = ','; + } + print FULL "$line\n"; + } + my $pepsing = $peps{sing}; + my $pepmult = $peps{mult}; + my %pepchg; + for my $pep ( keys{%{$pepsing}}, keys{%{$pepmult}} ) { + my ( $mseq, $chg ) = split( /_/, $pep ); + $pepchg{$chg}++; + } + my $sep = ''; + my @keys = sort { $a <=> $b } keys( %pepchg ); + my $min = $keys[0]; + my $max = $keys[$#keys]; + my $line = 'Distinct_pre_z:'; + for ( my $idx = $min; $idx <= $max; $idx++ ) { + if ( $opts{zero_pad} ) { + $pepchg{$idx} ||= 0; + } else { + next unless defined $pepchg{$idx}; + } + $line .= "$sep$idx=$pepchg{$idx}"; + $sep = ','; + } + print FULL "$line\n"; + +# This section causes the numbers > 5 to be summed into the 5 bin. +# $protstats{$pepsperprot{$prot}}++; + my %top5; + for my $p ( keys( %protstats ) ) { + if ( $p >= 5 ) { + $top5{5} += $protstats{$p}; + } else { + $top5{$p} = $protstats{$p}; + } + } +# %protstats = %top5; + + + my $sep = ''; + my @keys = sort { $a <=> $b } keys( %protstats ); + my $min = $keys[0]; + my $max = $keys[$#keys]; + my $line = 'PepIonsPerProtein:'; + for ( my $idx = $min; $idx <= $max; $idx++ ) { + if ( $opts{zero_pad} ) { + $protstats{$idx} ||= 0; + } else { + next unless defined $protstats{$idx}; + } + $line .= "$sep$idx=$protstats{$idx}"; + $sep = ','; + } + print FULL "$line\n"; + + my %real_ppp; + for my $prot ( keys( %prots ) ) { + my $scnt = scalar( keys( %{$prots{$prot}->{stripped}} ) ); + # This statement causes the numbers > 5 to be summed into the 5 bin. + # $scnt = 5 if $scnt > 5; + $real_ppp{$scnt}++; + } + my $sep = ''; + my @keys = sort { $a <=> $b } keys( %real_ppp ); + my $min = $keys[0]; + my $max = $keys[$#keys]; + my $line = 'PepsPerProtein:'; + for ( my $idx = $min; $idx <= $max; $idx++ ) { + if ( $opts{zero_pad} ) { + $real_ppp{$idx} ||= 0; + } else { + next unless defined $real_ppp{$idx}; + } + $line .= "$sep$idx=$real_ppp{$idx}"; + $sep = ','; + } + print FULL "$line\n"; + + +# $precursorstats{$pepsperprot{$prot}}++; + my $sep = ''; + my %pre_bin; + for my $pr ( keys( %precursorstats ) ) { + for ( my $bin = 50; $bin <= 4000; $bin += 50 ) { + if ( $bin > $pr ) { + $pre_bin{$bin}++; + last; + } + } + } + my $line = 'PrecusorCounts:'; + for my $pre ( sort { $a <=> $b } keys(%pre_bin) ) { + $line .= "$sep$pre=$pre_bin{$pre}"; + $sep = ','; + } + print FULL "$line\n"; + +# $ion_cnt{pepkey}++; + my $sep = ''; + my %cnt_bins; + for my $mpep ( keys( %ion_cnt ) ) { + $cnt_bins{$ion_cnt{$mpep}}++; + } + my @keys = sort { $a <=> $b } keys( %cnt_bins ); + my $min = $keys[0]; + my $max = $keys[$#keys]; + my $line = 'FragmentsPerPrecursor:'; + for ( my $idx = $min; $idx <= $max; $idx++ ) { + if ( $opts{zero_pad} ) { + $cnt_bins{$idx} ||= 0; + } else { + next unless defined $cnt_bins{$idx}; + } + $line .= "$sep$idx=$cnt_bins{$idx}"; + $sep = ','; + } + print FULL "$line\n"; + + close FULL; +# End MAIN program # + +#+ +# Simple CSV parser +#- +sub parse_csv { + return quotewords( ",", 0, $_[0] ); +} + +#+ +# Line parser +#- +sub parse_line { + my $line = shift; + if ( $is_CSV ) { + return parse_csv( $line ); + } else { + return split( "\t", $line, -1 ); + } +} + +#+ +# Routine to get index of specific cmap fields +#- +sub find_index { + my $names = shift; + my $target = shift; + my $fail_ok = shift || 0; + my $idx; + for my $name ( @{$names} ) { + if ( defined( $cmap{$name} ) ) { + return $cmap{$name}; + } + } + return if $fail_ok; + my $opts = join( ',', @{$names} ); + print STDERR "Unable to find an index for $target in $opts; printing debug info\n\n"; + $opts{debug}++; +} + +#+ +# routine to read and validate options +#- +sub read_options { + + # Local hash of options + my %opts; + + GetOptions(\%opts,"ion_library:s", + "alt_decoy:s", + "peptide_file:s", + "write", + "output_file:s", + "filter_assays", + "correct_mz", + "full_stats", + "rt_stats", + "verbose", + 'assess_massdiff', + 'print_mass_error', + 'skip_decoys', + 'swath_file:s', + 'coldefs', + 'debug', + 'print_ux', + 'invert_output', + 'neutral_loss', + 'zero_pad', + 'help' ) || die "$'"; + + $opts{mono} = { G => 57.021464, + D => 115.02694, + A => 71.037114, + Q => 128.05858, + S => 87.032029, + K => 128.09496, + P => 97.052764, + E => 129.04259, + V => 99.068414, + M => 131.04048, + T => 101.04768, + H => 137.05891, + C => 103.00919, + F => 147.06841, + L => 113.08406, + R => 156.10111, + I => 113.08406, + N => 114.04293, + Y => 163.06333, + W => 186.07931 }; + + my $err = ''; + + # Check for required params + for my $opt ( qw( ion_library ) ) { + $err .= "Missing required option $opt\n" unless $opts{$opt}; + } + + if ( $opts{help} ) { + print_usage( "" ); + } elsif ( $err ) { + print_usage( $err ); + } + + # Make sure ion library file exists + if ( ! -e $opts{ion_library} ) { + print_usage( "File $opts{ion_library} does not exist\n" ); + } + + # Determine filename base + if ( !$opts{output_file} ) { + if ( -t STDOUT ) { + my $qc = "$opts{ion_library}.QC"; + if ( -e $qc ) { + print STDERR "QC file $qc exists, cannot overwrite - printing to STDOUT\n"; + } else { + $opts{output_file} = $qc; + } + } else { + $opts{redirect}++; + } + $opts{output_file} = $opts{ion_library}; + } + + + if ( $opts{filter_assays} ) { + for my $o ( qw( correct_mz full_stats rt_stats print_ux ) ) { + die "filter_assays is incompatible with correct_mz, full_stats, rt_stats, and print_ux" if $opts{$o}; + } + if ( !$opts{assess_massdiff} ) { + print STDERR "filter_assays option requires assess_massdiff, enabling\n"; + $opts{assess_massdiff}++; + } + } elsif ( $opts{correct_mz} ) { + for my $o ( qw( full_stats rt_stats print_ux ) ) { + die "correct_mz is incompatible with full_stats, rt_stats, and print_ux" if $opts{$o}; + } + if ( !$opts{assess_massdiff} ) { + print STDERR "correct_mz option requires assess_massdiff, enabling\n"; + $opts{assess_massdiff}++; + } + } + + return ( %opts ); +} + + +#+ +# Routine to print usage statement if requested or error found +#- +sub print_usage { + my $msg = shift || ""; + my $prog = basename ( $0 ); + + print <<" EOM"; + + $msg + Usage: $prog --ion_library libfile [ --peptide_file pepfile --output output_file ] + + # Required + ion_library Path to ion library file in Peakview/OpenSWATH/Spectronaut/Prosit format (required) + + # Commonly used + assess_massdiff Compare library masses to theoretical monoisotopic values + c, coldefs print out column definitions + f, full_stats Print file (output_file.fullstats) + h, help Print this usage information and exit + invert_output Invert output, 3 columns by X rows + o, output_file File (base) to which to write data; given MyOut, generates + MyOut.QC, MyOut.fullstats, etc. If not specified, library name + is used. + peptide_file Path to peptide digest file, format is protein(s)\tpeptide + swath_file Path to file of SWATH definitions, format min_m/z\tmax_m/z + + # Generate filtered/corrected library + correct_mz Will compute precursor and fragment masses for each entry + in library and print new library: + ion_library.mz_corrected + Requires use of --assess, precludes use of + filter_assays, rt_stats, full_stats, print_ux + filter_assays Assesses each assay, and removes any that have a + problem (m/z error, discrepancy from SWATH file). + Creates 2 new libraries, one with all passing assays and + another with all failed (problem) assays: + ion_library.clean + ion_library.problem + Requires use of --assess, precludes use of correct_mz, + rt_stats, full_stats, print_ux + + # Other + alt_decoy Alternate decoy prefix (default is DECOY) + debug Print parsed/library values in case of format issues + print_mass_error Print out ions whose m/z values differ significantly from + theoretical + print_ux Print unexplained proteins (seen in ion library but not + in peptide_file if provided + rt_stats Print file of +2/+3 RTs for analysis to output_file.RT, + only if enough for regression + skip_decoys Skip DECOY entries when computing statistics + v, verbose Verbose output mode + write Write MGF file with info from each spectrum (rare) + zero_pad Print intermediate values=0 with full_stats. + EOM + + exit; + +} + + + +#+ +# Routine to generate output column definitions +#- +sub get_coldefs { + my %defs = ( +library => 'Name of library file being analyzed', +format => 'Library format, one of OpenSWATH, Peakview, or Spectronaut ', +pepions => 'Number of peptide ions (i.e. precursor, sequence + modifications + charge)', +fragments => 'Number of fragment (fragment ions) in library', +ptp_percent => 'Percentage of proteotypic pepions (not shared)', +shared_percent => 'Percentage of shared peptide ions (pepions)', +shared_pepions => 'Number of shared pepions', +peptides => 'Number of distinct peptide sequences', +mod_peps => 'Number of distinct modified peptides (sequences + modifications)', +mod_percent => 'percentage of distinct modified peptides with a mass modification', +total_mods => 'Number of mass modified amino acids', +chg_2 => 'Percentage of charge 2 precursors', +chg_3 => 'Percentage of charge 3 precursors', +avg_len => 'Average peptide Length', +precursor_min => 'Minimum precursor m/z (mass/charge) in library', +precursor_max => 'Maximum precursor m/z in library', +fragment_min => 'Minimum fragment m/z in library', +fragment_max => 'Maximum fragment m/z in library', +avg_num_frags => 'Average number of fragment per assay (precursor)', +avg_frag_len => 'Average fragment sequence length', +short_perc => 'Percentage of assays with 5 or fewer transitions', +y_perc => 'Percentage of y ions', +b_perc => 'Percentage of b ions', +t6_y_perc => 'Percentage of y ions considering only the top 6 fragments per assay', +avg_intensity => 'Average Intensity', +low_nr => 'Number of fragment annotated as y1,y2,b1, or b2', +fragment_above_precursor => 'Percentage of fragment m/z above precursor m/z', +max_intensity_idx => 'Average index of most intense fragment ', +med_ppp => 'Median number of pepions per protein ', +mean_ppp => 'Mean number of pepions per protein ', +stddev_ppp => 'Standard deviation of the number of pepions per protein ', +'3_sigma_ppp' => 'number of pepions per protein more than 3 standard deviations from the mean ', +rt_min => 'Minimum RT (retention time) in library ', +rt_max => 'Maximum RT in library', +rt_med => 'Median RT in library', +rt_rsq => 'r-squared value of fit between RT of +2 and +3 charge states for the same modified peptide', +rt_five => 'Percentage of +2/+3 charge pairs of the same mod pep within 5 RT units of each other', +n_irt => 'Number of iRT peptides in library', +irt_cnt => 'Number of iRT assays (precursor + fragment)', +db_peps => 'Peptides in reference library, often 7-50 AA', +seen_peps => 'Number of library peptides seen', +mc_peps => 'Number of Missed cleavage peptides', +db_prots => 'Number of Proteins in reference library', +lib_prots => 'Number of proteins with at least one assay in ion library', +seen_prots => 'Number of library proteins seen', +ux_prots => 'Unexplained (not in reference library) proteins', +decoy_pct => 'Percentage of decoy (optionally includes "alternative", non-db decoys) assays', +mixed_pct => 'Percentage of mixed decoy/target (has both decoy and no-decoy annotations) assays', +fwd_pct => 'Percentage of target (non-decoy) assays', +precursor_ok => 'Number of assays where precursor is within 5 PPM (parts per million m/z) of theoretical', +precursor_bad => 'Number of assays where fragment is more than 5 PPM from theoretical', +fragment_ok => 'Number of assays where fragment is within 1 PPM of theoretical', +fragment_bad => 'Number of assays where precursor is more than 1 PPM from theoretical', +fragment_na => 'Number of assays where peak annotation not found in expected b/y series', +fragment_avg_mdiff => 'Average m/z difference between reported and theoretical fragment', +swa_defined => 'Number of peptide ions that fall into a defined SWATH bin ', +swa_missing => 'Number of peptide ions that fail to fall into a defined SWATH bin ( Pepions - swa_defined)', +swa_conflict => 'Number of fragment_ions that fall into same SWATH(s) as precursor', +swa_ok => 'Number of fragment_ions that do not fall into same SWATH(s) as precursor', +swa_conflict_assay => 'Number of precursor that have at least one failing fragment', +swa_5 => 'Number of fragment ions that fall within 5 Th of precursor ion', +swa_25 => 'Number of fragment ions that fall within 25 Th of precursor ion', +problem_assays => 'Assays whose precursor or any fragment m/z values do not match SWATHs file or differ significantly from theoretical values' + ); + return \%defs; +} + + +#+ +# Routine to calculate peptide mz +#- +sub get_peptide_mass { + my $eseq = shift; # single-letter encoded sequence! + + # Acetyl mods handled slightly differently + $eseq =~ s/y/My/g; + $eseq =~ s/k/Kk/g; + + my $chg = shift || 2; + my $is_precursor = shift || 0; +# print STDERR "Eseq is $eseq, charge is $chg\n" if $is_precursor; + + my $mz_key = $eseq . '_' . $chg; + + return $mpep2mz{$mz_key} if $mpep2mz{$mz_key}; + + my ($ccnt) = $eseq =~ tr/c/C/; + my ($mtcnt) = $eseq =~ tr/Z/C/; + my ($pcnt) = $eseq =~ tr/z/C/; + my ($mcnt) = $eseq =~ tr/m/M/; + my ($pocnt) = $eseq =~ tr/p/P/; + my ($wcnt) = $eseq =~ tr/w/W/; + my ($ncnt) = $eseq =~ tr/n/N/; + my ($pecnt) = $eseq =~ tr/e/E/; + my ($pqcnt) = $eseq =~ tr/q/Q/; + my ($dqcnt) = $eseq =~ tr/x/Q/; + + # Acetyl mods handled slightly differently + my ($acnt) = $eseq =~ tr/a/a/; + my ($ycnt) = $eseq =~ tr/y/y/; + my ($kcnt) = $eseq =~ tr/k/k/; + $eseq =~ s/y//g; + $eseq =~ s/a//g; + $eseq =~ s/k//g; + + + my $mass = 0; + $mass += 57.0215 * $ccnt; + $mass += 39.9949 * $pcnt; + $mass += 45.987721 * $mtcnt; + $mass += 15.9949 * $mcnt; + $mass += 15.9949 * $pocnt; + $mass += 15.9949 * $wcnt; + $mass += -18.010565 * $pecnt; + $mass += -17.026549 * $pqcnt; + $mass += 0.984016 * $ncnt; + $mass += 0.984016 * $dqcnt; + $mass += 42.01056 * $acnt; + $mass += 42.01056 * $ycnt; + $mass += 42.01056 * $kcnt; +# print STDERR "eseq is $eseq, ycnt is $ycnt, mass is $mass, mzkey is $mz_key\n" if $is_precursor; + + # N-terminal +# $mass += 42.01056 * $accnt; + + for my $aa ( split( '', uc($eseq) ) ){ + die "Unknown AA $aa from $eseq" unless $opts{mono}->{$aa}; + $mass += $opts{mono}->{$aa}; + } + $mass += WATER_MASS; + +# $mass += 18.0105; + + my $h_mass = 1.00727646688; + my $pmz = sprintf( '%0.4f', ( $mass + $chg * H_MASS )/$chg ); +# print STDERR "eseq is $eseq, ycnt is $ycnt, mass is $mass, pmz is $pmz\n" if $is_precursor; + $mpep2mz{$mz_key} = $pmz; + return $pmz; +} + + +#+ +# Hash of known modifications +#- +sub get_known_mods { + +my %known_mods = ('C(UniMod:4)' => 'c', + 'C[+57]' => 'c', + 'C[+57.0]' => 'c', + 'C[57]' => 'c', + 'C[+C2+H3+N+O]' => 'c', + 'C[CAM]' => 'c', + 'C[160]' => 'c', + 'C[Carbamidomethyl (C)]' => 'c', + 'C(UniMod:39)' => 'Z', + 'C[149]' => 'Z', + 'C[46]' => 'Z', + 'C[Methylthio]' => 'Z', + 'C[+46.0]' => 'Z', + 'C[PCm]' => 'z', + 'C[143]' => 'z', + 'C[40]' => 'z', + 'C[+40]' => 'z', + 'C[+40.0]' => 'z', + 'C(UniMod:26)' => 'z', + 'C[Pyro-Carbamidomethyl (Any N-term)]' => 'z', + 'M[Oxi]' => 'm', + 'M[Oxidation (M)]' => 'm', + 'M(UniMod:35)' => 'm', + 'M[147]' => 'm', + 'M[+16]' => 'm', + 'M[+O]' => 'm', + 'M[+16.0]' => 'm', + 'M[16]' => 'm', + 'P[Oxi]' => 'p', + 'P[Oxidation (P)]' => 'p', + 'W[Oxi]' => 'w', + 'W[Oxidation (W)]' => 'w', + 'W(UniMod:35)' => 'w', + 'W[+16]' => 'w', + 'W[+16.0]' => 'w', + 'W[16]' => 'w', + 'W[220]' => 'w', + 'N[Dea]' => 'n', + 'N[Deamidation (N)]' => 'n', + 'N[Deamidation (NQ)]' => 'n', + 'N(UniMod:7)' => 'n', + 'N[115]' => 'n', + 'N[+1.0]' => 'n', + 'N[1]' => 'n', + 'E[PGE]' => 'e', + 'E[111]' => 'e', + 'E(UniMod:27)' => 'e', + 'E[-18]' => 'e', + 'E[-18.0]' => 'e', + 'E[Glu->pyro-Glu (Any N-term)]' => 'e', + 'E[Glu->pyro-Glu]' => 'e', + 'Q[PGQ]' => 'q', + 'Q[111]' => 'q', + 'Q(UniMod:28)' => 'q', + 'Q[-17]' => 'q', + 'Q[-17.0]' => 'q', + 'Q[Gln->pyro-Glu (Any N-term)]' => 'q', + 'Q[Gln->pyro-Glu]' => 'q', + + 'Q[Dea]' => 'x', + 'Q[Deamidation (NQ)]' => 'x', + + '[42]' => 'a', + '[42.0]' => 'a', + '[42]' => 'a', + '[1Ac]-' => 'a', + '[Acetyl +(N-term)]' => 'a', + '[Acetyl +(Any N-term)]' => 'a', + '[Acetyl +(Protein N-term)]' => 'a', + 'M[Acetyl (Protein N-term)]' => 'y', + 'K[Acetyl (K)]' => 'k' ); + + return \%known_mods; + +} + + + +#+ +# Routine to translate between modification namespaces +#- +sub encode_modifications { + my $seq = shift; + return $mpep2encoded{$seq} if $mpep2encoded{$seq}; + + my $regex = '(\w?\[[^\]]+\]-*)'; + + # Block for debugging mod regex + if ( 0 ) { + my $k = get_known_mods(); + for my $mod ( keys( %{$k} ) ) { + my $match = '0'; + if ( $mod =~ /$regex/ ) { + $match = $1; + } + print "$mod\t$match\n"; + } + exit; + } + + + if ( $seq =~ /UniMod/ ) { + $regex = '(\w?\(UniMod:\d+\))'; + } else { + return $seq unless $seq =~ /\[/; + } + + my %mods; + for my $m ( $seq =~ /$regex/ig ) { + $mods{$m}++; + unless( $kmods->{$m} ) { + print ">$m<"; + die "Unknown modification $m in $seq ($regex)\n"; + } + } + + my $eseq = $seq; + for my $m ( keys( %mods ) ) { + my $em = $m; + $em =~ s/\[/\\[/g; + $em =~ s/\+/\\+/g; + $em =~ s/\]/\\]/g; + $em =~ s/\(/\\(/g; + $em =~ s/\)/\\)/g; + $eseq =~ s/$em/$kmods->{$m}/g; + } + + if ( $eseq =~ /(.?\[[^\[\]]+\])/ ) { + die "1. Unknown mod $1 in $eseq\n"; + } elsif ( $eseq =~ /\d/ ) { + die "2. Unknown mod in $eseq\n"; + } elsif ( $eseq =~ /\[/ ) { + die "3. Unknown mod in $eseq\n"; + } elsif ( $eseq =~ /\(/ ) { + die "4. Unknown mod in $eseq\n"; + } + $mpep2encoded{$seq} = $eseq; + return $eseq; +} + +#+ +# Routine to generate fragment ions +#- +sub generate_ions { + my $eseq = shift; + my %b; + my %y; + my @aa = split( //, $eseq ); + my $bpep = ''; + my $ypep = ''; + + for ( my $idx = 0; $idx <= $#aa; $idx++ ) { + $bpep .= $aa[$idx]; + $b{$idx + 1} = $bpep; + $ypep = $aa[$#aa - $idx] . $ypep; + $y{$idx + 1} = $ypep; + } + my %ions = ( b => \%b, y => \%y ); + return \%ions; +} + +__DATA__ +# +# SWATH File format info +# +# peakview +# 0 Q1 [ 778.413 ] +# 1 Q3 [ 498.2579 ] +# 2 RT_detected [ 57.3 ] +# 3 protein_name [ 1/P0CG40 ] +# 4 isotype [ light ] +# 5 relative_intensity [ 7324.6 ] +# 6 stripped_sequence [ AAAAAAAAAAAAAAAASAGGK ] +# 7 modification_sequence [ AAAAAAAAAAAAAAAASAGGK ] +# 8 prec_z [ 2 ] +# 9 frg_type [ b ] +# 10 frg_z [ 1 ] +# 11 frg_nr [ 7 ] +# 12 iRT [ 57.3 ] +# 13 uniprot_id [ 1/P0CG40 ] +# 14 decoy [ FALSE ] +# 15 N [ 1 ] +# 16 confidence [ 1 ] +# 17 shared [ FALSE ] + +# openswath +# 0 PrecursorMz [ 778.4129855 ] +# 1 ProductMz [ 498.26707303 ] +# 2 Tr_recalibrated [ 57.3 ] +# 3 transition_name [ 6_AAAAAAAAAAAAAAAASAGGK_2 ] +# 4 CE [ -1 ] +# 5 LibraryIntensity [ 7324.6 ] +# 6 transition_group_id [ 1_AAAAAAAAAAAAAAAASAGGK_2 ] +# 7 decoy [ 0 ] +# 8 PeptideSequence [ AAAAAAAAAAAAAAAASAGGK ] +# 9 ProteinName [ 1/P0CG40 ] +# 10 Annotation [ b7/-0.009,b14^2/-0.009,m10:16/-0.009 ] +# 11 FullUniModPeptideName [ AAAAAAAAAAAAAAAASAGGK ] +# 12 PrecursorCharge [ 2 ] +# 13 GroupLabel [ light ] +# 14 UniprotID [ 1/P0CG40 ] +# 15 FragmentType [ b ] +# 16 FragmentCharge [ 1 ] +# 17 FragmentSeriesNumber [ 7 ] +# +# Spectronaut +# 0 PrecursorMz [ 778.4129855 ] +# 1 PrecursorCharge [ 2 ] +# 2 ProteinName [ 1/P0CG40 ] +# 3 Organism [ Human ] +# 4 iRT Value [ 57.3 ] +# 5 PeptideSequence [ AAAAAAAAAAAAAAAASAGGK ] +# 6 PeptideModifiedSequence [ AAAAAAAAAAAAAAAASAGGK ] +# 7 FragmentType [ b ] +# 8 FragmentNumber [ 6 ] +# 9 Fragmentation [ b6 ] +# 10 Losses [ ] +# 11 LossNeutralMass [ 0 ] +# 12 ProductCharge [ 1 ] +# 13 ProductMz [ 427.22995924 ] +# 14 LibraryIntensity [ 8319.7 ] +# 15 ProteinId [ 1/P0CG40 ] +# + +# 0 ReferenceRun [ C_D160304_S251-Hela-2ug-2h_MSG_R01_T0 ] +# 1 PrecursorCharge [ 3 ] +# 2 IntModifiedPeptide [ _SLDTHVTK_ ] +# 3 ModifiedPeptide [ _SLDTHVTK_ ] +# 4 StrippedPeptide [ SLDTHVTK ] +# 5 iRT [ -35.19764 ] +# 6 BGSInferenceId [ Q14166 ] +# 7 IsProteotypic [ True ] +# 8 LabeledPeptide [ _SLDTHVTK_ ] +# 9 PrecursorMz [ 300.831025070542 ] +# 10 ReferenceRunQvalue [ 0.000844789494294673 ] +# 11 ReferenceRunMS1Response [ 9.9044E+07 ] +# 12 FragmentLossType [ noloss ] +# 13 FragmentNumber [ 3 ] +# 14 FragmentType [ y ] +# 15 FragmentCharge [ 1 ] +# 16 FragmentMz [ 347.228896545912 ] +# 17 RelativeIntensity [ 100 ] +# 18 ExcludeFromAssay [ False ] +# 19 Database [ sp ] +# 20 ProteinGroups [ Q14166 ] +# 21 UniProtIds [ Q14166 ] +# 22 Protein Name [ TTL12_HUMAN ] +# 23 ProteinDescription [ Tubulin--tyrosine ligase-like protein 12 ] +# 24 Organisms [ Homo sapiens ] +# 25 Genes [ TTLL12 ] +# 26 Protein Existence [ 1 ] +# 27 Sequence Version [ 2 ] +# 28 FASTAName [ uniprot_sprot_2014-12-11_HUMAN_ISOFORMS ] +# +# PROSIT +# 0 RelativeIntensity [ 0.047176161128631676 ] +# 1 FragmentMz [ 147.112804167 ] +# 2 ModifiedPeptide [ _AAAAAAALQAK_ ] +# 3 LabeledPeptide [ AAAAAAALQAK ] +# 4 StrippedPeptide [ AAAAAAALQAK ] +# 5 PrecursorCharge [ 2 ] +# 6 PrecursorMz [ 478.77981619566503 ] +# 7 iRT [ 9.95382308959961 ] +# 8 FragmentNumber [ 1 ] +# 9 FragmentType [ y ] +# 10 FragmentCharge [ 1 ] +# 11 FragmentLossType [ noloss ] +# +# CHROMAT_LIBRARY (Modified OSW) +# 0 transition_group_id [ HEELMLGDPC[57]LK+2 ] +# 1 transition_name [ HEELMLGDPC[57]LK+2_b3+2 ] +# 2 ProteinId [ sp|P07814|SYEP_HUMAN ] +# 3 PeptideSequence [ HEELMLGDPCLK ] +# 4 ModifiedPeptideSequence [ HEELMLGDPC[57]LK ] +# 5 FullPeptideName [ HEELMLGDPC[57]LK ] +# 6 RetentionTime [ 2220.0 ] +# 7 PrecursorMz [ 721.3443378086647 ] +# 8 PrecursorCharge [ 2 ] +# 9 ProductMz [ 396.151374562 ] +# 10 ProductCharge [ 1 ] +# 11 LibraryIntensity [ 1111.5 ] +# 12 FragmentIonType [ b ] +# 13 FragmentSeriesNumber [ 3 ] +# 14 IsDecoy [ 0 ] +# 15 quantifying_transition [ 1 ] +#