diff --git a/pigeon_feather/pigeon.py b/pigeon_feather/pigeon.py index 7079a5e..573100a 100644 --- a/pigeon_feather/pigeon.py +++ b/pigeon_feather/pigeon.py @@ -1,388 +1,643 @@ -import warnings -warnings.filterwarnings("ignore") +import numpy as np import pandas as pd -pd.options.mode.chained_assignment = None -import matplotlib -from matplotlib import rcParams +import scipy.sparse as sparse +import scipy.optimize as opt +import scipy.stats as stats +from pyteomics import mass +import matplotlib.pyplot as plt import matplotlib.pyplot as plt import matplotlib.colors as colors -import numpy as np -import scipy.optimize as opt +from matplotlib import rcParams +from matplotlib import rc +import time import argparse import os -# DEFAULT PARAMS - -RTcutoff = 0.5 -MZcutoff = 0.1 -score_floor = 0 -score_threshold = 150 -ppm_threshold = 7 -fit = 'inv' -maxfev = 800 -plots = 'csvplots' -overlap_method = 'select' -skip_res = 2 -select_score = 0 -ov_threshold = 0.8 -ymin=0.9 -ylim=30 - -xmax=1e6 -ymax=1e6 - -# COLUMN LABELS - -rtlabel = 'Rt(min)' -scorelabel = 'Score' -zlabel = 'z' -mzlabel = 'Meas. M/z' -mrlabel = 'Meas. Mr' -seqlabel = 'Sequence' -rangelabel = 'Range' -startlabel = 'Start' -endlabel = 'End' -ppmlabel = 'Dev.(ppm)' - -# HELPER FNS - -def inthelper(n): - if n == '-': - return 0 - if type(n) is str: - return int(n.strip()) - return int(n) - -def fitfn(x, a, b, c): - if fit == 'inv': - return c + a / (x - b) - else: +# PARAMS + +# maximum charge of peptide ions in search +default_maxz = 3 +# length range for peptides in search (inclusive) +default_minl = 3 +default_maxl = 15 +# m/z error cutoff for initial search and for cut about trendline (in ppm) +default_ppmc_wide = 30 +default_ppmc_narrow = 7 +# score cutoff for truth peptides (for systematic error fit) +default_scorec = 0.05 +# fit to use for systematic error ('quad' or 'inv') +default_fit = 'quad' +# maximum iterations for systematic error fit +default_maxfev = 1000 +# types of fragment ions to consider in search +default_ions = ['b', 'b-H2O', 'b-NH3', 'y', 'y-H2O', 'y-NH3'] +# threshold in Da/e for matching fragments +default_threshold = 0.02 +# cutoff for matches to be considered degenerate, in m/z (Da/e) and RT (min) +default_mzc = 0.1 +default_rtc = 0.5 +# significance threshold for p-value for disambiguation +default_pvc = 0.05 +# method to use for disambiguation: keep or drop coeluting peptides? ('keep' or 'drop') +default_cut_method = 'drop' +# score floor (drop all below given match score) +default_sf = 0 + + +# FOR VECTORIZED FRAGMENT MATCHING +def match_fragment(mz1, mz2, threshold=default_threshold): + return abs(mz1 - mz2) <= threshold and (mz1 or mz2) + +vecmf = np.vectorize(match_fragment, otypes=[bool]) + + +class Pool: + def __init__(self, proteins, msms, fit=default_fit): + self.proteins = proteins + self.msms = msms + self.fit = fit + + def fitfn(self, x, a, b, c): + if self.fit == 'inv': + return c + a / (x - b) return c + a * x + b * (x ** 2) -def formatfill(data): - if 0 in data[scorelabel].values: - return (len(data), np.mean(data[scorelabel]), 100 * data[scorelabel].value_counts()[0] / len(data)) - return (len(data), np.mean(data[scorelabel]), 0.) - -def redflag(topscores, r): - # return true if they're closer in RT and mass than the cutoffs and if they're nonidentical and nonoverlapping - RTflag = (abs(topscores[rtlabel].apply(float) - float(r[rtlabel])) < RTcutoff) - MZflag = (abs(topscores[mzlabel].apply(float) - float(r[mzlabel])) < MZcutoff) | (abs(topscores[mrlabel].apply(float) - float(r[mrlabel])) < MZcutoff) - notsame = (topscores[seqlabel] != r[seqlabel]) - nonoverlapping = (topscores[startlabel].apply(int) + skip_res > int(r[endlabel])) | (int(r[startlabel]) + skip_res > topscores[endlabel].apply(int)) - return RTflag & MZflag & notsame & nonoverlapping - -def yellowflag(topscores, r): - # return true if they're closer in RT and mass than the cutoffs and if they're nonidentical but overlapping - RTflag = (abs(topscores[rtlabel].apply(float) - float(r[rtlabel])) < RTcutoff) - MZflag = (abs(topscores[mzlabel].apply(float) - float(r[mzlabel])) < MZcutoff) | (abs(topscores[mrlabel].apply(float) - float(r[mrlabel])) < MZcutoff) - notsame = (topscores[seqlabel] != r[seqlabel]) - overlapping = (topscores[startlabel].apply(int) + skip_res <= int(r[endlabel])) & (int(r[startlabel]) + skip_res <= topscores[endlabel].apply(int)) - return RTflag & MZflag & notsame & overlapping - -def scorehist(data, title, path): - figure, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) - plt.xlabel(scorelabel) - plt.ylabel('n') - plt.ylim(ymin, ymax) - plt.title(title) - plt.hist(data[scorelabel], log=True, bins=100, range=[0, xmax]) - #print('% 0: ' + str(100 * data[scorelabel].value_counts()[0] / len(data))) - plt.tight_layout() - plt.savefig(path) - -def mzscatter(data, path, popt=False): - rcParams['font.sans-serif'][7] = 'DejaVu Sans' - rcParams['font.sans-serif'][0] = 'Arial' - # rcParams['mathtext.fontset'] = 'Arial' - # plt.rcParams['text.usetex'] = True - - afont = {'size': 6} - matplotlib.rc('font', **afont) - - pt = 72 - figure, (ax1) = plt.subplots(1, 1, figsize=(109.7 / pt, 109.7 / pt)) - if type(popt) != bool: - plt.plot(np.sort(data[mzlabel].apply(float)), fitfn(np.sort(data[mzlabel].apply(float)), *popt), linewidth=0.5) - plt.scatter(data[mzlabel].apply(float), data[ppmlabel].apply(float), c=data[scorelabel]+1, - norm=colors.LogNorm(vmin=1, vmax=xmax), s=50/pt, linewidth=0.05) - plt.colorbar(label=scorelabel) - plt.xlabel('$\it{{m}}$/$\it{{z}}$ (D/e)') - plt.ylabel('mass error (ppm)') - plt.ylim(-ylim, ylim) - plt.tight_layout() - plt.savefig(path) + # PLOTTING FNS + + def scorehist(self, matches, path=None, title=None, xmax=None, ymin=None, ymax=None, figsize=109.7): + rcParams['font.sans-serif'][7] = 'DejaVu Sans' + rcParams['font.sans-serif'][0] = 'Arial' + scores = [match.score for match in matches] + afont = {'size': 6} + rc('font', **afont) + + if not ymin: + ymin = 9e-1 + if not ymax: + ymax = len(matches) + if not xmax: + xmax = max(scores) + pt = 72 + figure, (ax1) = plt.subplots(1, 1, figsize=(figsize / pt, figsize / pt)) + plt.xlabel('Score') + plt.ylabel('n (matches)') + plt.ylim(ymin, ymax) + if title: + plt.title(title) + plt.hist(scores, log=True, bins=100, range=[0, xmax]) + plt.tight_layout() + if path: + plt.savefig(path) + plt.close(figure) + + def mzscatter(self, matches, path=None, popt=None, xmax=None, mzmin=None, mzmax=None, figsize=109.7): + rcParams['font.sans-serif'][7] = 'DejaVu Sans' + rcParams['font.sans-serif'][0] = 'Arial' + + afont = {'size': 6} + rc('font', **afont) + + pt = 72 + figure, (ax1) = plt.subplots(1, 1, figsize=(figsize / pt, figsize / pt)) + + mzs = np.asarray([match.pep.mz[match.z] for match in matches]) + ppms = np.asarray([match.ppmerr for match in matches]) + scores = np.asarray([match.score for match in matches]) + if not xmax: + xmax = max(scores) + if popt is not None: + plt.plot(np.sort(mzs), self.fitfn(np.sort(mzs), *popt), linewidth=0.5) + plt.scatter(mzs, ppms, c=scores + 1e-7, + norm=colors.LogNorm(vmin=1e-7, vmax=xmax), s=50 / pt, linewidth=0.05) + plt.colorbar(label='Score') + plt.xlabel('$\it{{m}}$/$\it{{z}}$ (D/e)') + plt.ylabel('mass error (ppm)') + plt.ylim(-30, 30) + if mzmin and mzmax: + plt.xlim(mzmin, mzmax) + plt.tight_layout() + if path: + plt.savefig(path) + plt.close(figure) + + # MAKE TABLE OF FEATURES FOR MATCHED PEPTIDES + def pool_output(self, matches, path=None): + cmpdnum = [match.peak.cmpdnum for match in matches] + runid = [match.peak.runid for match in matches] + sequence = [match.pep.sequence for match in matches] + start = [match.pep.start + 1 for match in matches] + end = [match.pep.end for match in matches] + + int = [match.peak.intensity for match in matches] + z = [match.z for match in matches] + m_calc = [match.pep.m for match in matches] + mz_calc = [match.pep.mz[match.z] for match in matches] + m_meas = [match.peak.mz * match.z for match in matches] + mz_meas = [match.peak.mz for match in matches] + ppm_err = [match.ppmerr for match in matches] + da_err = [match.peak.mz * match.z - match.pep.mz[match.z] for match in matches] + + rt = [match.peak.rt for match in matches] + n_hits = [match.fragmatch.size for match in matches] + n_tries = [np.prod(match.fragmatch.shape) for match in matches] + score = [match.score for match in matches] + + df = pd.DataFrame(data={'compound #': cmpdnum, 'run ID': runid, 'sequence': sequence, + 'Start': start, 'End': end, 'intensity': int, + 'z': z, 'mass (calc)': m_calc, 'm/z (calc)': mz_calc, + 'mass (meas)': m_meas, 'm/z (meas)': mz_meas, + 'error (ppm)': ppm_err, 'error (Da/e)': da_err, + 'RT (min)': rt, '# fragment matches': n_hits, + '# peaks * fragments': n_tries, 'score': score + }) + df.sort_values(by=['Start', 'End', 'z', 'score'], inplace=True) + if path: + df.to_csv(path, index=False) + return df + + # MAKE TABLE OF SEQUENCE RANGES FOR MATCHED PEPTIDES + def rangeslist(self, matches, path=None): + start = [match.pep.start + 1 for match in matches] + end = [match.pep.end for match in matches] + df = pd.DataFrame(data={'Start': start, 'End': end}) + df.sort_values(by=['Start', 'End'], inplace=True) + if path: + df.to_csv(path, index=False) + return df + + # LOG STATS ABOUT EACH STEP IN ANALYSIS + def nums_out(self, match_dict, popt, path=None): + out = '' + for key in match_dict: + n = len(match_dict[key]) + p_0 = len([match for match in match_dict[key] if not match.score]) / n * 100 + mean_score = np.mean([match.score for match in match_dict[key]]) + rmsdev = np.sqrt( + np.mean([(match.ppmerr - self.fitfn(match.pep.mz[match.z], *popt)) ** 2 for match in match_dict[key]])) + out = out + f'{key}: n={n}, %0={p_0}, mean_score={mean_score}, RMSD={rmsdev}\n' + if path: + with open(path, 'w') as file: + file.write(out) + return out + + # INITIAL MATCH COMPARING ALL PEAKS TO ALL THEORETICAL CHARGE STATES + def match(self, mzc=default_ppmc_wide, maxz=default_maxz): + self.matches = [] + for protein in self.proteins: + for pep in protein.peps: + for peak in self.msms.peaks: + if peak.z: + if peak.z in pep.mz: + if abs(pep.mz[peak.z] - peak.mz) <= mzc * pep.mz[peak.z] * 1e-6: + ppmerr = (peak.mz - pep.mz[peak.z]) / pep.mz[peak.z] * 1e6 + self.matches.append(Match(pep, peak, peak.z, ppmerr, self)) + else: + for z in range(1, maxz + 1): + if abs(pep.mz[z] - peak.mz) <= mzc * pep.mz[z] * 1e-6: + ppmerr = (peak.mz - pep.mz[z]) / pep.mz[z] * 1e6 + self.matches.append(Match(pep, peak, z, ppmerr, self)) + break + self.score_max = max([match.score for match in self.matches]) + + # CUT ALL BUT THOSE PASSING HIGH SCORE CUTOFF, FIT TRENDLINE FOR SYSTEMATIC ERROR + def truth_cut(self, matches, score_c=default_scorec, maxfev=default_maxfev, fit=default_fit): + self.fit = fit + best_peaks = [] + mzs = [] + ppms = [] + scores = [] + for match in matches: + if match.score >= score_c: + best_peaks.append(match) + mzs.append(match.pep.mz[match.z]) + ppms.append(match.ppmerr) + scores.append(match.score) + popt, pcov = opt.curve_fit(self.fitfn, mzs, ppms, maxfev=maxfev, + bounds=([-np.inf, -np.inf, -np.inf], + [np.inf, min(mzs) - 100 if fit == 'inv' else np.inf, np.inf])) + return best_peaks, popt, pcov + + # CUT ALL FARTHER THAN ppm_c FROM TRENDLINE + def ppm_cut(self, popt, matches, ppm_c=default_ppmc_narrow): + return [match for match in matches if abs(match.ppmerr - self.fitfn(match.pep.mz[match.z], *popt)) < ppm_c] + + # CUT ALL BUT BEST MATCH FOR EACH PEPTIDE + def best_peak_cut(self, matches): + best_peaks = [] + peps = list(set([match.pep for match in matches])) + for pep in peps: + maxscore = max([match.score for match in pep.matches if match in matches]) + best = [match for match in pep.matches if match.score >= maxscore and match in matches] + if len(best) > 1: + maxint = max([match.peak.intensity for match in best]) + best = [match for match in best if match.peak.intensity >= maxint] + best_peaks = best_peaks + best + # self.matches = best_peaks + return best_peaks + + # CUT ALL BELOW GIVEN SCORE (OPTIONAL) + def scorefloor_cut(self, matches, score_f=default_sf): + return [match for match in matches if match.score >= score_f] + + # CUT USING FRAGMENT-BASED DISAMBIGUATION + def fragment_cut(self, matches, allmatches, threshold=default_threshold, pvc=0, mzc=default_mzc, rtc=default_rtc, + cut_method=default_cut_method): + degen_dict = {} + best_peaks = [] + exclude = [] + for match1 in matches: + print() + print() + print('match1:') + print(f'{match1.pep.sequence}-{match1.pep.start+1}-{match1.pep.end}+{match1.z}-{match1.pep.mz[match1.z]}-{match1.peak.runid}-cmpd{match1.peak.cmpdnum}') + print() + degen = [match2 for match2 in allmatches + if (np.abs(match1.pep.mz[match1.z] - match2.pep.mz[match2.z]) < mzc + and np.abs(match1.peak.rt - match2.peak.rt) < rtc + and (match1.pep.sequence != match2.pep.sequence + or match1.pep.start != match2.pep.start + or match1.pep.end != match2.pep.end))] + degen_dict[match1] = degen + if not degen: + print('not degen. keeping') + best_peaks.append(match1) + elif match1.fragmatch.size == 0: + exclude.append(match1) + print('no support, degen. dropping.') + else: + same_pep = [match2 for match2 in allmatches + if (np.abs(match1.pep.mz[match1.z] - match2.pep.mz[match2.z]) < mzc + and np.abs(match1.peak.rt - match2.peak.rt) < rtc + and match1.pep.sequence == match2.pep.sequence)] + flag = False + + pvs = [] + match1_mzs = match1.fragmatch.sum(0) * match1.pep.fragpeaks + mask = np.ones(match1_mzs.shape) + print('degen:') + for match2 in degen: + match2_mzs = match2.fragmatch.sum(0) * match2.pep.fragpeaks + same_frags_1 = sparse.coo_array( + vecmf(match1_mzs, match2.pep.fragpeaks.reshape(1, match2.pep.fragpeaks.shape[0]).T, + threshold=threshold)) + same_frags_2 = sparse.coo_array( + vecmf(match2_mzs, match1.pep.fragpeaks.reshape(1, match1.pep.fragpeaks.shape[0]).T, + threshold=threshold)) + mask1 = np.logical_not(same_frags_1.sum(0)) + mask = np.logical_and(mask, mask1) + mask2 = np.logical_not(same_frags_2.sum(0)) + print(f'{match2.pep.sequence}-{match2.pep.start+1}-{match2.pep.end}+{match2.z}-{match2.pep.mz[match2.z]}-{match2.peak.runid}-cmpd{match2.peak.cmpdnum}') + print(match2_mzs) + print(mask2 * match2_mzs) + if not (mask1 * match1_mzs).any(): + flag = True + print('no unique support, degen. dropping.') + break + elif cut_method == 'drop': + n_unique = np.count_nonzero(mask2 * match2_mzs) + if not pvc and n_unique: + flag = True + print('(WRONG) 1 hit, dropping') + break + elif pvc: + maxmz = max([max(match2.pep.fragpeaks), max(match2.peak.ms2s)]) + minmz = min([min(match2.pep.fragpeaks), min(match2.peak.ms2s)]) + prob = match2.peak.ms2s.size * 2 * threshold / (maxmz - minmz) + pvs.append( + stats.binomtest(n_unique, match2.pep.fragpeaks.size, prob, alternative='greater').pvalue) + print(pvs[-1]) + + print() + print('same_pep:') + if flag: + exclude.append(match1) + else: + if not pvc: + print('(WRONG) no hits, keeping') + best_peaks.append(match1) + else: + samepvs = [] + for match2 in same_pep: + match2_mzs = match2.fragmatch.sum(0) * match2.pep.fragpeaks + print(f'{match2.pep.sequence}-{match2.pep.start+1}-{match2.pep.end}+{match2.z}-{match2.pep.mz[match2.z]}-{match2.peak.runid}-cmpd{match2.peak.cmpdnum}') + print(match2_mzs) + print(mask * match2_mzs) + n_unique = np.count_nonzero(mask * match2_mzs) + maxmz = max([max(match2.pep.fragpeaks), max(match2.peak.ms2s)]) + minmz = min([min(match2.pep.fragpeaks), min(match2.peak.ms2s)]) + prob = match2.peak.ms2s.size * 2 * threshold / (maxmz - minmz) + samepvs.append( + stats.binomtest(n_unique, match2.pep.fragpeaks.size, prob, alternative='greater').pvalue) + print(samepvs[-1]) + if stats.combine_pvalues(samepvs).pvalue >= pvc or ( + cut_method == 'drop' and stats.combine_pvalues(pvs).pvalue <= pvc): + exclude.append(match1) + print('pvalue, dropping') + else: + print('pvalue, keeping') + best_peaks.append(match1) + print(samepvs) + print(stats.combine_pvalues(samepvs).pvalue) + print() + print(pvs) + print(stats.combine_pvalues(pvs).pvalue) + print() + return best_peaks, exclude, degen_dict + + +class Match: + def __init__(self, pep, peak, z, ppmerr, pool): + self.pep = pep + self.peak = peak + self.z = z + self.ppmerr = ppmerr + self.pool = pool + self.fragmatch = sparse.coo_array(vecmf(pep.fragpeaks, peak.ms2s.reshape(1, peak.ms2s.shape[0]).T)) + self.score = self.fragmatch.size ** 2 / np.prod(self.fragmatch.shape) if np.prod(self.fragmatch.shape) > 0 else 0 + pep.matches.append(self) + peak.matches.append(self) + + def remove(self): + pep.matches.remove(self) + peak.matches.remove(self) + pool.matches.remove(self) + + +class Protein: + def __init__(self, sequence, minl=default_minl, maxl=default_maxl, maxz=default_maxz, ions=default_ions): + self.sequence = sequence + self.peps = [] + self.make_peps(minl=minl, maxl=maxl) + for pep in self.peps: + pep.get_mass(maxz=maxz) + pep.get_fragments(maxz=maxz, ions=ions) + + def make_peps(self, minl=default_minl, maxl=default_maxl): + for start in range(len(self.sequence) - minl): + for end in range(start + minl, min(len(self.sequence) + 1, start + maxl + 1)): + self.peps.append(Peptide(self.sequence[start:end], start, end)) + + +class Peptide: + def __init__(self, sequence, start, end): + self.sequence = sequence + self.start = start + self.end = end + self.matches = [] + + def get_mass(self, maxz=default_maxz): + self.m = mass.calculate_mass(sequence=self.sequence) + self.mz = {} + for z in range(1, maxz + 1): + self.mz[z] = mass.calculate_mass(sequence=self.sequence, charge=z) + + def get_fragments(self, maxz=default_maxz, ions=default_ions): + self.fragments = [] + for ion in ions: + for i in range(len(self.sequence)): + for z in range(1, maxz + 1): + if ion[0] == 'b': + seq = self.sequence[0:i + 1] + self.fragments.append(Fragment(len(self.fragments), seq, 0, i, ion, + mass.calculate_mass(sequence=seq, ion_type=ion, charge=z), z)) + else: + seq = self.sequence[i:] + self.fragments.append(Fragment(len(self.fragments), seq, i, len(self.sequence) - 1, ion, + mass.calculate_mass(sequence=seq, ion_type=ion, charge=z), z)) + self.fragpeaks = np.empty(len(self.fragments), np.float64) + for frag in range(len(self.fragments)): + self.fragpeaks[frag] = self.fragments[frag].mz + + +class Fragment: + def __init__(self, i, seq, start, end, ion, mz, z): + self.index = i + self.seq = seq + self.start = start + self.end = end + self.ion = ion + self.mz = mz + self.z = z + + +class MSMS: + def __init__(self): + self.peaks = [] + self.runids = [] + + def first(self, s, l): + for i in range(len(l)): + if s == l[i]: + return i + + # MGF FORMAT-SPECIFIC I/O + def read_block(self, block, runid): + z = 0 + mz = np.nan + cmpdnum = np.nan + rt = np.nan + ms1int = np.nan + + ms2s = [] + for line in block: + if 'CHARGE=' in line: + z = int(line.split('CHARGE=')[1].split('+')[0]) + if 'RTINSECONDS=' in line: + rt = np.float64(line.split('=')[1]) / 60 + if 'TITLE=' in line: + if 'TITLE=Cmpd' in line: + cmpdnum = int(line.split('TITLE=Cmpd ')[1].split(',')[0]) + if 'scan=' in line: + cmpdnum = int(line.split('scan=')[1].split('"')[0]) + if 'PEPMASS=' in line: + if len(line.split()) > 1: + ms1int = np.float64(line.split()[1]) + mz = np.float64(line.split()[0].split('=')[1]) + if line[0].isdigit(): + ms2s.append(line) + ms2ints = np.asarray([np.float64(line.split()[1]) for line in ms2s]) + ms2mzs = np.asarray([np.float64(line.split()[0]) for line in ms2s]) + return Peak(runid, z, mz, cmpdnum, rt, ms1int, ms2ints, ms2mzs) + + def read_mgf(self, paths): + for path in paths: + f = open(path, 'r') + lines = f.readlines() + f.close() + lines = [line for line in lines if 'MaxRes' not in line and '###' not in line] + header = lines[0:self.first('BEGIN IONS\n', lines)] + data = lines[self.first('BEGIN IONS\n', lines):] + startpoints = [i for i in range(len(data)) if 'BEGIN IONS' in data[i]] + ms1blocks = [data[startpoints[i]:startpoints[i + 1]] for i in range(len(startpoints) - 1)] + [ + data[startpoints[-1]:]] + for block in ms1blocks: + self.peaks.append(self.read_block(block, path)) + self.runids.append(path) + + +class Peak: + def __init__(self, runid, z, mz, cmpdnum, rt, ms1int, ms2ints, ms2mzs): + self.runid = runid + self.z = z + self.mz = mz + self.cmpdnum = cmpdnum + self.rt = rt + self.intensity = ms1int + self.ms2ints = ms2ints + self.ms2s = ms2mzs + self.matches = [] + + +# END-TO-END MATCHING + DISAMBIGUATION + +def analyze(seqs, files, maxz=default_maxz, minl=default_minl, maxl=default_maxl, ppmc_wide=default_ppmc_wide, + ppmc_narrow=default_ppmc_narrow, scorec=default_scorec, fit=default_fit, maxfev=default_maxfev, + ions=default_ions, threshold=default_threshold, mzc=default_mzc, rtc=default_rtc, pvc=default_pvc, + cut_method=default_cut_method, sf=default_sf, + path='./', plots_dir=None, pool_out=None, rangeslist=None, nums_out=None): + + t0 = time.time() + pool = Pool([Protein(seq, minl=minl, maxl=maxl, maxz=maxz, ions=ions) for seq in seqs], MSMS()) + print(f'Generating sequences {time.time() - t0}') + + t0 = time.time() + pool.msms.read_mgf(files) + print(f'Reading files: {time.time() - t0}') + + t0 = time.time() + pool.match(mzc=ppmc_wide, maxz=maxz) + print(f'Matching: {time.time() - t0}') + + t0 = time.time() + truth, popt, pcov = pool.truth_cut(pool.matches, score_c=scorec, maxfev=maxfev, fit=fit) + print(f'Truth cut: {time.time() - t0}') + + t0 = time.time() + ppmc = pool.ppm_cut(popt, pool.matches, ppm_c=ppmc_narrow) + print(f'PPM cut: {time.time() - t0}') + + t0 = time.time() + best = pool.best_peak_cut(ppmc) + print(f'single-peak cut: {time.time() - t0}') + + t0 = time.time() + scorefloor = pool.scorefloor_cut(best, score_f=sf) + print(f'scorefloor cut: {time.time() - t0}') + + t0 = time.time() + disambig, ambig, degen_dict = pool.fragment_cut(scorefloor, ppmc, threshold=threshold, pvc=pvc, + mzc=mzc, rtc=rtc, cut_method=cut_method) + print(f'disambiguation cut: {time.time() - t0}') + + t0 = time.time() + match_dict = {'all': pool.matches, 'truth': truth, 'ppm_cut': ppmc, 'single': best, 'clean': disambig, + 'ambig': ambig} + if sf: + match_dict['scorefloor'] = scorefloor + if plots_dir: + if not os.path.isdir(path + plots_dir): + os.mkdir(path + plots_dir) + xmax = max([match.score for match in pool.matches]) + mzmin = min([match.pep.mz[match.z] for match in pool.matches]) + mzmax = max([match.pep.mz[match.z] for match in pool.matches]) + for step in match_dict: + pool.mzscatter(match_dict[step], popt=popt, xmax=xmax, mzmin=mzmin, mzmax=mzmax, + path=f'{path}{plots_dir}/{step}-scatter.pdf') + pool.scorehist(match_dict[step], title=step, ymax=len(pool.matches), xmax=xmax, + path=f'{path}{plots_dir}/{step}-hist.pdf') + if rangeslist: + for i, rl in enumerate(rangeslist): + ml = [match for match in disambig if seqs[i][match.pep.start:match.pep.end] == match.pep.sequence] + pool.rangeslist(ml, path=f'{path}{rl}') + if pool_out: + for i, ol in enumerate(pool_out): + ml = [match for match in disambig if seqs[i][match.pep.start:match.pep.end] == match.pep.sequence] + pool.pool_output(ml, path=f'{path}{ol}') + if nums_out: + nums = pool.nums_out(match_dict, popt, path=f'{path}{nums_out}') + + print(f'plots + output: {time.time() - t0}') + if nums_out: + print(nums) + return pool, match_dict -def parse_args(): +def parse_args(): # CMD LINE ARGS CODE - parser = argparse.ArgumentParser(description='Biotools CSV cleaner: pool csvs and disambiguate duplicates') - parser.add_argument('--t', '--table', dest='table', help="paths to input csvs", nargs='+', required=True) - parser.add_argument('--o', '--out', dest='out', help="path to output csv") + parser = argparse.ArgumentParser(description='PIGEON: match peptides and disambiguate duplicates') + parser.add_argument('--s', '--seq', '--sequence', dest='sequence', help="input sequence", nargs='+', required=True) + parser.add_argument('--f', '--files', dest='files', help="paths to input files", nargs='+', required=True) + + parser.add_argument('--path', dest='path', help="path for outputs") parser.add_argument('--p', '--plots', dest='plots', help="directory for output plots") - parser.add_argument('--ov', '--overlap', dest='overlap', help="how to treat overlapping duplicates: keep/drop/select?") - parser.add_argument('--sf', '--scorefloor', dest='scorefloor', type=float, help='Score floor for assignments') - parser.add_argument('--r', '--rangeslist', dest='rangeslist', help="destination for rangeslist") - parser.add_argument('--RTC', dest='RTC', type=float, help='RT cutoff for duplicates') - parser.add_argument('--MZC', dest='MZC', type=float, help='M/Z cutoff for duplicates') - parser.add_argument('--fit', dest='fit', help='type of curve fit: inv, or quad for quadratic') - parser.add_argument('--scoreC', dest='scoreC', type=float, help='Score threshold for initial curve fit') - parser.add_argument('--ppmC', dest='ppmC', type=float, help='ppm cutoff after curve fit') - parser.add_argument('--maxfev', dest='maxfev', type=int, help='maxfev for curve fit') + parser.add_argument('--o', '--output', dest='output', nargs='+', help="pool output") + parser.add_argument('--r', '--rangeslist', dest='rangeslist', nargs='+', help="rangeslist output") + parser.add_argument('--n', '--nums', dest='nums', help="diagnostic stats output") + + parser.add_argument('--maxz', dest='maxz', type=float, help='max charge for peptides') + parser.add_argument('--minl', dest='minl', type=float, help='min length for peptides') + parser.add_argument('--maxl', dest='maxl', type=float, help='max length for peptides') + parser.add_argument('--ppmc_w', dest='ppmc_wide', type=float, help='ppm error cutoff for initial match') + parser.add_argument('--ppmc_n', dest='ppmc_narrow', type=float, help='ppm cutoff after curve fit') + parser.add_argument('--scorec', dest='scorec', type=float, help='score threshold for initial curve fit') + parser.add_argument('--fit', dest='fit', help='how to fit systematic error: quad or inv?') + parser.add_argument('--maxfev', dest='maxfev', type=int, help='maximum iterations for systematic error fit') + + parser.add_argument('--ions', dest='ions', help="which ions to consider in match", nargs='+') + parser.add_argument('--ft', '--threshold', dest='threshold', type=float, + help='threshold (Da/e) for fragment matches') + parser.add_argument('--mzc', dest='mzc', type=float, help='m/z cutoff for duplicates') + parser.add_argument('--rtc', dest='rtc', type=float, help='RT cutoff for duplicates') + parser.add_argument('--pvc', dest='pvc', type=float, help='p value cutoff for duplicates') + parser.add_argument('--c', '--method', '--cm', dest='cut_method', help="how to treat coeluting peptides: keep or drop?") + parser.add_argument('--scorefloor', '--sf', dest='scorefloor', type=float, help='minimum score for score cut') args = parser.parse_args() - if args.RTC: RTcutoff = args.RTC - if args.MZC: MZcutoff = args.MZC - if args.scorefloor: score_floor = args.scorefloor - if args.scoreC: score_threshold = args.scoreC - if args.ppmC: ppm_threshold = args.ppmC - if args.fit: fit = args.fit - if args.maxfev: maxfev = args.maxfev - if args.plots: plots = args.plots - if args.overlap: overlap_method = args.overlap - print(args) return args +# FOR CMD LINE + def main(): args = parse_args() - # MAKE PLOTS DIRECTORY - - if not os.path.isdir(plots): - os.mkdir(plots) - - # FILE IN - - data = pd.DataFrame() - for table in args.table: - data = pd.concat([data, pd.read_csv(table)]) - - data = data[data[mzlabel] != '-'] - data = data[data[ppmlabel] != '-'] - data.dropna(subset=[seqlabel], inplace=True) - data[scorelabel].fillna(0, inplace=True) - data[scorelabel] = data[scorelabel].apply(inthelper) - - #data = data.loc[data['Tree hierarchy'] != 'no peak'] - data.sort_values(scorelabel, inplace=True, ascending=False) - - print('pooled: ' + str(data.shape)) - print('mean score: ' + str(np.mean(data[scorelabel]))) - print('% 0: ' + str(100 * data[scorelabel].value_counts()[0] / len(data))) - print() - - ymax=len(data) - xmax = max(data[scorelabel]) - - plt.rc('font', size=25) - - # hist of initial score distribution - - scorehist(data, "pooled: %d\n mean score: %1.5f\n %%0: %1.5f" %formatfill(data), plots + '/hist-scores-all.pdf') - - # scatterplot of full dataset - - mzscatter(data, plots + '/scatter-30ppm.pdf') - - # threshold scores - hsdata = data.loc[data[scorelabel] > score_threshold] - - # hist of thresholded score distribution - - scorehist(hsdata, "high scores: %d\n mean score: %1.5f" %(len(hsdata), np.mean(hsdata[scorelabel])), plots + '/hist-scores-cut.pdf') - - print('highscoring: ' + str(hsdata.shape)) - print('mean score: ' + str(np.mean(hsdata[scorelabel]))) - print() - - popt, pcov = opt.curve_fit(fitfn, hsdata[mzlabel].apply(float), hsdata[ppmlabel].apply(float), maxfev=maxfev, - bounds=([-np.inf, -np.inf, -np.inf], [np.inf, min(data[mzlabel].apply(float))-100 if fit == 'inv' else np.inf, np.inf])) - - # scatterplot of score-thresholded data - - mzscatter(hsdata, plots + '/scatter-hscoring.pdf', popt) + print(args) - # hist of ppm distribution, before & after cut + seqs = args.sequence + files = args.files + + maxz = args.maxz if args.maxz else default_maxz + minl = args.minl if args.minl else default_minl + maxl = args.maxl if args.maxl else default_maxl + ppmc_wide = args.ppmc_wide if args.ppmc_wide else default_ppmc_wide + ppmc_narrow = args.ppmc_narrow if args.ppmc_narrow else default_ppmc_narrow + scorec = args.scorec if args.scorec else default_scorec + fit = args.fit if args.fit else default_fit + maxfev = args.maxfev if args.maxfev else default_maxfev + + ions = args.ions if args.ions else default_ions + threshold = args.threshold if args.threshold else default_threshold + mzc = args.mzc if args.mzc else default_mzc + rtc = args.rtc if args.rtc else default_rtc + pvc = args.pvc if args.pvc else default_pvc + cut_method = args.cut_method if args.cut_method else default_cut_method + sf = args.scorefloor if args.scorefloor else default_sf + + path = args.path if args.path else '' + plots_dir = args.plots if args.plots else None + pool_out = args.output if args.output else None + rangeslist = args.rangeslist if args.rangeslist else None + nums_out = args.nums if args.nums else None + + analyze(seqs, files, maxz=maxz, minl=minl, maxl=maxl, ppmc_wide=ppmc_wide, + ppmc_narrow=ppmc_narrow, scorec=scorec, fit=fit, maxfev=maxfev, + ions=ions, threshold=threshold, mzc=mzc, rtc=rtc, pvc=pvc, + cut_method=cut_method, sf=sf, + path=path, plots_dir=plots_dir, pool_out=pool_out, + rangeslist=rangeslist, nums_out=nums_out) + return - figure, (ax1) = plt.subplots(1, 1, figsize=(6,6)) - plt.xlabel('|delta-ppm|') - plt.ylabel('n') - plt.hist(abs(data[ppmlabel].apply(float) - fitfn(data[mzlabel].apply(float), *popt)), log=True, bins=100) - plt.vlines(ppm_threshold, 0, len(data), linestyles='dashed', colors='k') - plt.savefig(plots + '/hist-ppms.pdf') - - print('m/z deviation (30ppm): ' + str(np.sqrt(np.mean((data[ppmlabel].apply(float) - fitfn(data[mzlabel].apply(float), *popt)) ** 2)))) - print('m/z deviation (hsdata): ' + str(np.sqrt(np.mean((hsdata[ppmlabel].apply(float) - fitfn(hsdata[mzlabel].apply(float), *popt)) ** 2)))) - - - # cut everything further than ppm_threshold from the trendline - data = data.loc[abs(data[ppmlabel].apply(float) - fitfn(data[mzlabel].apply(float), *popt)) < ppm_threshold] - - print('ppm threshold: ' + str(data.shape)) - print('mean score: ' + str(np.mean(data[scorelabel]))) - print('% 0: ' + str(100 * data[scorelabel].value_counts()[0] / len(data))) - print() - - print('m/z deviation (7ppm): ' + str(np.sqrt(np.mean((data[ppmlabel].apply(float) - fitfn(data[mzlabel].apply(float), *popt)) ** 2)))) - - # PLOTS - mzscatter(data, plots + '/scatter-7ppm.pdf', popt) - scorehist(data, "ppm: %d\n mean score: %1.5f\n %%0: %1.5f" %formatfill(data), plots + '/hist-scores-ppm.pdf') - - - # DISAMBIGUATION SECTION (REMOVE DUPLICATE PEPTIDES BASED ON SCORE) - - if rangelabel in data.columns: - data[startlabel] = data[rangelabel].str.split(expand=True)[0] - data[endlabel] = data[rangelabel].str.split(expand=True)[2] - data[startlabel] = data[startlabel].apply(int) - data[endlabel] = data[endlabel].apply(int) - - grouped = data.groupby([seqlabel, zlabel]) - topscores = pd.DataFrame(columns=data.columns) - ties = 0 - for name, group in grouped: - topscoring = group.loc[group[scorelabel] == max(group[scorelabel])] - if(len(topscoring[scorelabel]) > 1): - ties = ties + 1 - topscores = pd.concat([topscores, topscoring.loc[topscoring['Int.'] == max(topscoring['Int.'])]]) - topscores.reset_index(inplace=True, drop=True) - - print('ties: ' + str(ties)) - print('topscores: ' + str(topscores.shape)) - print('mean score: ' + str(np.mean(topscores[scorelabel]))) - print('% 0: ' + str(100 * topscores[scorelabel].value_counts()[0] / len(topscores))) - print('m/z deviation (topscores): ' + str(np.sqrt(np.mean((topscores[ppmlabel].apply(float) - fitfn(topscores[mzlabel].apply(float), *popt)) ** 2)))) - print() - - # PLOTS - scorehist(topscores, "topscores: %d\n mean score: %1.5f\n %%0: %1.5f" %formatfill(topscores), plots + '/hist-scores-topscores.pdf') - mzscatter(topscores, plots + '/scatter-topscore.pdf', popt) - - if score_floor: - topscores = topscores.loc[topscores[scorelabel] >= score_floor] - - # PLOTS - scorehist(topscores, "score-floor: %d\n mean score: %1.5f\n %%0: %1.5f" %formatfill(topscores), plots + '/hist-scores-scorefloor.pdf') - mzscatter(topscores, plots + '/scatter-scorefloor.pdf', popt) - - print('scorefloor: ' + str(topscores.shape)) - print('mean score: ' + str(np.mean(topscores[scorelabel]))) - print() - - topscores.sort_values(['Start', 'End'], inplace=True) - topscores.reset_index(inplace=True, drop=True) - - flagged = pd.DataFrame() - yellow = pd.DataFrame() - ties = 0 - overlaps = 0 - for i in topscores.index: - r = topscores.loc[i] - grp = topscores.loc[redflag(topscores, r)] - overlap = topscores.loc[yellowflag(topscores, r)] - r = topscores.loc[topscores.index == i] - # if not grp.empty: - # if r[scorelabel].iloc[0] == max(grp[scorelabel]): - # ties = ties + 1 - # if r[scorelabel].iloc[0] <= max(grp[scorelabel]): - # flagged = pd.concat([flagged, r]) - # continue - # if not overlap.empty: - # if overlap_method == 'drop' or overlap_method != 'keep' and r[scorelabel].iloc[0] <= max(overlap[scorelabel]): - # yellow = pd.concat([yellow, r]) - # overlaps = overlaps + 1 - - if not grp.empty: - if r[scorelabel].iloc[0] == max(grp[scorelabel]): - ties = ties + 1 - if r[scorelabel].iloc[0] <= select_score or (overlap_method != 'keep'): # max(grp[scorelabel]) > select_score and - print('redflag') - print(r[[mzlabel, mrlabel, scorelabel, startlabel, endlabel, seqlabel]]) - print(grp[[mzlabel, mrlabel, scorelabel, startlabel, endlabel, seqlabel]]) - print('\n\n\n') - flagged = pd.concat([flagged, r]) - continue - if not overlap.empty: - lens_overlap = [] - for j in overlap.index: - ovrow = overlap.loc[j] - lens_overlap.append(min(np.abs(int(ovrow[startlabel]) + skip_res - int(r[endlabel])), np.abs( - int(r[startlabel]) + skip_res - int(ovrow[endlabel])))) - ovfrac = min(lens_overlap) / (int(ovrow[endlabel]) - (int(ovrow[startlabel]) + skip_res)) - if r[scorelabel].iloc[0] <= select_score or (max(overlap[scorelabel]) > select_score and (overlap_method == 'drop' or overlap_method != 'keep' and ovfrac < ov_threshold)): - print('yellowflag') - print(r[[mzlabel, mrlabel, scorelabel, startlabel, endlabel, seqlabel]]) - print(overlap[[mzlabel, mrlabel, scorelabel, startlabel, endlabel, seqlabel]]) - print('\n\n\n') - yellow = pd.concat([yellow, r]) - overlaps = overlaps + 1 - - print('ties: ' + str(ties)) - print('overlaps: ' + str(overlaps)) - - # HANDLE REDFLAG PEPTIDES - - if not flagged.empty: - - print('flagged: ' + str(flagged.shape)) - print('mean score: ' + str(np.mean(flagged[scorelabel]))) - if 0 in flagged[scorelabel].values: - print('% 0: ' + str(100 * flagged[scorelabel].value_counts()[0] / len(flagged))) - else: - print('% 0: ' + str(0)) - print('m/z deviation (flagged): ' + str(np.sqrt(np.mean((flagged[ppmlabel].apply(float) - fitfn(flagged[mzlabel].apply(float), *popt)) ** 2)))) - print() - - # PLOTS - mzscatter(flagged, plots + '/scatter-flagged.pdf', popt) - scorehist(flagged, "flagged: %d\n mean score: %1.5f\n %%0: %1.5f" % formatfill(flagged), - plots + '/hist-scores-flagged.pdf') - - # HANDLE YELLOW FLAG PEPTIDES - - if not yellow.empty: - - print('yellow: ' + str(yellow.shape)) - print('mean score: ' + str(np.mean(yellow[scorelabel]))) - if 0 in yellow[scorelabel].values: - print('% 0: ' + str(100 * yellow[scorelabel].value_counts()[0] / len(yellow))) - else: - print('% 0: ' + str(0)) - print('m/z deviation (yellow): ' + str(np.sqrt(np.mean((yellow[ppmlabel].apply(float) - fitfn(yellow[mzlabel].apply(float), *popt)) ** 2)))) - - print() - - # PLOTS - mzscatter(yellow, plots + '/scatter-yellow.pdf', popt) - scorehist(yellow, "yellow: %d\n mean score: %1.5f\n %%0: %1.5f" % formatfill(yellow), - plots + '/hist-scores-yellow.pdf') - - clean = topscores.drop(flagged.index.union(yellow.index)) - - # SORT & SAVE TO CSV - - clean.sort_values([startlabel, endlabel], inplace=True) - if args.out: - clean.to_csv(args.out, index=False) - if args.rangeslist: - clean[[startlabel, endlabel]].to_csv(args.rangeslist, index=False) - - print('clean: ' + str(clean.shape)) - print('mean score: ' + str(np.mean(clean[scorelabel]))) - if not yellow.empty: - print('ambiguous: ' + str(overlaps - yellow.shape[0])) - if 0 in clean[scorelabel].values: - print('% 0: ' + str(100 * clean[scorelabel].value_counts()[0] / len(clean))) - else: - print('% 0: ' + str(0)) - print('m/z deviation (clean): ' + str(np.sqrt(np.mean((clean[ppmlabel].apply(float) - fitfn(clean[mzlabel].apply(float), *popt)) ** 2)))) - - # PLOTS - mzscatter(clean, plots + '/scatter-clean.pdf', popt) - scorehist(clean, "clean: %d\n mean score: %1.5f\n %%0: %1.5f" %formatfill(clean), plots + '/hist-scores-clean.pdf') if __name__ == '__main__': main() \ No newline at end of file diff --git a/pigeon_feather/pigeon_v3.py b/pigeon_feather/pigeon_v3.py deleted file mode 100644 index 573100a..0000000 --- a/pigeon_feather/pigeon_v3.py +++ /dev/null @@ -1,643 +0,0 @@ -import numpy as np -import pandas as pd -import scipy.sparse as sparse -import scipy.optimize as opt -import scipy.stats as stats -from pyteomics import mass -import matplotlib.pyplot as plt -import matplotlib.pyplot as plt -import matplotlib.colors as colors -from matplotlib import rcParams -from matplotlib import rc -import time -import argparse -import os - -# PARAMS - -# maximum charge of peptide ions in search -default_maxz = 3 -# length range for peptides in search (inclusive) -default_minl = 3 -default_maxl = 15 -# m/z error cutoff for initial search and for cut about trendline (in ppm) -default_ppmc_wide = 30 -default_ppmc_narrow = 7 -# score cutoff for truth peptides (for systematic error fit) -default_scorec = 0.05 -# fit to use for systematic error ('quad' or 'inv') -default_fit = 'quad' -# maximum iterations for systematic error fit -default_maxfev = 1000 -# types of fragment ions to consider in search -default_ions = ['b', 'b-H2O', 'b-NH3', 'y', 'y-H2O', 'y-NH3'] -# threshold in Da/e for matching fragments -default_threshold = 0.02 -# cutoff for matches to be considered degenerate, in m/z (Da/e) and RT (min) -default_mzc = 0.1 -default_rtc = 0.5 -# significance threshold for p-value for disambiguation -default_pvc = 0.05 -# method to use for disambiguation: keep or drop coeluting peptides? ('keep' or 'drop') -default_cut_method = 'drop' -# score floor (drop all below given match score) -default_sf = 0 - - -# FOR VECTORIZED FRAGMENT MATCHING -def match_fragment(mz1, mz2, threshold=default_threshold): - return abs(mz1 - mz2) <= threshold and (mz1 or mz2) - -vecmf = np.vectorize(match_fragment, otypes=[bool]) - - -class Pool: - def __init__(self, proteins, msms, fit=default_fit): - self.proteins = proteins - self.msms = msms - self.fit = fit - - def fitfn(self, x, a, b, c): - if self.fit == 'inv': - return c + a / (x - b) - return c + a * x + b * (x ** 2) - - # PLOTTING FNS - - def scorehist(self, matches, path=None, title=None, xmax=None, ymin=None, ymax=None, figsize=109.7): - rcParams['font.sans-serif'][7] = 'DejaVu Sans' - rcParams['font.sans-serif'][0] = 'Arial' - scores = [match.score for match in matches] - afont = {'size': 6} - rc('font', **afont) - - if not ymin: - ymin = 9e-1 - if not ymax: - ymax = len(matches) - if not xmax: - xmax = max(scores) - pt = 72 - figure, (ax1) = plt.subplots(1, 1, figsize=(figsize / pt, figsize / pt)) - plt.xlabel('Score') - plt.ylabel('n (matches)') - plt.ylim(ymin, ymax) - if title: - plt.title(title) - plt.hist(scores, log=True, bins=100, range=[0, xmax]) - plt.tight_layout() - if path: - plt.savefig(path) - plt.close(figure) - - def mzscatter(self, matches, path=None, popt=None, xmax=None, mzmin=None, mzmax=None, figsize=109.7): - rcParams['font.sans-serif'][7] = 'DejaVu Sans' - rcParams['font.sans-serif'][0] = 'Arial' - - afont = {'size': 6} - rc('font', **afont) - - pt = 72 - figure, (ax1) = plt.subplots(1, 1, figsize=(figsize / pt, figsize / pt)) - - mzs = np.asarray([match.pep.mz[match.z] for match in matches]) - ppms = np.asarray([match.ppmerr for match in matches]) - scores = np.asarray([match.score for match in matches]) - if not xmax: - xmax = max(scores) - if popt is not None: - plt.plot(np.sort(mzs), self.fitfn(np.sort(mzs), *popt), linewidth=0.5) - plt.scatter(mzs, ppms, c=scores + 1e-7, - norm=colors.LogNorm(vmin=1e-7, vmax=xmax), s=50 / pt, linewidth=0.05) - plt.colorbar(label='Score') - plt.xlabel('$\it{{m}}$/$\it{{z}}$ (D/e)') - plt.ylabel('mass error (ppm)') - plt.ylim(-30, 30) - if mzmin and mzmax: - plt.xlim(mzmin, mzmax) - plt.tight_layout() - if path: - plt.savefig(path) - plt.close(figure) - - # MAKE TABLE OF FEATURES FOR MATCHED PEPTIDES - def pool_output(self, matches, path=None): - cmpdnum = [match.peak.cmpdnum for match in matches] - runid = [match.peak.runid for match in matches] - sequence = [match.pep.sequence for match in matches] - start = [match.pep.start + 1 for match in matches] - end = [match.pep.end for match in matches] - - int = [match.peak.intensity for match in matches] - z = [match.z for match in matches] - m_calc = [match.pep.m for match in matches] - mz_calc = [match.pep.mz[match.z] for match in matches] - m_meas = [match.peak.mz * match.z for match in matches] - mz_meas = [match.peak.mz for match in matches] - ppm_err = [match.ppmerr for match in matches] - da_err = [match.peak.mz * match.z - match.pep.mz[match.z] for match in matches] - - rt = [match.peak.rt for match in matches] - n_hits = [match.fragmatch.size for match in matches] - n_tries = [np.prod(match.fragmatch.shape) for match in matches] - score = [match.score for match in matches] - - df = pd.DataFrame(data={'compound #': cmpdnum, 'run ID': runid, 'sequence': sequence, - 'Start': start, 'End': end, 'intensity': int, - 'z': z, 'mass (calc)': m_calc, 'm/z (calc)': mz_calc, - 'mass (meas)': m_meas, 'm/z (meas)': mz_meas, - 'error (ppm)': ppm_err, 'error (Da/e)': da_err, - 'RT (min)': rt, '# fragment matches': n_hits, - '# peaks * fragments': n_tries, 'score': score - }) - df.sort_values(by=['Start', 'End', 'z', 'score'], inplace=True) - if path: - df.to_csv(path, index=False) - return df - - # MAKE TABLE OF SEQUENCE RANGES FOR MATCHED PEPTIDES - def rangeslist(self, matches, path=None): - start = [match.pep.start + 1 for match in matches] - end = [match.pep.end for match in matches] - df = pd.DataFrame(data={'Start': start, 'End': end}) - df.sort_values(by=['Start', 'End'], inplace=True) - if path: - df.to_csv(path, index=False) - return df - - # LOG STATS ABOUT EACH STEP IN ANALYSIS - def nums_out(self, match_dict, popt, path=None): - out = '' - for key in match_dict: - n = len(match_dict[key]) - p_0 = len([match for match in match_dict[key] if not match.score]) / n * 100 - mean_score = np.mean([match.score for match in match_dict[key]]) - rmsdev = np.sqrt( - np.mean([(match.ppmerr - self.fitfn(match.pep.mz[match.z], *popt)) ** 2 for match in match_dict[key]])) - out = out + f'{key}: n={n}, %0={p_0}, mean_score={mean_score}, RMSD={rmsdev}\n' - if path: - with open(path, 'w') as file: - file.write(out) - return out - - # INITIAL MATCH COMPARING ALL PEAKS TO ALL THEORETICAL CHARGE STATES - def match(self, mzc=default_ppmc_wide, maxz=default_maxz): - self.matches = [] - for protein in self.proteins: - for pep in protein.peps: - for peak in self.msms.peaks: - if peak.z: - if peak.z in pep.mz: - if abs(pep.mz[peak.z] - peak.mz) <= mzc * pep.mz[peak.z] * 1e-6: - ppmerr = (peak.mz - pep.mz[peak.z]) / pep.mz[peak.z] * 1e6 - self.matches.append(Match(pep, peak, peak.z, ppmerr, self)) - else: - for z in range(1, maxz + 1): - if abs(pep.mz[z] - peak.mz) <= mzc * pep.mz[z] * 1e-6: - ppmerr = (peak.mz - pep.mz[z]) / pep.mz[z] * 1e6 - self.matches.append(Match(pep, peak, z, ppmerr, self)) - break - self.score_max = max([match.score for match in self.matches]) - - # CUT ALL BUT THOSE PASSING HIGH SCORE CUTOFF, FIT TRENDLINE FOR SYSTEMATIC ERROR - def truth_cut(self, matches, score_c=default_scorec, maxfev=default_maxfev, fit=default_fit): - self.fit = fit - best_peaks = [] - mzs = [] - ppms = [] - scores = [] - for match in matches: - if match.score >= score_c: - best_peaks.append(match) - mzs.append(match.pep.mz[match.z]) - ppms.append(match.ppmerr) - scores.append(match.score) - popt, pcov = opt.curve_fit(self.fitfn, mzs, ppms, maxfev=maxfev, - bounds=([-np.inf, -np.inf, -np.inf], - [np.inf, min(mzs) - 100 if fit == 'inv' else np.inf, np.inf])) - return best_peaks, popt, pcov - - # CUT ALL FARTHER THAN ppm_c FROM TRENDLINE - def ppm_cut(self, popt, matches, ppm_c=default_ppmc_narrow): - return [match for match in matches if abs(match.ppmerr - self.fitfn(match.pep.mz[match.z], *popt)) < ppm_c] - - # CUT ALL BUT BEST MATCH FOR EACH PEPTIDE - def best_peak_cut(self, matches): - best_peaks = [] - peps = list(set([match.pep for match in matches])) - for pep in peps: - maxscore = max([match.score for match in pep.matches if match in matches]) - best = [match for match in pep.matches if match.score >= maxscore and match in matches] - if len(best) > 1: - maxint = max([match.peak.intensity for match in best]) - best = [match for match in best if match.peak.intensity >= maxint] - best_peaks = best_peaks + best - # self.matches = best_peaks - return best_peaks - - # CUT ALL BELOW GIVEN SCORE (OPTIONAL) - def scorefloor_cut(self, matches, score_f=default_sf): - return [match for match in matches if match.score >= score_f] - - # CUT USING FRAGMENT-BASED DISAMBIGUATION - def fragment_cut(self, matches, allmatches, threshold=default_threshold, pvc=0, mzc=default_mzc, rtc=default_rtc, - cut_method=default_cut_method): - degen_dict = {} - best_peaks = [] - exclude = [] - for match1 in matches: - print() - print() - print('match1:') - print(f'{match1.pep.sequence}-{match1.pep.start+1}-{match1.pep.end}+{match1.z}-{match1.pep.mz[match1.z]}-{match1.peak.runid}-cmpd{match1.peak.cmpdnum}') - print() - degen = [match2 for match2 in allmatches - if (np.abs(match1.pep.mz[match1.z] - match2.pep.mz[match2.z]) < mzc - and np.abs(match1.peak.rt - match2.peak.rt) < rtc - and (match1.pep.sequence != match2.pep.sequence - or match1.pep.start != match2.pep.start - or match1.pep.end != match2.pep.end))] - degen_dict[match1] = degen - if not degen: - print('not degen. keeping') - best_peaks.append(match1) - elif match1.fragmatch.size == 0: - exclude.append(match1) - print('no support, degen. dropping.') - else: - same_pep = [match2 for match2 in allmatches - if (np.abs(match1.pep.mz[match1.z] - match2.pep.mz[match2.z]) < mzc - and np.abs(match1.peak.rt - match2.peak.rt) < rtc - and match1.pep.sequence == match2.pep.sequence)] - flag = False - - pvs = [] - match1_mzs = match1.fragmatch.sum(0) * match1.pep.fragpeaks - mask = np.ones(match1_mzs.shape) - print('degen:') - for match2 in degen: - match2_mzs = match2.fragmatch.sum(0) * match2.pep.fragpeaks - same_frags_1 = sparse.coo_array( - vecmf(match1_mzs, match2.pep.fragpeaks.reshape(1, match2.pep.fragpeaks.shape[0]).T, - threshold=threshold)) - same_frags_2 = sparse.coo_array( - vecmf(match2_mzs, match1.pep.fragpeaks.reshape(1, match1.pep.fragpeaks.shape[0]).T, - threshold=threshold)) - mask1 = np.logical_not(same_frags_1.sum(0)) - mask = np.logical_and(mask, mask1) - mask2 = np.logical_not(same_frags_2.sum(0)) - print(f'{match2.pep.sequence}-{match2.pep.start+1}-{match2.pep.end}+{match2.z}-{match2.pep.mz[match2.z]}-{match2.peak.runid}-cmpd{match2.peak.cmpdnum}') - print(match2_mzs) - print(mask2 * match2_mzs) - if not (mask1 * match1_mzs).any(): - flag = True - print('no unique support, degen. dropping.') - break - elif cut_method == 'drop': - n_unique = np.count_nonzero(mask2 * match2_mzs) - if not pvc and n_unique: - flag = True - print('(WRONG) 1 hit, dropping') - break - elif pvc: - maxmz = max([max(match2.pep.fragpeaks), max(match2.peak.ms2s)]) - minmz = min([min(match2.pep.fragpeaks), min(match2.peak.ms2s)]) - prob = match2.peak.ms2s.size * 2 * threshold / (maxmz - minmz) - pvs.append( - stats.binomtest(n_unique, match2.pep.fragpeaks.size, prob, alternative='greater').pvalue) - print(pvs[-1]) - - print() - print('same_pep:') - if flag: - exclude.append(match1) - else: - if not pvc: - print('(WRONG) no hits, keeping') - best_peaks.append(match1) - else: - samepvs = [] - for match2 in same_pep: - match2_mzs = match2.fragmatch.sum(0) * match2.pep.fragpeaks - print(f'{match2.pep.sequence}-{match2.pep.start+1}-{match2.pep.end}+{match2.z}-{match2.pep.mz[match2.z]}-{match2.peak.runid}-cmpd{match2.peak.cmpdnum}') - print(match2_mzs) - print(mask * match2_mzs) - n_unique = np.count_nonzero(mask * match2_mzs) - maxmz = max([max(match2.pep.fragpeaks), max(match2.peak.ms2s)]) - minmz = min([min(match2.pep.fragpeaks), min(match2.peak.ms2s)]) - prob = match2.peak.ms2s.size * 2 * threshold / (maxmz - minmz) - samepvs.append( - stats.binomtest(n_unique, match2.pep.fragpeaks.size, prob, alternative='greater').pvalue) - print(samepvs[-1]) - if stats.combine_pvalues(samepvs).pvalue >= pvc or ( - cut_method == 'drop' and stats.combine_pvalues(pvs).pvalue <= pvc): - exclude.append(match1) - print('pvalue, dropping') - else: - print('pvalue, keeping') - best_peaks.append(match1) - print(samepvs) - print(stats.combine_pvalues(samepvs).pvalue) - print() - print(pvs) - print(stats.combine_pvalues(pvs).pvalue) - print() - return best_peaks, exclude, degen_dict - - -class Match: - def __init__(self, pep, peak, z, ppmerr, pool): - self.pep = pep - self.peak = peak - self.z = z - self.ppmerr = ppmerr - self.pool = pool - self.fragmatch = sparse.coo_array(vecmf(pep.fragpeaks, peak.ms2s.reshape(1, peak.ms2s.shape[0]).T)) - self.score = self.fragmatch.size ** 2 / np.prod(self.fragmatch.shape) if np.prod(self.fragmatch.shape) > 0 else 0 - pep.matches.append(self) - peak.matches.append(self) - - def remove(self): - pep.matches.remove(self) - peak.matches.remove(self) - pool.matches.remove(self) - - -class Protein: - def __init__(self, sequence, minl=default_minl, maxl=default_maxl, maxz=default_maxz, ions=default_ions): - self.sequence = sequence - self.peps = [] - self.make_peps(minl=minl, maxl=maxl) - for pep in self.peps: - pep.get_mass(maxz=maxz) - pep.get_fragments(maxz=maxz, ions=ions) - - def make_peps(self, minl=default_minl, maxl=default_maxl): - for start in range(len(self.sequence) - minl): - for end in range(start + minl, min(len(self.sequence) + 1, start + maxl + 1)): - self.peps.append(Peptide(self.sequence[start:end], start, end)) - - -class Peptide: - def __init__(self, sequence, start, end): - self.sequence = sequence - self.start = start - self.end = end - self.matches = [] - - def get_mass(self, maxz=default_maxz): - self.m = mass.calculate_mass(sequence=self.sequence) - self.mz = {} - for z in range(1, maxz + 1): - self.mz[z] = mass.calculate_mass(sequence=self.sequence, charge=z) - - def get_fragments(self, maxz=default_maxz, ions=default_ions): - self.fragments = [] - for ion in ions: - for i in range(len(self.sequence)): - for z in range(1, maxz + 1): - if ion[0] == 'b': - seq = self.sequence[0:i + 1] - self.fragments.append(Fragment(len(self.fragments), seq, 0, i, ion, - mass.calculate_mass(sequence=seq, ion_type=ion, charge=z), z)) - else: - seq = self.sequence[i:] - self.fragments.append(Fragment(len(self.fragments), seq, i, len(self.sequence) - 1, ion, - mass.calculate_mass(sequence=seq, ion_type=ion, charge=z), z)) - self.fragpeaks = np.empty(len(self.fragments), np.float64) - for frag in range(len(self.fragments)): - self.fragpeaks[frag] = self.fragments[frag].mz - - -class Fragment: - def __init__(self, i, seq, start, end, ion, mz, z): - self.index = i - self.seq = seq - self.start = start - self.end = end - self.ion = ion - self.mz = mz - self.z = z - - -class MSMS: - def __init__(self): - self.peaks = [] - self.runids = [] - - def first(self, s, l): - for i in range(len(l)): - if s == l[i]: - return i - - # MGF FORMAT-SPECIFIC I/O - def read_block(self, block, runid): - z = 0 - mz = np.nan - cmpdnum = np.nan - rt = np.nan - ms1int = np.nan - - ms2s = [] - for line in block: - if 'CHARGE=' in line: - z = int(line.split('CHARGE=')[1].split('+')[0]) - if 'RTINSECONDS=' in line: - rt = np.float64(line.split('=')[1]) / 60 - if 'TITLE=' in line: - if 'TITLE=Cmpd' in line: - cmpdnum = int(line.split('TITLE=Cmpd ')[1].split(',')[0]) - if 'scan=' in line: - cmpdnum = int(line.split('scan=')[1].split('"')[0]) - if 'PEPMASS=' in line: - if len(line.split()) > 1: - ms1int = np.float64(line.split()[1]) - mz = np.float64(line.split()[0].split('=')[1]) - if line[0].isdigit(): - ms2s.append(line) - ms2ints = np.asarray([np.float64(line.split()[1]) for line in ms2s]) - ms2mzs = np.asarray([np.float64(line.split()[0]) for line in ms2s]) - return Peak(runid, z, mz, cmpdnum, rt, ms1int, ms2ints, ms2mzs) - - def read_mgf(self, paths): - for path in paths: - f = open(path, 'r') - lines = f.readlines() - f.close() - lines = [line for line in lines if 'MaxRes' not in line and '###' not in line] - header = lines[0:self.first('BEGIN IONS\n', lines)] - data = lines[self.first('BEGIN IONS\n', lines):] - startpoints = [i for i in range(len(data)) if 'BEGIN IONS' in data[i]] - ms1blocks = [data[startpoints[i]:startpoints[i + 1]] for i in range(len(startpoints) - 1)] + [ - data[startpoints[-1]:]] - for block in ms1blocks: - self.peaks.append(self.read_block(block, path)) - self.runids.append(path) - - -class Peak: - def __init__(self, runid, z, mz, cmpdnum, rt, ms1int, ms2ints, ms2mzs): - self.runid = runid - self.z = z - self.mz = mz - self.cmpdnum = cmpdnum - self.rt = rt - self.intensity = ms1int - self.ms2ints = ms2ints - self.ms2s = ms2mzs - self.matches = [] - - -# END-TO-END MATCHING + DISAMBIGUATION - -def analyze(seqs, files, maxz=default_maxz, minl=default_minl, maxl=default_maxl, ppmc_wide=default_ppmc_wide, - ppmc_narrow=default_ppmc_narrow, scorec=default_scorec, fit=default_fit, maxfev=default_maxfev, - ions=default_ions, threshold=default_threshold, mzc=default_mzc, rtc=default_rtc, pvc=default_pvc, - cut_method=default_cut_method, sf=default_sf, - path='./', plots_dir=None, pool_out=None, rangeslist=None, nums_out=None): - - t0 = time.time() - pool = Pool([Protein(seq, minl=minl, maxl=maxl, maxz=maxz, ions=ions) for seq in seqs], MSMS()) - print(f'Generating sequences {time.time() - t0}') - - t0 = time.time() - pool.msms.read_mgf(files) - print(f'Reading files: {time.time() - t0}') - - t0 = time.time() - pool.match(mzc=ppmc_wide, maxz=maxz) - print(f'Matching: {time.time() - t0}') - - t0 = time.time() - truth, popt, pcov = pool.truth_cut(pool.matches, score_c=scorec, maxfev=maxfev, fit=fit) - print(f'Truth cut: {time.time() - t0}') - - t0 = time.time() - ppmc = pool.ppm_cut(popt, pool.matches, ppm_c=ppmc_narrow) - print(f'PPM cut: {time.time() - t0}') - - t0 = time.time() - best = pool.best_peak_cut(ppmc) - print(f'single-peak cut: {time.time() - t0}') - - t0 = time.time() - scorefloor = pool.scorefloor_cut(best, score_f=sf) - print(f'scorefloor cut: {time.time() - t0}') - - t0 = time.time() - disambig, ambig, degen_dict = pool.fragment_cut(scorefloor, ppmc, threshold=threshold, pvc=pvc, - mzc=mzc, rtc=rtc, cut_method=cut_method) - print(f'disambiguation cut: {time.time() - t0}') - - t0 = time.time() - match_dict = {'all': pool.matches, 'truth': truth, 'ppm_cut': ppmc, 'single': best, 'clean': disambig, - 'ambig': ambig} - if sf: - match_dict['scorefloor'] = scorefloor - if plots_dir: - if not os.path.isdir(path + plots_dir): - os.mkdir(path + plots_dir) - xmax = max([match.score for match in pool.matches]) - mzmin = min([match.pep.mz[match.z] for match in pool.matches]) - mzmax = max([match.pep.mz[match.z] for match in pool.matches]) - for step in match_dict: - pool.mzscatter(match_dict[step], popt=popt, xmax=xmax, mzmin=mzmin, mzmax=mzmax, - path=f'{path}{plots_dir}/{step}-scatter.pdf') - pool.scorehist(match_dict[step], title=step, ymax=len(pool.matches), xmax=xmax, - path=f'{path}{plots_dir}/{step}-hist.pdf') - if rangeslist: - for i, rl in enumerate(rangeslist): - ml = [match for match in disambig if seqs[i][match.pep.start:match.pep.end] == match.pep.sequence] - pool.rangeslist(ml, path=f'{path}{rl}') - if pool_out: - for i, ol in enumerate(pool_out): - ml = [match for match in disambig if seqs[i][match.pep.start:match.pep.end] == match.pep.sequence] - pool.pool_output(ml, path=f'{path}{ol}') - if nums_out: - nums = pool.nums_out(match_dict, popt, path=f'{path}{nums_out}') - - print(f'plots + output: {time.time() - t0}') - if nums_out: - print(nums) - return pool, match_dict - - -def parse_args(): - # CMD LINE ARGS CODE - - parser = argparse.ArgumentParser(description='PIGEON: match peptides and disambiguate duplicates') - parser.add_argument('--s', '--seq', '--sequence', dest='sequence', help="input sequence", nargs='+', required=True) - parser.add_argument('--f', '--files', dest='files', help="paths to input files", nargs='+', required=True) - - parser.add_argument('--path', dest='path', help="path for outputs") - parser.add_argument('--p', '--plots', dest='plots', help="directory for output plots") - parser.add_argument('--o', '--output', dest='output', nargs='+', help="pool output") - parser.add_argument('--r', '--rangeslist', dest='rangeslist', nargs='+', help="rangeslist output") - parser.add_argument('--n', '--nums', dest='nums', help="diagnostic stats output") - - parser.add_argument('--maxz', dest='maxz', type=float, help='max charge for peptides') - parser.add_argument('--minl', dest='minl', type=float, help='min length for peptides') - parser.add_argument('--maxl', dest='maxl', type=float, help='max length for peptides') - parser.add_argument('--ppmc_w', dest='ppmc_wide', type=float, help='ppm error cutoff for initial match') - parser.add_argument('--ppmc_n', dest='ppmc_narrow', type=float, help='ppm cutoff after curve fit') - parser.add_argument('--scorec', dest='scorec', type=float, help='score threshold for initial curve fit') - parser.add_argument('--fit', dest='fit', help='how to fit systematic error: quad or inv?') - parser.add_argument('--maxfev', dest='maxfev', type=int, help='maximum iterations for systematic error fit') - - parser.add_argument('--ions', dest='ions', help="which ions to consider in match", nargs='+') - parser.add_argument('--ft', '--threshold', dest='threshold', type=float, - help='threshold (Da/e) for fragment matches') - parser.add_argument('--mzc', dest='mzc', type=float, help='m/z cutoff for duplicates') - parser.add_argument('--rtc', dest='rtc', type=float, help='RT cutoff for duplicates') - parser.add_argument('--pvc', dest='pvc', type=float, help='p value cutoff for duplicates') - parser.add_argument('--c', '--method', '--cm', dest='cut_method', help="how to treat coeluting peptides: keep or drop?") - parser.add_argument('--scorefloor', '--sf', dest='scorefloor', type=float, help='minimum score for score cut') - - args = parser.parse_args() - - return args - - -# FOR CMD LINE - -def main(): - args = parse_args() - print(args) - - seqs = args.sequence - files = args.files - - maxz = args.maxz if args.maxz else default_maxz - minl = args.minl if args.minl else default_minl - maxl = args.maxl if args.maxl else default_maxl - ppmc_wide = args.ppmc_wide if args.ppmc_wide else default_ppmc_wide - ppmc_narrow = args.ppmc_narrow if args.ppmc_narrow else default_ppmc_narrow - scorec = args.scorec if args.scorec else default_scorec - fit = args.fit if args.fit else default_fit - maxfev = args.maxfev if args.maxfev else default_maxfev - - ions = args.ions if args.ions else default_ions - threshold = args.threshold if args.threshold else default_threshold - mzc = args.mzc if args.mzc else default_mzc - rtc = args.rtc if args.rtc else default_rtc - pvc = args.pvc if args.pvc else default_pvc - cut_method = args.cut_method if args.cut_method else default_cut_method - sf = args.scorefloor if args.scorefloor else default_sf - - path = args.path if args.path else '' - plots_dir = args.plots if args.plots else None - pool_out = args.output if args.output else None - rangeslist = args.rangeslist if args.rangeslist else None - nums_out = args.nums if args.nums else None - - analyze(seqs, files, maxz=maxz, minl=minl, maxl=maxl, ppmc_wide=ppmc_wide, - ppmc_narrow=ppmc_narrow, scorec=scorec, fit=fit, maxfev=maxfev, - ions=ions, threshold=threshold, mzc=mzc, rtc=rtc, pvc=pvc, - cut_method=cut_method, sf=sf, - path=path, plots_dir=plots_dir, pool_out=pool_out, - rangeslist=rangeslist, nums_out=nums_out) - return - - -if __name__ == '__main__': - main() \ No newline at end of file