-
Notifications
You must be signed in to change notification settings - Fork 1
/
low-input-cell.py
101 lines (80 loc) · 5.45 KB
/
low-input-cell.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
# Global plot appearance: request the Arial font, then apply the "ticks" theme.
# NOTE(review): sns.set is documented as an alias of sns.set_theme, so the
# second call may reset the font requested by the first -- confirm that Arial
# is still in effect in the rendered figures.
sns.set(font="Arial")
sns.set_theme(style="ticks")

# Fixed colour per tool/series, picked by position from the active seaborn
# palette, so a given tool gets the same colour wherever this mapping is used.
_active_palette = sns.color_palette()
palette_dict = {
    series: _active_palette[slot]
    for series, slot in (
        ("publication", 9),
        ("diaumpire", 6),
        ("diaumpire hybrid", 8),
        ("spectronaut", 4),
        ("diann", 3),
        ("maxdia", 2),
        ("msfraggerdia", 0),
        ("msfragger hybrid", 1),
    )
}
def _cv_table(rows, tool_name, kind, min_valid):
    """Build the CV table for one subset of rows.

    Rows with fewer than *min_valid* non-missing values are dropped first.
    The CV is the row-wise NaN-ignoring standard deviation (NumPy default,
    ``ddof=0``) times 100, divided by the row-wise NaN-ignoring mean.
    """
    kept = rows.dropna(axis=0, thresh=min_valid)
    cv = np.nanstd(kept, 1) * 100 / np.nanmean(kept, 1)
    # "tool" and "type" are scalars; pandas broadcasts them to len(cv).
    return pd.DataFrame({"cv": cv, "tool": tool_name, "type": kind})


def process(input_df, overlap_df, other_union_df, tool_name, min_valid=3):
    """Compute per-row CVs (in percent) for the overlap and unique subsets.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Table whose index holds the row labels and whose columns hold the
        values the CV is computed across (presumably replicate intensities;
        confirm against the caller).
    overlap_df : list-like of labels
        Labels selected from ``input_df`` with ``.loc`` to form the
        "overlap" subset.
    other_union_df : list-like of labels
        Labels to exclude; rows of ``input_df`` whose index is NOT in this
        collection form the "unique" subset.
    tool_name : str
        Value written to the ``tool`` column of both result tables.
    min_valid : int, optional
        Minimum number of non-missing values a row needs in order to be
        kept. Defaults to 3, the threshold previously hard-coded here.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(overlap_cv, unique_cv)``; each has the columns ``cv``, ``tool``
        and ``type`` (``"overlap"`` or ``"unique"``) and a default integer
        index (the original row labels are not carried over).
    """
    overlap_rows = input_df.loc[overlap_df]
    unique_rows = input_df.loc[~input_df.index.isin(other_union_df)]
    return (
        _cv_table(overlap_rows, tool_name, "overlap", min_valid),
        _cv_table(unique_rows, tool_name, "unique", min_valid),
    )
# Every input path below is relative to this results directory.
os.chdir(r"G:\Dropbox\papers_Fengchao\msfragger_dia\script\results")

# plot protein barplot
# Each list below has one entry per experiment: position i in every list
# belongs to experiment_list[i].
_dataset_dirs = ("low-input-cell_0.75ng_1.5ng", "low-input-cell_7.5ng_1ug")

diaumpire_path_list = [folder + r"\diaumpire\protein_maxlfq.tsv" for folder in _dataset_dirs]
diann_path_list = [folder + r"\diann\protein_maxlfq.tsv" for folder in _dataset_dirs]
maxdia_path_list = [folder + r"\maxdia\combined\txt\proteinGroups.txt" for folder in _dataset_dirs]
msfraggerdia_path_list = [folder + r"\msfraggerdia\protein_maxlfq.tsv" for folder in _dataset_dirs]

experiment_list = ["0.75 ng", "7.5 ng"]

# Columns read from the protein_maxlfq.tsv tables: the unnamed first column
# (used as the index by the reader) followed by three run columns.
msfraggerdia_cols = [
    [
        "Unnamed: 0",
        "20210430_PC9-750pg_60K-IT118-DIA-IW10_01",
        "20210430_PC9-750pg_60K-IT118-DIA-IW10_02",
        "20210430_PC9-750pg_60K-IT118-DIA-IW10_03",
    ],
    [
        "Unnamed: 0",
        "20210430_PC9-7500pg_60K-IT118-DIA-IW10_01",
        "20210430_PC9-7500pg_60K-IT118-DIA-IW10_02",
        "20210430_PC9-7500pg_60K-IT118-DIA-IW10_03",
    ],
]
# Bar colours, in the same left-to-right order as the tool columns built below.
tool_palette = [palette_dict[key] for key in ("diaumpire", "diann", "maxdia", "msfraggerdia")]

# For each experiment: load every tool's protein table, keep proteins that are
# quantified in all runs, and draw two overlaid bar charts per tool -- the
# total number of proteins (translucent) and the number with CV < 20% (solid).
for expIdx, experiment in enumerate(experiment_list):
    sample_cols = msfraggerdia_cols[expIdx]

    # Zeros, empty strings and "NA"/"NaN" are all read as missing values.
    diaumpire = pd.read_csv(diaumpire_path_list[expIdx], sep="\t", index_col=0, na_values=[0, "", "NA"], header=0, usecols=sample_cols)
    diann = pd.read_csv(diann_path_list[expIdx], sep="\t", index_col=0, na_values=[0, "", "NA"], header=0, usecols=sample_cols)
    maxdia = pd.read_csv(maxdia_path_list[expIdx], sep="\t", index_col=0, na_values=[0, "", "NaN"], header=0, usecols=["Protein IDs", "Intensity 1", "Intensity 2", "Intensity 3", "Reverse", "Potential contaminant"])
    msfraggerdia = pd.read_csv(msfraggerdia_path_list[expIdx], sep="\t", index_col=0, na_values=[0, "", "NA"], header=0, usecols=sample_cols)

    # remove rows with missing values to calculate CVs
    diaumpire = diaumpire.dropna(how="any")
    diann = diann.dropna(how="any")
    # MaxDIA only: keep rows whose "Reverse" and "Potential contaminant" cells
    # are empty, then discard those two flag columns. Rebinding (instead of
    # inplace=True on a .loc slice) avoids mutating a possible view.
    unflagged = pd.isna(maxdia["Reverse"]) & pd.isna(maxdia["Potential contaminant"])
    maxdia = maxdia.loc[unflagged].drop(columns=["Reverse", "Potential contaminant"]).dropna(how="any")
    msfraggerdia = msfraggerdia.dropna(how="any")

    # Collapse duplicated index labels, keeping the per-column maximum.
    diaumpire = diaumpire.groupby(level=0).max()
    diann = diann.groupby(level=0).max()
    maxdia = maxdia.groupby(level=0).max()
    msfraggerdia = msfraggerdia.groupby(level=0).max()

    total_counts = pd.DataFrame({"FP-DIAU": len(diaumpire.index), "DIA-NN\nlib-free": len(diann.index), "MaxDIA": len(maxdia.index), "FP-MSF": len(msfraggerdia.index)}, index=[0])

    # Row-wise coefficient of variation in percent (NaN-ignoring, ddof=0).
    diaumpire_cv = np.nanstd(diaumpire, 1) * 100 / np.nanmean(diaumpire, 1)
    diann_cv = np.nanstd(diann, 1) * 100 / np.nanmean(diann, 1)
    maxdia_cv = np.nanstd(maxdia, 1) * 100 / np.nanmean(maxdia, 1)
    msfraggerdia_cv = np.nanstd(msfraggerdia, 1) * 100 / np.nanmean(msfraggerdia, 1)

    # Number of proteins with CV < 20% per tool. MaxDIA previously used
    # len(...), which counted every protein regardless of its CV.
    total_counts2 = pd.DataFrame({"FP-DIAU": np.count_nonzero(diaumpire_cv < 20), "DIA-NN\nlib-free": np.count_nonzero(diann_cv < 20), "MaxDIA": np.count_nonzero(maxdia_cv < 20), "FP-MSF": np.count_nonzero(msfraggerdia_cv < 20)}, index=[0])

    # Translucent bars (all proteins) first, solid bars (CV < 20%) on top.
    sns_plot = sns.barplot(data=total_counts, palette=tool_palette, alpha=0.6)
    sns_plot = sns.barplot(data=total_counts2, palette=tool_palette)
    sns_plot.set(xlabel=None)
    sns_plot.set_ylabel("quantified proteins", fontsize=13)
    sns_plot.set_xticklabels(sns_plot.get_xticklabels(), fontsize=11)
    sns_plot.set_title(experiment, fontsize=15)
    sns_plot.figure.savefig(f"low_input_cell_protein_{experiment}.pdf")

    # Start a fresh figure so the next plot is not drawn over this one.
    plt.figure()