Skip to content

Commit

Permalink
Reviews: reviewer 2
Browse files Browse the repository at this point in the history
  • Loading branch information
marakeby committed Apr 8, 2021
1 parent 9452cb7 commit f7f2284
Show file tree
Hide file tree
Showing 11 changed files with 207 additions and 32 deletions.
2 changes: 1 addition & 1 deletion analysis/figure_1/figure_1_d_auc_prc.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def plot_prc(ax, y_test, y_pred_score, save_dir, color, label=''):

pnet_base_dir = join(base_dir , 'pnet/onsplit_average_reg_10_tanh_large_testing')
df_pnet = pd.read_csv(join(pnet_base_dir, 'P-net_ALL_testing.csv'), sep=',', index_col=0, header=[0, 1])
all_models_dict['P-net'] = df_pnet
all_models_dict['P-NET'] = df_pnet
n = len(models)+1

def plot_prc_all(ax):
Expand Down
4 changes: 3 additions & 1 deletion analysis/figure_1/figure_1_e_confusion_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,12 @@ def plot_confusion_matrix_all(ax):
models_base_dir = join(base_dir, 'onsplit_average_reg_10_tanh_large_testing')
filename = join(models_base_dir, 'P-net_ALL_testing.csv')
df = pd.read_csv(filename, index_col=0)
df.pred = df.pred_scores > 0.5

# df.pred = df.pred_scores > 0.5
df.head()

y_t = df.y

y_pred_test = df.pred
cnf_matrix = confusion_matrix(y_t, y_pred_test)
print cnf_matrix
Expand Down
2 changes: 1 addition & 1 deletion analysis/figure_2/figure_2_a_pnet_vs_dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def plot_compaison(ax1, label, df_pnet, df_dense):
# ax1.set_xlabel('Number of samples', fontdict=dict(weight='bold', fontsize=10))
ax1.set_ylabel(label, fontdict=dict(family='Arial',weight='bold', fontsize=14))

ax1.legend(['P-net', 'Dense'], fontsize=8, loc= 'upper left')
ax1.legend(['P-NET', 'Dense'], fontsize=8, loc= 'upper left')

ax1.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
# ax1.grid()
Expand Down
5 changes: 3 additions & 2 deletions analysis/figure_2/figure_2_b_external_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def plot_stacked(ax, filename, correct, wrong):
top_strs = ['%1.2f' % i + '%' for i in top]

table = ax.table(cellText=[bottom_strs, top_strs],
rowLabels=['True Rate', 'Error Rate'],
rowLabels=['True Ra te', 'Error Rate'],
rowColours=selected_colors,
colLabels=labels,
loc='bottom', fontsize=14, cellLoc='center')
Expand Down Expand Up @@ -125,7 +125,8 @@ def plot_external_validation_all(ax):

def plot_external_validation_matrix(ax):
normalize = True
labels = np.array([['TR', 'TR'], ['ER ', 'ER']])
# labels = np.array([['TR', 'TR'], ['ER ', 'ER']])
labels = np.array([['TN', 'FP'], ['FN ', 'TP']])
cmap = plt.cm.Reds
cm = np.array([primary, mets])

Expand Down
18 changes: 15 additions & 3 deletions data/data_access_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,25 @@
data_params = {'id': 'ALL', 'type': 'prostate_paper',
'params': {
# 'data_type': ['mut_important', 'cnv_del', 'cnv_amp', 'gene_expression'],
'data_type': ['mut_important', 'cnv_del', 'cnv_amp'],
# 'data_type': ['mut_important_plus_hotspots'],
# 'data_type': ['mut_important'],
# 'data_type': ['mut_important', 'cnv_del', 'cnv_amp'],
# 'data_type': ['mut_important_plus_hotspots', 'cnv_del', 'cnv_amp'],
'data_type': ['mut_hotspots', 'cnv_del', 'cnv_amp'],
# 'data_type': ['mut_important', 'cnv_del', 'fusion_genes'],
# 'data_type': ['mut_important', 'gene_expression'],
# 'data_type': ['CNV_burden', 'TMB'],
# 'account_for_data_type' : ['fusions'],
'account_for_data_type' : None,
'drop_AR': False,
'cnv_levels': 3,
'mut_binary': True,
'mut_binary': False,
'balanced_data': False,
'combine_type': 'union', # intersection
'use_coding_genes_only': True,
'selected_genes': selected_genes,
'selected_samples': selected_samples,
# 'selected_samples': selected_samples,
'selected_samples': None,
'training_split': 0,
}
}
Expand All @@ -35,5 +40,12 @@

print columns.levels
print x_train.shape, x_test.shape, y_train.shape, y_test.shape
print x_train.sum().sum()


x,y, info, columns = data_adapter.get_data()
x_df = pd.DataFrame(x, columns = columns, index=info )
print x_df.shape
print x_df.sum().sum()
# print x_train_df['genomics'].shape
# print x_train_df['account_for'].shape
6 changes: 3 additions & 3 deletions data/gmt_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def load_data(self, filename, genes_col=1, pathway_col=0):

data_list = gmt.readlines()

print data_list[0]
# print data_list[0]
for row in data_list:
genes = row.strip().split('\t')
genes = [re.sub('_copy.*', '', g) for g in genes]
Expand All @@ -26,7 +26,7 @@ def load_data(self, filename, genes_col=1, pathway_col=0):
data_dict_list.append(dict)

df = pd.DataFrame(data_dict_list)
print df.head()
# print df.head()

return df

Expand All @@ -37,7 +37,7 @@ def load_data_dict(self, filename):
with open(os.path.join(data_dir, filename)) as gmt:
data_list = gmt.readlines()

print data_list[0]
# print data_list[0]
for row in data_list:
genes = row.split('\t')
dict[genes[0]] = genes[2:]
Expand Down
2 changes: 1 addition & 1 deletion data/prostate_paper/data_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def load_data_type(data_type='gene', cnv_levels=5, cnv_filter_single_event=True,
x[x == -1.] = 0.0
x[x == -2.] = 1.0
x[x == 1.] = 0.0
x[x == -2.] = 1.0
x[x == 2.] = 1.0
else:
x[x < 0.] = -1.
x[x > 0.] = 1.
Expand Down
33 changes: 33 additions & 0 deletions review/17-mutual_genes/mpschr-mutex-b1898f6/plot_upset.ipynb

Large diffs are not rendered by default.

20 changes: 1 addition & 19 deletions review/4-TMB_and_CNV_burden/TMB and CNV burden.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -662,26 +662,8 @@
]
},
{
"cell_type": "code",
"execution_count": 76,
"cell_type": "raw",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'accuracy': 0.601010101010101,\n",
" 'auc': 0.7753736046418973,\n",
" 'aupr': 0.7084550561135279,\n",
" 'f1': 0.5647382920110192,\n",
" 'precision': 0.43897216274089934,\n",
" 'recall': 0.7915057915057915}"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evalualte(y_test, pred>=0.2, y_pred_score=pred)"
]
Expand Down
1 change: 0 additions & 1 deletion review/9-hotspot/compare_auc.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ def plot_auc_all(all_models_dict, ax):
# sort based on area under prc
n = len(all_models_dict.keys())
colors = sns.color_palette(None, n)

sorted_dict = sort_dict(all_models_dict)
for i, k in enumerate(sorted_dict.keys()):
print('model {} , auc= {}'.format(k, sorted_dict[k]))
Expand Down
146 changes: 146 additions & 0 deletions utils/stats_utils_delong_xu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import pandas as pd
import numpy as np
import scipy.stats
'''
MIT License
Copyright (c) 2021 Nikita Kazeev
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
'''
# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(x):
    """Computes midranks.

    Every group of tied values receives the average of the 1-based ranks
    that the group spans in the sorted order.

    Args:
       x - a 1D numpy array (any 1D array-like is accepted)
    Returns:
       float array of midranks, aligned with the original order of x
    """
    # Coerce to ndarray so that x[J] below is always positional indexing.
    x = np.asarray(x)
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    # Builtin float is used instead of the np.float alias, which was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    T = np.zeros(N, dtype=float)
    i = 0
    while i < N:
        # Advance j to one past the end of the run of values equal to Z[i].
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        # Average of the 0-based positions i .. j-1.
        T[i:j] = 0.5*(i + j - 1)
        i = j
    T2 = np.empty(N, dtype=float)
    # Note(kazeevn) +1 is due to Python using 0-based indexing
    # instead of 1-based in the AUC formula in the paper
    T2[J] = T + 1
    return T2


def fastDeLong(predictions_sorted_transposed, label_1_count):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
       label_1_count: number of leading columns that belong to label "1"
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    # Builtin float is used instead of the np.float alias, which was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
    for r in range(k):
        # Midranks within the positives, within the negatives, and within
        # the pooled sample, computed separately for each classifier row.
        tx[r, :] = compute_midrank(positive_examples[r, :])
        ty[r, :] = compute_midrank(negative_examples[r, :])
        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
    v01 = (tz[:, :m] - tx[:, :]) / n
    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov


def calc_pvalue(aucs, sigma):
    """Computes log(10) of p-values.

    The z statistic is the absolute difference of the two AUCs divided by
    the square root of the variance of that difference; the result is the
    base-10 logarithm of the two-sided normal tail probability.

    Args:
       aucs: 1D array of AUCs
       sigma: AUC DeLong covariances
    Returns:
       log10(pvalue)
    """
    # Contrast row vector selecting "first AUC minus second AUC".
    contrast = np.array([[1, -1]])
    difference_variance = np.dot(np.dot(contrast, sigma), contrast.T)
    auc_gap = np.abs(np.diff(aucs))
    z = auc_gap / np.sqrt(difference_variance)
    # logsf is a natural log; dividing by ln(10) converts it to log10, and
    # adding log10(2) doubles the one-sided tail to make it two-sided.
    log10_one_sided = scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)
    return np.log10(2) + log10_one_sided


def compute_ground_truth_statistics(ground_truth):
    """Computes the ordering that puts label-1 examples first.

    Args:
       ground_truth: array-like containing only the labels 0 and 1 (both
          must be present); boolean and unsigned integer labels are accepted
    Returns:
       (order, label_1_count): an index array that places every label-1
       example before every label-0 example, and the number of label-1
       examples as a Python int
    """
    labels = np.asarray(ground_truth)
    assert np.array_equal(np.unique(labels), [0, 1])
    # Unary minus raises TypeError on boolean arrays and wraps around on
    # unsigned integers (which would sort the zeros first), so move those
    # dtypes to a signed type before negating.
    if labels.dtype.kind in 'bu':
        labels = labels.astype(np.int64)
    # Ascending sort of the negated labels puts the -1 entries (label 1) first.
    order = (-labels).argsort()
    label_1_count = int(labels.sum())
    return order, label_1_count


def delong_roc_variance(ground_truth, predictions):
    """
    Computes ROC AUC variance for a single set of predictions
    Args:
       ground_truth: np.array of 0 and 1
       predictions: np.array of floats of the probability of being class 1
    Returns:
       (AUC, DeLong covariance) as produced by fastDeLong for one classifier
    """
    positives_first, positive_count = compute_ground_truth_statistics(ground_truth)
    # Reorder so label-1 examples lead, and add a leading axis so the single
    # classifier is presented as a one-row matrix.
    single_row = predictions[np.newaxis, positives_first]
    auc_values, covariance = fastDeLong(single_row, positive_count)
    assert len(auc_values) == 1, "There is a bug in the code, please forward this to the developers"
    return auc_values[0], covariance


def delong_roc_test(ground_truth, predictions_one, predictions_two):
    """
    Computes log10(p-value) for hypothesis that two ROC AUCs are different
    Args:
       ground_truth: np.array of 0 and 1
       predictions_one: predictions of the first model,
          np.array of floats of the probability of being class 1
       predictions_two: predictions of the second model,
          np.array of floats of the probability of being class 1
    Returns:
       the value produced by calc_pvalue for the two models' AUCs
    """
    positives_first, positive_count = compute_ground_truth_statistics(ground_truth)
    # One row per model, columns reordered so label-1 examples come first.
    stacked = np.vstack((predictions_one, predictions_two))
    reordered = stacked[:, positives_first]
    auc_pair, covariance = fastDeLong(reordered, positive_count)
    return calc_pvalue(auc_pair, covariance)

0 comments on commit f7f2284

Please sign in to comment.