Reviews: reviewer 2

1830416002 · Apr 8, 2021 · f7f2284 · f7f2284
1 parent 9452cb7
commit f7f2284
Show file tree

Hide file tree

Showing 11 changed files with 207 additions and 32 deletions.
diff --git a/analysis/figure_1/figure_1_d_auc_prc.py b/analysis/figure_1/figure_1_d_auc_prc.py
@@ -86,7 +86,7 @@ def plot_prc(ax, y_test, y_pred_score, save_dir, color, label=''):
 
 pnet_base_dir = join(base_dir , 'pnet/onsplit_average_reg_10_tanh_large_testing')
 df_pnet = pd.read_csv(join(pnet_base_dir, 'P-net_ALL_testing.csv'), sep=',', index_col=0, header=[0, 1])
-all_models_dict['P-net'] = df_pnet
+all_models_dict['P-NET'] = df_pnet
 n = len(models)+1
 
 def plot_prc_all(ax):

diff --git a/analysis/figure_1/figure_1_e_confusion_matrix.py b/analysis/figure_1/figure_1_e_confusion_matrix.py
@@ -129,10 +129,12 @@ def plot_confusion_matrix_all(ax):
     models_base_dir = join(base_dir, 'onsplit_average_reg_10_tanh_large_testing')
     filename = join(models_base_dir, 'P-net_ALL_testing.csv')
     df = pd.read_csv(filename, index_col=0)
-    df.pred = df.pred_scores > 0.5
+
+    # df.pred = df.pred_scores > 0.5
     df.head()
 
     y_t = df.y
+
     y_pred_test = df.pred
     cnf_matrix = confusion_matrix(y_t, y_pred_test)
     print cnf_matrix

diff --git a/analysis/figure_2/figure_2_a_pnet_vs_dense.py b/analysis/figure_2/figure_2_a_pnet_vs_dense.py
@@ -87,7 +87,7 @@ def plot_compaison(ax1, label, df_pnet, df_dense):
     # ax1.set_xlabel('Number of samples', fontdict=dict(weight='bold', fontsize=10))
     ax1.set_ylabel(label, fontdict=dict(family='Arial',weight='bold', fontsize=14))
 
-    ax1.legend(['P-net', 'Dense'], fontsize=8, loc= 'upper left')
+    ax1.legend(['P-NET', 'Dense'], fontsize=8, loc= 'upper left')
 
     ax1.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
     # ax1.grid()

diff --git a/analysis/figure_2/figure_2_b_external_validation.py b/analysis/figure_2/figure_2_b_external_validation.py
@@ -90,7 +90,7 @@ def plot_stacked(ax, filename, correct, wrong):
     top_strs = ['%1.2f' % i + '%' for i in top]
 
     table = ax.table(cellText=[bottom_strs, top_strs],
-                      rowLabels=['True Rate', 'Error Rate'],
+                      rowLabels=['True Ra   te', 'Error Rate'],
                       rowColours=selected_colors,
                       colLabels=labels,
                       loc='bottom', fontsize=14, cellLoc='center')
@@ -125,7 +125,8 @@ def plot_external_validation_all(ax):
 
 def plot_external_validation_matrix(ax):
     normalize = True
-    labels = np.array([['TR', 'TR'], ['ER ', 'ER']])
+    # labels = np.array([['TR', 'TR'], ['ER ', 'ER']])
+    labels = np.array([['TN', 'FP'], ['FN ', 'TP']])
     cmap = plt.cm.Reds
     cm = np.array([primary, mets])
 

diff --git a/data/data_access_test.py b/data/data_access_test.py
@@ -7,20 +7,25 @@
 data_params = {'id': 'ALL', 'type': 'prostate_paper',
              'params': {
                  # 'data_type': ['mut_important', 'cnv_del', 'cnv_amp', 'gene_expression'],
-                 'data_type': ['mut_important', 'cnv_del', 'cnv_amp'],
+                 # 'data_type': ['mut_important_plus_hotspots'],
+                 # 'data_type': ['mut_important'],
+                 # 'data_type': ['mut_important', 'cnv_del', 'cnv_amp'],
+                 # 'data_type': ['mut_important_plus_hotspots', 'cnv_del', 'cnv_amp'],
+                 'data_type': ['mut_hotspots', 'cnv_del', 'cnv_amp'],
                  # 'data_type': ['mut_important', 'cnv_del', 'fusion_genes'],
                  # 'data_type': ['mut_important',  'gene_expression'],
                 #  'data_type':  ['CNV_burden', 'TMB'],
                 #  'account_for_data_type' : ['fusions'],
                  'account_for_data_type' : None,
                  'drop_AR': False,
                  'cnv_levels': 3,
-                 'mut_binary': True,
+                 'mut_binary': False,
                  'balanced_data': False,
                  'combine_type': 'union',  # intersection
                  'use_coding_genes_only': True,
                  'selected_genes': selected_genes,
-                 'selected_samples': selected_samples,
+                 # 'selected_samples': selected_samples,
+                 'selected_samples': None,
                  'training_split': 0,
              }
              }
@@ -35,5 +40,12 @@
 
 print columns.levels
 print x_train.shape, x_test.shape, y_train.shape, y_test.shape
+print x_train.sum().sum()
+
+
+x,y, info, columns = data_adapter.get_data()
+x_df = pd.DataFrame(x, columns = columns, index=info )
+print x_df.shape
+print x_df.sum().sum()
 # print x_train_df['genomics'].shape
 # print x_train_df['account_for'].shape
diff --git a/data/gmt_reader.py b/data/gmt_reader.py
@@ -15,7 +15,7 @@ def load_data(self, filename, genes_col=1, pathway_col=0):
 
             data_list = gmt.readlines()
 
-            print data_list[0]
+            # print data_list[0]
             for row in data_list:
                 genes = row.strip().split('\t')
                 genes = [re.sub('_copy.*', '', g) for g in genes]
@@ -26,7 +26,7 @@ def load_data(self, filename, genes_col=1, pathway_col=0):
                     data_dict_list.append(dict)
 
         df = pd.DataFrame(data_dict_list)
-        print df.head()
+        # print df.head()
 
         return df
 
@@ -37,7 +37,7 @@ def load_data_dict(self, filename):
         with open(os.path.join(data_dir, filename)) as gmt:
             data_list = gmt.readlines()
 
-            print data_list[0]
+            # print data_list[0]
             for row in data_list:
                 genes = row.split('\t')
                 dict[genes[0]] = genes[2:]

diff --git a/data/prostate_paper/data_reader.py b/data/prostate_paper/data_reader.py
@@ -127,7 +127,7 @@ def load_data_type(data_type='gene', cnv_levels=5, cnv_filter_single_event=True,
                     x[x == -1.] = 0.0
                     x[x == -2.] = 1.0
                     x[x == 1.] = 0.0
-                    x[x == -2.] = 1.0
+                    x[x == 2.] = 1.0
                 else:
                     x[x < 0.] = -1.
                     x[x > 0.] = 1.

diff --git a/review/17-mutual_genes/mpschr-mutex-b1898f6/plot_upset.ipynb b/review/17-mutual_genes/mpschr-mutex-b1898f6/plot_upset.ipynb
diff --git a/review/4-TMB_and_CNV_burden/TMB and CNV burden.ipynb b/review/4-TMB_and_CNV_burden/TMB and CNV burden.ipynb
@@ -662,26 +662,8 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 76,
+   "cell_type": "raw",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'accuracy': 0.601010101010101,\n",
-       " 'auc': 0.7753736046418973,\n",
-       " 'aupr': 0.7084550561135279,\n",
-       " 'f1': 0.5647382920110192,\n",
-       " 'precision': 0.43897216274089934,\n",
-       " 'recall': 0.7915057915057915}"
-      ]
-     },
-     "execution_count": 76,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "evalualte(y_test, pred>=0.2, y_pred_score=pred)"
    ]

diff --git a/review/9-hotspot/compare_auc.py b/review/9-hotspot/compare_auc.py
@@ -113,7 +113,6 @@ def plot_auc_all(all_models_dict, ax):
     # sort based on area under prc
     n = len(all_models_dict.keys())
     colors = sns.color_palette(None, n)
-
     sorted_dict = sort_dict(all_models_dict)
     for i, k in enumerate(sorted_dict.keys()):
         print('model {} , auc= {}'.format(k, sorted_dict[k]))

diff --git a/utils/stats_utils_delong_xu.py b/utils/stats_utils_delong_xu.py
@@ -0,0 +1,146 @@
+import pandas as pd
+import numpy as np
+import scipy.stats
+'''
+MIT License
+
+Copyright (c) 2021 Nikita Kazeev
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+'''
+# AUC comparison adapted from
+# https://github.com/Netflix/vmaf/
+def compute_midrank(x):
+    """Computes midranks.
+    Each element gets its 1-based rank in the sorted order; elements with
+    equal values all receive the average of the ranks they jointly occupy.
+    Args:
+       x - a 1D numpy array (any 1D sequence is converted with np.asarray)
+    Returns:
+       float array of midranks, in the same order as x
+    """
+    x = np.asarray(x)
+    J = np.argsort(x)
+    Z = x[J]
+    N = len(x)
+    # Builtin float instead of the np.float alias, which NumPy 1.24 removed.
+    T = np.zeros(N, dtype=float)
+    i = 0
+    while i < N:
+        # [i, j) is the run of tied values starting at sorted position i.
+        j = i
+        while j < N and Z[j] == Z[i]:
+            j += 1
+        # Mean of the 0-based positions i .. j-1.
+        T[i:j] = 0.5*(i + j - 1)
+        i = j
+    T2 = np.empty(N, dtype=float)
+    # Note(kazeevn) +1 is due to Python using 0-based indexing
+    # instead of 1-based in the AUC formula in the paper
+    # Scatter through J so the ranks line up with the original order of x.
+    T2[J] = T + 1
+    return T2
+
+
+def fastDeLong(predictions_sorted_transposed, label_1_count):
+    """
+    The fast version of DeLong's method for computing the covariance of
+    unadjusted AUC.
+    Args:
+       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
+          sorted such as the examples with label "1" are first
+       label_1_count: number of leading columns that hold the label "1"
+          examples; the remaining columns are treated as label "0"
+    Returns:
+       (AUC value, DeLong covariance): a 1D array with one AUC per
+       classifier, and the result of np.cov over the classifiers' structural
+       components (a square matrix when there are several classifiers)
+    Reference:
+     @article{sun2014fast,
+       title={Fast Implementation of DeLong's Algorithm for
+              Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
+       author={Xu Sun and Weichao Xu},
+       journal={IEEE Signal Processing Letters},
+       volume={21},
+       number={11},
+       pages={1389--1393},
+       year={2014},
+       publisher={IEEE}
+     }
+    """
+    # Short variables are named as they are in the paper
+    m = label_1_count
+    n = predictions_sorted_transposed.shape[1] - m
+    positive_examples = predictions_sorted_transposed[:, :m]
+    negative_examples = predictions_sorted_transposed[:, m:]
+    k = predictions_sorted_transposed.shape[0]
+
+    # Builtin float instead of the np.float alias, which NumPy 1.24 removed.
+    tx = np.empty([k, m], dtype=float)
+    ty = np.empty([k, n], dtype=float)
+    tz = np.empty([k, m + n], dtype=float)
+    for r in range(k):
+        # Midranks within the positives, within the negatives, and over all
+        # examples, computed separately for every classifier.
+        tx[r, :] = compute_midrank(positive_examples[r, :])
+        ty[r, :] = compute_midrank(negative_examples[r, :])
+        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
+    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
+    # Per-example components for the positive (v01) and negative (v10) sets.
+    v01 = (tz[:, :m] - tx[:, :]) / n
+    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
+    sx = np.cov(v01)
+    sy = np.cov(v10)
+    delongcov = sx / m + sy / n
+    return aucs, delongcov
+
+
+def calc_pvalue(aucs, sigma):
+    """Computes log10 of the two-sided p-value for the difference of two AUCs.
+    Args:
+       aucs: 1D array holding the two AUCs
+       sigma: 2x2 DeLong covariance matrix of those AUCs
+    Returns:
+       log10(pvalue), as a 2D array of shape (1, 1)
+    """
+    # Contrast vector selecting "first AUC minus second AUC".
+    contrast = np.array([[1, -1]])
+    variance_of_difference = np.dot(np.dot(contrast, sigma), contrast.T)
+    z = np.abs(np.diff(aucs)) / np.sqrt(variance_of_difference)
+    # logsf is a natural log; dividing by ln(10) converts it to base 10, and
+    # adding log10(2) doubles the one-sided tail probability.
+    natural_log_tail = scipy.stats.norm.logsf(z, loc=0, scale=1)
+    return np.log10(2) + natural_log_tail / np.log(10)
+
+
+def compute_ground_truth_statistics(ground_truth):
+    """
+    Computes the example ordering and positive count used by the DeLong code.
+    Args:
+       ground_truth: array of 0 and 1 labels; both labels must be present
+    Returns:
+       (order, label_1_count): indices that put the label-1 examples first,
+       and the number of label-1 examples
+    """
+    labels = np.asarray(ground_truth)
+    assert np.array_equal(np.unique(labels), [0, 1]), \
+        "ground_truth must contain exactly the labels 0 and 1"
+    # Negate as signed integers: unary minus raises on boolean arrays and
+    # wraps around on unsigned ones, which would put label 0 first.
+    labels = labels.astype(int)
+    order = (-labels).argsort()
+    label_1_count = int(labels.sum())
+    return order, label_1_count
+
+
+def delong_roc_variance(ground_truth, predictions):
+    """
+    Computes the ROC AUC and its DeLong variance for one set of predictions
+    Args:
+       ground_truth: np.array of 0 and 1
+       predictions: np.array of floats of the probability of being class 1
+    Returns:
+       (AUC, DeLong covariance) as produced by fastDeLong for one classifier
+    """
+    positives_first, n_positive = compute_ground_truth_statistics(ground_truth)
+    # Reorder so label-1 examples lead, and add the classifier axis that
+    # fastDeLong expects.
+    single_row = predictions[np.newaxis, positives_first]
+    auc_values, covariance = fastDeLong(single_row, n_positive)
+    assert len(auc_values) == 1, "There is a bug in the code, please forward this to the developers"
+    return auc_values[0], covariance
+
+
+def delong_roc_test(ground_truth, predictions_one, predictions_two):
+    """
+    Computes log10(p-value) for hypothesis that two ROC AUCs are different
+    Args:
+       ground_truth: np.array of 0 and 1
+       predictions_one: predictions of the first model,
+          np.array of floats of the probability of being class 1
+       predictions_two: predictions of the second model,
+          np.array of floats of the probability of being class 1
+    Returns:
+       the value returned by calc_pvalue for the two models' AUCs
+    """
+    positives_first, n_positive = compute_ground_truth_statistics(ground_truth)
+    # One row per model, columns reordered so label-1 examples come first.
+    stacked = np.vstack((predictions_one, predictions_two))
+    both_models_sorted = stacked[:, positives_first]
+    auc_values, covariance = fastDeLong(both_models_sorted, n_positive)
+    return calc_pvalue(auc_values, covariance)