-
Notifications
You must be signed in to change notification settings - Fork 0
/
TTesting.py
101 lines (84 loc) · 3.21 KB
/
TTesting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as stats
import sklearn as sk
import sklearn.preprocessing as preprocessing
plt.ion()
plt.style.use('fivethirtyeight')
import MaclearnUtilities
from MaclearnUtilities import bhfdr, colttests, gramSchmidtSelect
import RestrictedData
xs = RestrictedData.xs
xnorms = RestrictedData.xnorms
annots = RestrictedData.annots
ys = RestrictedData.ys
ynums = RestrictedData.ynums
## -----------------------------------------------------------------
## ttest example (using equal variance t test)
## -----------------------------------------------------------------
shengene = xnorms['shen']['NM_008161']
shengene_nervous = shengene[ys['shen']]
shengene_other = shengene[~ys['shen']]
tout = stats.ttest_ind(shengene_nervous.values,
shengene_other.values,
equal_var = True)
stats.pearsonr(shengene, ynums['shen'])
## -----------------------------------------------------------------
## t tests for all genes in shen set
## -----------------------------------------------------------------
tShenAll = colttests(xnorms['shen'], ynums['shen'])
tShenAll['q'] = bhfdr(tShenAll.p)
## let's try something else...
xscShen = preprocessing.scale(xnorms['shen'])
xscShen = pd.DataFrame(xscShen,
index = xnorms['shen'].index,
columns = xnorms['shen'].columns)
xscShen.mean(axis=0)
xscShen.std(axis=0)
yscShen = preprocessing.scale(ynums['shen'].astype('float'))
tShenAll['pearson'] = np.dot(yscShen, xscShen) / len(yscShen)
## sort by p
tShenAll.sort_values('p', inplace=True)
## -----------------------------------------------------------------
## t tests for all genes in each set
## -----------------------------------------------------------------
def tTestPlus(x, y):
out = colttests(x, y)
out['q'] = bhfdr(out['p'])
out['pearson'] = np.dot(
preprocessing.scale(y.astype('float')),
preprocessing.scale(x)
) / len(y)
out.sort_values('p', inplace=True)
return out
tTestResults = {k : tTestPlus(xnorms[k], ynums[k]) for k in xnorms}
## -----------------------------------------------------------------
## generate fancy p vs pearson plot
## -----------------------------------------------------------------
plt.clf()
ax = plt.subplot(111)
colors = {
"montastier" : "black",
"patel" : "darkgray",
"hess" : "red",
"shen" : "darkred"
}
for s in tTestResults:
plotdata = pd.DataFrame({'gene' : tTestResults[s].index,
'set' : s + " (" + str(xnorms[s].shape[0]) + ")",
'p' : tTestResults[s].p,
'pearson' : tTestResults[s].pearson})
plotdata.sort_values("pearson", inplace=True)
plotdata.plot(x = "pearson",
y = "p",
color = colors[s],
logy = True,
label = s + " (" + str(xnorms[s].shape[0]) + ")",
ax = ax)
compResults = gramSchmidtSelect(x = xnorms['patel'],
y = ynums['patel'],
g = 'NAMPT')
compResults = compResults.loc[
compResults.abs().sort_values(ascending=False).index]