-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataprepare.py
64 lines (53 loc) · 1.65 KB
/
dataprepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import numpy as np
def GSEandGPLtoUSEmatrix():
    """Build "hccmatrix2.csv": the matrix rows whose annotation is in node.csv.

    Reads three files from the current working directory:

    * "GSE6764_series_matrix.txt" -- tab-separated matrix; its first column
      is used as the row index.
    * "GPL570.txt" -- tab-separated annotation table; column 0 is treated as
      the row ID and column 1 as the gene symbol (positional, as before).
      NOTE(review): assumes column 1 is the "Gene Symbol" column -- confirm.
    * "node.csv" -- header-less CSV; the unique values of its second column
      form the list of wanted gene symbols.

    Rows of the matrix whose ID maps to a wanted gene symbol are kept (in
    sorted ID order), a "GeneSymbol" column is inserted at position 0, and
    the result is written to "hccmatrix2.csv".

    Returns:
        None. The result is written to disk; progress is printed to stdout.
    """
    matrix = pd.read_table("GSE6764_series_matrix.txt")
    matrix.index = matrix.iloc[:, 0]
    GPL570 = pd.read_table("GPL570.txt")
    gene = pd.read_csv("node.csv", header=None)
    genelist = np.unique(gene.iloc[:, 1])
    print(matrix.head())
    print(genelist)
    print(len(genelist))

    # Vectorised membership test instead of a per-row iloc loop.
    useID = GPL570[GPL570.iloc[:, 1].isin(genelist)]
    print(useID.iloc[:, 0])

    # Explicit ID -> symbol lookup. The rows below are selected in *sorted*
    # ID order, so the symbols must be looked up per row rather than copied
    # over in annotation-file order (which silently mislabelled rows).
    id_to_symbol = dict(zip(useID.iloc[:, 0], useID.iloc[:, 1]))

    wanted_ids = pd.Index(np.unique(useID.iloc[:, 0]))
    # Skip annotated IDs that have no row in the matrix; .loc would raise
    # KeyError for them.
    wanted_ids = wanted_ids[wanted_ids.isin(matrix.index)]

    hccmatrix = matrix.loc[wanted_ids]
    hccmatrix.insert(
        0, "GeneSymbol", [id_to_symbol[row_id] for row_id in hccmatrix.index]
    )
    print(hccmatrix.head())
    hccmatrix.to_csv("hccmatrix2.csv")
# Uncomment to (re)generate "hccmatrix2.csv" before running the steps below.
#GSEandGPLtoUSEmatrix()

# Load the per-row matrix and key its rows by gene symbol, so that
# hccmatrix.loc[symbol] returns every row annotated with that symbol.
hccmatrix = pd.read_csv("hccmatrix2.csv")
hccmatrix.index = hccmatrix["GeneSymbol"].to_numpy()

# Sorted unique symbols. Missing values are dropped first: read_csv turns
# empty cells into NaN, and np.unique cannot sort NaN mixed with strings.
gene_symbols = np.unique(hccmatrix["GeneSymbol"].dropna())
usegenelist = [symbol for symbol in gene_symbols if symbol != ""]

# Debug output for one gene of interest; skipped when it is not present.
numeric_part = hccmatrix.select_dtypes(include="number")
if "ABCB1" in numeric_part.index:
    print(numeric_part.loc["ABCB1"].mean())

# Collapse to one row per gene symbol: symbols with several rows are
# replaced by the column-wise mean of their numeric columns, symbols with a
# single row are kept unchanged.
collapsed_rows = []
for symbol in usegenelist:
    print(symbol)
    rows = hccmatrix.loc[symbol]
    print(rows.shape[0])
    # .loc returns a DataFrame for a repeated label and a Series for a unique
    # one; test that directly instead of comparing against a column count.
    if isinstance(rows, pd.DataFrame):
        # numeric_only=True: string columns cannot be averaged.
        collapsed_rows.append(rows.mean(numeric_only=True))
    else:
        collapsed_rows.append(rows)

usematrix = pd.DataFrame(collapsed_rows)
usematrix.index = usegenelist
usematrix["GeneSymbol"] = usegenelist
print(usematrix)
usematrix.to_csv("usematrix2.csv")