-
Notifications
You must be signed in to change notification settings - Fork 1
/
svm.py
115 lines (88 loc) · 3.32 KB
/
svm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import json
import csv
import numpy as np
import os
import pandas as pd
import random
import tensorflow as tf
from functions import quan_detector, most_repeared_promoter,dataset
from sklearn.metrics import confusion_matrix
from sklearn import datasets, linear_model,svm
from sklearn.metrics import mean_squared_error, r2_score
# Seed every random number generator the script uses so that repeated runs
# start from the same state.
random.seed(42)
np.random.seed(42)
tf.set_random_seed(42)

# Column names of the results CSV, grouped the same way the result rows are
# assembled further down: zero counts, frequency counts, totals, confusion
# matrix with accuracy, and the accuracy-above-one-half flag.
out_put_header = (
    ['Promoter region', 'Posotive_zeros', 'Negative_zeros', 'Sum_zeros']
    + ['Positive_freq', 'Negative_freq', 'Sum_freq']
    + ['Sum_all', 'Percent_all', 'Vector_freq']
    + ["True positive", "False positive", "True negative", "False negative",
       "Accuracy"]
    + ['>50%']
)
output_file_name = 'output_svm.csv'

# (Re)create the results file so that it holds nothing but the header row.
with open(output_file_name, 'w') as header_file:
    csv.writer(header_file).writerow(out_put_header)

# Labels table: the first CSV column becomes the index; the FID column
# supplies the individual ids in file order.
labels_file = 'labes.csv'
labels_df = pd.read_csv(labels_file, index_col=0)
ids_csv = labels_df['FID'].tolist()
# Row indices selecting the individuals used in the experiment, one integer
# per line of train_id.txt.  The file does not depend on the promoter, so it
# is read once, and the context manager guarantees the handle is closed.
with open("train_id.txt", 'r') as train_id_file:
    t_idx = [int(line.strip()) for line in train_id_file]

# One-hot encoding of the phenotype codes stored in the labels table.
lab_num = {1: [1, 0],  # positive
           2: [0, 1]}  # negative

# The phenotypes are identical for every promoter: encode and select once.
dataset_Y = np.array([lab_num[i] for i in labels_df.Pheno.tolist()])[t_idx]

promoters_list = range(100)
for promoter_num in promoters_list:
    print(promoter_num)
    promoter_file = 'promoters/chr22_' + str(promoter_num) + '.json'

    # The promoter JSON maps an individual id (as a string) to a sequence of
    # values that are each converted to int.
    with open(promoter_file) as json_data:
        ind_var = json.load(json_data)

    # One integer vector per individual, in the order of the labels table.
    # A list comprehension (rather than map) yields real lists on Python 3 too.
    var_num = [[int(v) for v in ind_var[str(i)]] for i in ids_csv]
    dataset_X = np.array(var_num)[t_idx]
    N = len(dataset_X)

    # Repeat information, computed by the project helpers.
    per_zeros, p_zeros, n_zeros = quan_detector(dataset_X, dataset_Y)
    count_zeros = p_zeros + n_zeros  # sum of individuals without any variants
    most_vector, max_count, count_vector = most_repeared_promoter(
        dataset_X, dataset_Y)
    _, p_count, n_count = count_vector
    # Positions at which the most repeated vector is not '0'.
    vart_pos = [i for i, v in enumerate(most_vector) if v != '0']

    # Re-seed before every split/fit so each promoter is evaluated from the
    # same random state.
    np.random.seed(42)
    tf.set_random_seed(42)
    random.seed(42)

    # Split with the project helper (test_ratio=0.1), then collapse the
    # one-hot labels to class indices: Pheno 1 -> 0, Pheno 2 -> 1.
    x_train, y_train, x_test, y_test = dataset(
        dataset_X, dataset_Y, test_ratio=0.1)
    y_train = np.argmax(y_train, axis=1)
    y_test = np.argmax(y_test, axis=1)

    # RBF-kernel support vector classifier.
    lsvm = svm.SVC(kernel='rbf', gamma=0.7, C=1.0)
    lsvm.fit(x_train, y_train)
    y_pred = lsvm.predict(x_test)

    # labels=[0, 1] forces a 2x2 matrix even when the test split or the
    # predictions contain a single class; without it the four-way unpacking
    # below raises ValueError in that case.
    # NOTE(review): class index 1 corresponds to Pheno 2, commented above as
    # "negative", so the "positive" counts here refer to Pheno 2 -- confirm
    # this matches the intended meaning of the output columns.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    # The float factor keeps this a true division under Python 2 as well.
    acc = (tp + tn) * 1. / (tp + fp + tn + fn)

    info = ['promoter ' + str(promoter_num), p_zeros, n_zeros, count_zeros,
            p_count, n_count, max_count,
            max_count + count_zeros, (max_count + count_zeros) * 1. / N,
            vart_pos,
            tp, fp, tn, fn, acc, acc > 0.5]

    # Append one row per promoter; reopening per iteration means every
    # finished promoter is already on disk if a later one fails.
    with open(output_file_name, 'a') as f:
        csv.writer(f).writerow(info)

print("Done")