-
Notifications
You must be signed in to change notification settings - Fork 1
/
step2.py
127 lines (102 loc) · 3.3 KB
/
step2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""
Generate individual files and chr22.json
Running this code need following command:
mkdir individual
mkdir promoters
"""
import sys
import os
import vcf
import pandas as pd
import numpy as np
import csv
import time
import json
def subfinder(mylist, pattern):
for i in range(len(mylist)):
if mylist[i] == pattern[0] and mylist[i:i + len(pattern)] == pattern:
return i, i + len(pattern)
def promoter_var_idx(p_idx, promoter, var_idx):
pmt_var = promoter[p_idx] # which promoter
p_start, p_end = subfinder(var_idx, pmt_var)
return p_start, p_end
# read used ALS variants in each .vcf file
with open('var_in_file.json') as json_data:
var_file_dic = json.load(json_data)
# read promoter table
with open('promoter1.csv', 'rb') as f:
reader = csv.reader(f)
promoter = list(reader)
# read used variant posotions
with open('all_ALS_var.txt','r') as f:
var_idx = [i.replace('\n','') for i in f]
# mapping dictionary
var_num_dict = {"0/0":'0',
"0/1":'1',
"1/0":'1',
"1/1":'2',
"./.":'-1'}
# IDS:
labels_file = 'labes.csv'
labels_df = pd.read_csv(labels_file,index_col=0)
ids = labels_df.FID.tolist()
# create individual files
print 'Create individual files'
for ind in ids:
id_file_name = 'individual/'+str(ind)+'.txt'
id_file = open(id_file_name, 'w')
id_file.write('')
## path to vcf files
#files = os.listdir('./chr22')
#files.sort()
#files = files[1:]
files = ['xaa.vcf']
print 'Start writing....'
num_vcf = len(files)
num_vcf_batch = len(files)*0.05
vcf_i = 0
for f_str in files:
if vcf_i % num_vcf_batch == 0:
print vcf_i / num_vcf*100., '%....'
file_name = '' + f_str
with open(file_name, 'r') as f:
lines = [l.replace('\n', '').split("\t") for l in f if not l.startswith('##')]
info = lines[2:]
# create dataframe
v_df = pd.DataFrame.from_records(info, columns=lines[0])
use_less_l = ["ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
v_df.drop(use_less_l, axis=1, inplace=True)
# 0/0 ==> 0 i.e.
samples = lines[0][9:]
replace_dict = {}
for i in samples:
replace_dict[i] = var_num_dict
v_df_r = v_df.replace(to_replace=replace_dict)
var_df = v_df_r[v_df_r['POS'].isin(var_file_dic[f_str])]
# write to file
for ind in ids:
# the id in vcf file is 'LP6008192-DNA_E10_LP6008192-DNA_E10' for 'LP6008192-DNA_E10'
new_ind = str(ind)+'_'+str(ind)
if new_ind in var_df.columns.tolist()[2:]:
info = var_df[new_ind].tolist()
id_file_name = 'individual/' + str(ind) + '.txt'
id_file = open(id_file_name, 'a')
for item in info:
id_file.write("%s\n" % item)
id_file.close()
else:
print ind
vcf_i += 1.
print "100.0%... Done, generated all used ALS variants for each individual"
num_pro = 0
print "Generating No.",num_pro," promoter in chr",22
p_start,p_end = promoter_var_idx(0,promoter,var_idx=var_idx)
promoter_ind = {}
for ind in ids:
indiv_file = 'individual/'+str(ind)+'.txt'
with open(indiv_file,'r') as f:
ind_v = [i.replace('\n','') for i in f]
promo = ind_v[p_start:p_end]
promoter_ind[ind] = promo
with open('promoters/chr22.json', 'w') as fp:
json.dump(promoter_ind, fp)