-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpreprocess.py
95 lines (75 loc) · 2.92 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import gzip
import pdb
import time
import numpy as np
def load_test_statistics(filename):
    """Load per-locus, per-variant test statistics from a text file.

    Every non-blank line is split on whitespace and its first four fields
    are read as ``locus variant effect_size standard_error``; any further
    fields are ignored.

    Parameters
    ----------
    filename : str
        Path of the file to read. Names ending in ``.gz`` or ``.zip``
        (case-insensitive) are opened with ``gzip.open``; everything else
        is opened as a plain file.

    Returns
    -------
    dict
        ``{locus: {variant: np.array([effect_size, standard_error])}}``.
        A (locus, variant) pair that occurs more than once keeps the
        values from its last line.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than four fields.
    ValueError
        If the third or fourth field cannot be converted to ``float``.
    """
    teststats = dict()

    # Compression is inferred from the extension only.
    # NOTE(review): '.zip' is also routed to gzip.open, which reads the gzip
    # format, not ZIP archives -- confirm what such inputs really contain.
    if filename.lower().endswith(('.gz', '.zip')):
        opener = gzip.open
    else:
        opener = open

    # The with-block closes the handle even when a malformed row raises.
    with opener(filename, 'r') as handle:
        # read in effect sizes and standard errors
        for line in handle:
            # gzip.open(..., 'r') yields bytes on Python 3; decode so keys
            # are str regardless of whether the input was compressed.
            if not isinstance(line, str):
                line = line.decode()
            row = line.strip().split()
            if not row:
                # blank line (e.g. trailing newline at end of file)
                continue
            stats = np.array([float(row[2]), float(row[3])])
            teststats.setdefault(row[0], dict())[row[1]] = stats

    return teststats
def load_genomic_annotations(filenames):
    """Load labelled genomic intervals from one or more text files.

    Every non-blank line is split on whitespace and its first four fields
    are read as ``chrom start end label``; any further fields are ignored.
    Intervals from all files are pooled per label and chromosome.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the files to read. Names ending in ``.gz`` or ``.zip``
        (case-insensitive) are opened with ``gzip.open``; everything else
        is opened as a plain file.

    Returns
    -------
    dict
        ``{label: {chrom: array}}`` where ``array`` is a 2-D integer numpy
        array with one ``[start, end]`` row per interval, ordered by
        ``start``.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than four fields.
    ValueError
        If the second or third field cannot be converted to ``int``.
    """
    annotations = dict()

    for filename in filenames:
        # Compression is inferred from the extension only.
        # NOTE(review): '.zip' is also routed to gzip.open, which reads the
        # gzip format, not ZIP archives -- confirm what such inputs contain.
        if filename.lower().endswith(('.gz', '.zip')):
            opener = gzip.open
        else:
            opener = open

        # The with-block closes the handle even when a malformed row raises.
        with opener(filename, 'r') as handle:
            # read in annotation coordinates and labels
            for line in handle:
                # gzip.open(..., 'r') yields bytes on Python 3; decode so
                # labels and chromosome names are always str.
                if not isinstance(line, str):
                    line = line.decode()
                row = line.strip().split()
                if not row:
                    # blank line (e.g. trailing newline at end of file)
                    continue
                chrom, label = row[0], row[3]
                interval = [int(row[1]), int(row[2])]
                by_chrom = annotations.setdefault(label, dict())
                by_chrom.setdefault(chrom, []).append(interval)

    # order annotations by location (start coordinate)
    for by_chrom in annotations.values():
        for chrom in by_chrom:
            values = np.array(by_chrom[chrom])
            order = np.argsort(values[:, 0])
            by_chrom[chrom] = values[order, :]

    return annotations
def match_annotations_to_variants(test_statistics, genomic_annotations):
    """List, for every variant, the annotation labels that overlap it.

    Parameters
    ----------
    test_statistics : dict
        ``{locus: {variant: ...}}``. Only the variant keys are used; each
        must have the form ``"<chrom>.<position>"`` with an integer
        position. The id is split at its last ``.``, so the chromosome
        part may itself contain dots.
    genomic_annotations : dict
        ``{label: {chrom: array}}`` where ``array`` is a 2-D numpy array
        whose column 0 holds interval starts and column 1 interval ends.

    Returns
    -------
    dict
        ``{"<chrom>.<position>": [label, ...]}`` with one entry per unique
        (chrom, position) found in ``test_statistics``. A label is listed
        when ``start <= position < end`` holds for at least one of its
        intervals on that chromosome; variants with no overlap map to an
        empty list. Keys are rebuilt from the parsed integer position.

    Raises
    ------
    ValueError
        If a variant id has no ``.`` or its position is not an integer.
    """
    # get all variants, grouped by chromosome
    variants = dict()
    for stats in test_statistics.values():
        for variant in stats:
            chrom, position = variant.rsplit('.', 1)
            variants.setdefault(chrom, []).append(int(position))

    # initialize annotation dictionary with an empty label list per variant
    variant_annotations = dict()
    for chrom in variants:
        # np.unique both de-duplicates and sorts the positions
        variants[chrom] = np.unique(variants[chrom])
        for pos in variants[chrom]:
            variant_annotations['%s.%d' % (chrom, pos)] = []

    for label, values in genomic_annotations.items():
        for chrom, intervals in values.items():
            # annotations on chromosomes without any variant are irrelevant
            if chrom not in variants:
                continue
            starts = intervals[:, 0]
            ends = intervals[:, 1]
            for pos in variants[chrom]:
                # half-open interval test: start <= pos < end
                if np.any(np.logical_and(pos >= starts, pos < ends)):
                    variant_annotations['%s.%d' % (chrom, pos)].append(label)

    return variant_annotations