-
Notifications
You must be signed in to change notification settings - Fork 0
/
load_ldsc_out.py
62 lines (56 loc) · 2.03 KB
/
load_ldsc_out.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import pickle
import gzip
import glob
import re
import numpy as np
import pandas as pd
# import scipy.stats
def add_data(res_path, data_lst):
fields = ["CT", "Prop._SNPs", "Prop._h2", "Prop._h2_std_error", "Enrichment", "Enrichment_std_error", "Enrichment_p"]
with open(res_path) as res_file:
header = next(res_file)
cols = {val: ind for ind, val in enumerate(header.strip().split())}
# print(cols) ####
for line in res_file:
vals = line.strip().split()
info = vals[cols["GWAS"]]
study, paramstr = info.split(".", 1)
threshold_match = re.search(r"joint_sc_(.*?)(_|\.results)", paramstr)
if threshold_match is None:
threshold = 0.1
else:
threshold = float(threshold_match.group(1))
window_match = re.search(r"pm(.*?)kb", paramstr)
if window_match is None:
window = 0
else:
window = int(window_match.group(1))
print(paramstr, threshold, window) ####
data_lst.append([study, threshold, window] + [vals[cols[i]] for i in fields])
def load_ldsc_out(name, res_dir_base, out_dir):
res_dir = os.path.join(res_dir_base, name)
data_lst = []
for i in glob.glob(os.path.join(res_dir, "*.tab")):
res_path = os.path.join(res_dir, i)
add_data(res_path, data_lst)
cols = [
"Study",
"Threshold",
"Window",
"Cluster",
"SNPs",
"H2",
"H2StdError",
"Enrichment",
"EnrichmentStdError",
"EnrichmentP"
]
df = pd.DataFrame(data_lst, columns=cols)
csv_path = os.path.join(out_dir, f"{name}.csv")
df.to_csv(csv_path, index=False, na_rep="None")
if __name__ == '__main__':
res_dir_base = "/agusevlab/awang/sc_kellis/ldsc_res/"
out_dir = os.path.join(res_dir_base, "agg")
# load_ldsc_out("results_chisq", res_dir_base, out_dir)
load_ldsc_out("results_total_expression", res_dir_base, out_dir)