distructRerun.py
#!/usr/bin/env python3
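# Prepare and (optionally) run distruct on CLUMPP-processed ADMIXTURE output:
# copies major/minor cluster Q files, writes drawparams files for each K,
# collects CV error and log-likelihood values, and records run info as JSON.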
from distructComline import ComLine
from distruct import Distruct
from clumpp import Clumpp
from DefaultListOrderedDict import DefaultListOrderedDict
from pathlib import Path
import json
import os
import pandas
import sys
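# merge two dicts into a new dict; values from dict2 win on duplicate keys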
def mergeDicts(dict1, dict2):
newDict = {**dict1, **dict2}
return newDict
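# map cluster label k to the absolute path of the indivq file iq inside directory qd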
def locationsDict(qd, iq, k):
tempDict = dict()
cd = os.getcwd()
path = os.path.join(cd, qd, iq)
tempDict[k] = path
return tempDict
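# dump a dict to <admixDir>/<outfile> as indented JSON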
def jsonDump(admixDir, outfile, outputDict):
outPath = os.path.join(admixDir, outfile)
with open(outPath, 'w') as jf:
json.dump(outputDict, jf, indent=4)
def createList(r1, r2):
    # return the inclusive list of integers from r1 through r2
    # (the original returned a bare int when r1 == r2, which broke callers that iterate over the result)
    return list(range(r1, r2 + 1))
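# sort each indivq file by its q-value columns, ordered from greatest to least total ancestry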
def qfileSort(qfilesDict):
    for k in qfilesDict.keys():
        # build the list of q-value columns to sort on (columns 5 through 4+K)
        if "MinClust" in str(k):
            x = str(k).split(".")
            ncols = 4 + int(x[0])
        else:
            ncols = 4 + int(k)
        colList = createList(5, ncols)
        # read qfile into a dataframe
        df = pandas.read_csv(qfilesDict[k], sep=r"\s+", header=None)
        # sum each q-value column and mark every column for ascending sorting
        ascList = list()  # whether the sort on each column is ascending
        colDict = dict()  # key = column index, value = sum of that column
        for item in colList:
            colDict[item] = df[item].sum()
            ascList.append(True)
        # order the q-value columns from greatest total ancestry to least
        colSorted = dict(sorted(colDict.items(), key=lambda x: x[1], reverse=True))
        newlist = list(colSorted.keys())
        # conduct sorting by the ordered q-value columns
        df.sort_values(by=newlist, ascending=ascList, ignore_index=True, inplace=True)
        # write to file - this overwrites input files in best_
        df.to_csv(qfilesDict[k], sep=' ', header=False, index=False)
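# main workflow: for each K from mink to maxk, copy Q files, record run locations,
# gather CV and log-likelihood values, and write drawparams files for distruct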
def main():
input = ComLine(sys.argv[1:])
# The next six lines remove files to prevent duplication of data if distructRerun is executed multiple times on the same files.
cvpath = Path("cv_file.MajClust.txt")
llpath = Path("loglikelihood_file.MajClust.txt")
if cvpath.is_file():
os.remove(cvpath)
if llpath.is_file():
os.remove(llpath)
d = Distruct(input.args.directory, input.args.otl, input.args.colorbrew, input.args.pathtocolorbrew)
runsDict = DefaultListOrderedDict()
qfilesDict = dict()
for k in range(int(input.args.mink),int(input.args.maxk)+1):
drawp = "drawparams." + str(k)
outfile = "K" + str(k) + ".ps"
c = Clumpp(input.args.directory, str(k), input.args.ad)
popq,indivq,qdir = c.copyMajClustFiles() #return popq file, indivq file, and destination dir
#record new locations of Q files for major clusters
tempMajQfilesDict = locationsDict(qdir, indivq, str(k)) # record path to indivq for k
qfilesDict = mergeDicts(qfilesDict, tempMajQfilesDict)
popqList,indivqList = c.copyMinClustFiles()
tempMajDict = c.getMajorClusterRuns(input.args.majc)
runsDict = mergeDicts(runsDict, tempMajDict)
tempMinDict = c.getMinorClusterRuns()
runsDict = mergeDicts(runsDict, tempMinDict)
#record new locations of Q files for minor clusters
minClustKeys = list(tempMinDict.keys())#get keys from tempMinDict
tempMinQfilesDict = dict(zip(minClustKeys, indivqList))
for key, v in tempMinQfilesDict.items():
tempDict = locationsDict(qdir, v, key)
qfilesDict = mergeDicts(qfilesDict, tempDict)
# code to get CV and Loglikelihood values for major clusters must only be run once.
if(k == int(input.args.maxk)):
c.getMajorClusterCVvalues(input.args.majc)
c.getMajorClusterLoglikelihood(input.args.majc)
# code to get values for minor clusters operates on individual K values
c.getMinorClusterCVvalues()
c.getMinorClusterLoglikelihood()
d.copyFiles()
#drawparams for major clusters
d.writeDrawparams(drawp, popq, indivq, str(k), outfile, c.pops, c.inds, input.args.width)
#drawparams for minor clusters
for pq, iq in zip(popqList, indivqList):
temp = pq.split(".")
drawpMinC = drawp + "." + temp[-1]
outfileMinC = "K" + str(k) + "." + temp[-1] + ".ps"
d.writeDrawparams(drawpMinC, pq, iq, str(k), outfileMinC, c.pops, c.inds, input.args.width)
jsonDump(input.args.ad, "cvRuns.json", runsDict)
jsonDump(input.args.ad, "qfilePaths.json", qfilesDict)
# add file sorting here
# run distruct if option is used
if input.args.run==True:
d.runDistruct()
# sort indivq files by q values if -s/--sort option is used
if input.args.sort==True:
qfileSort(qfilesDict)
# run main function only when executed as a script
if __name__ == "__main__":
    main()
    raise SystemExit