-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathloki-build.py
312 lines (271 loc) · 11.4 KB
/
loki-build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
#!/usr/bin/env python
"""
This script provides functionality to update a knowledge database for genetic analysis using the LOKI database system.
It allows users to update the knowledge database by downloading and processing new data from specified sources. The script supports various options to control the update process, including caching downloaded data, updating only specific sources, finalizing the database, and optimizing the database.
Usage:
<code>python loki-build.py [options]</code>
##Options:
```
-h, --help
Show this help message and exit.
--version
Show version information.
-k, --knowledge <file>
Specify the knowledge database file to use.
-a, --archive <file>
Create or reuse and update a compressed archive of downloaded source data files.
--from-archive <file>
Specify an input source data archive to reuse but not update.
--to-archive <file>
Specify an output source data archive to create or replace but not reuse.
-d, --temp-directory <dir>
Specify a directory to use for temporary storage of downloaded or archived source data files.
-l, --list-sources [<source> ...]
List versions and options for specified source loaders, or list all available sources if none specified.
-c, --cache-only
Do not download any new source data files, only use what's available in the provided archive.
-u, --update [<source> ...]
Update the knowledge database file by downloading and processing new data from specified sources, or update from all available sources if none specified.
-U, --update-except [<source> ...]
Update the knowledge database file by downloading and processing new data from all available sources except those specified.
-o, --option <source> <optionstring>
Additional option(s) to pass to the specified source loader module, in the format 'option=value[,option2=value2[,...]]'.
-r, --force-update
Update all sources even if their source data has not changed since the last update.
-f, --finalize
Finalize the knowledge database file.
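--no-optimize
Do not optimize the knowledge database file after updating.
-v, --verbose
Print warnings and log messages (default).
-q, --quiet
Suppress warnings and log messages.
-t, --test-data
Load testing data only.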
```
"""
import argparse
import os
import posixpath
import shutil
import sys
import tarfile
import tempfile
from loki import loki_db
if __name__ == "__main__":
    # Command-line entry point. In order, this block:
    #   1. defines and parses the command-line options,
    #   2. attaches the knowledge database file,
    #   3. optionally lists source loaders and their options,
    #   4. optionally updates the database from the requested sources,
    #      using a temporary cache directory and optional tar archives,
    #   5. optionally finalizes and/or optimizes the database.
    version = "LOKI version %s" % (loki_db.Database.getVersionString())

    # define arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=version,
    )
    parser.add_argument('--version', action='version',
        version=version+"\n%s version %s\n%s version %s" % (
            loki_db.Database.getDatabaseDriverName(), loki_db.Database.getDatabaseDriverVersion(),
            loki_db.Database.getDatabaseInterfaceName(), loki_db.Database.getDatabaseInterfaceVersion()
        )
    )
    parser.add_argument('-k', '--knowledge', type=str, metavar='file', action='store', default=None,
        help="the knowledge database file to use"
    )
    parser.add_argument('-a', '--archive', type=str, metavar='file', action='store', default=None,
        help="create (or re-use and update) a compressed archive of downloaded source data files"
    )
    parser.add_argument('--from-archive', type=str, metavar='file', action='store', default=None,
        help="an input source data archive to re-use but not update"
    )
    parser.add_argument('--to-archive', type=str, metavar='file', action='store', default=None,
        help="an output source data archive to create (or replace) but not re-use"
    )
    parser.add_argument('-d', '--temp-directory', type=str, metavar='dir', action='store', default=None,
        help="a directory to use for temporary storage of downloaded or archived source data files (default: platform dependent)"
    )
    # parser.add_argument('-m', '--memory', type=str, metavar='size', default=None, #TODO
    #     help="the target amount of system memory to use (not exact, allow some margin); default: ~1gb"
    # )
    parser.add_argument('-l', '--list-sources', type=str, metavar='source', nargs='*', action='append', default=None,
        help="list versions and options for the specified source loaders, or if none or '+' are specified, list all available sources"
    )
    parser.add_argument('-c', '--cache-only', action='store_true',
        help="do not download any new source data files, only use what's available in the provided archive"
    )
    parser.add_argument('-u', '--update', type=str, metavar='source', nargs='*', action='append', default=None,
        help="update the knowledge database file by downloading and processing new data from the specified sources, "
            +"or if none or '+' are specified, from all available sources"
    )
    parser.add_argument('-U', '--update-except', type=str, metavar='source', nargs='*', action='append', default=None,
        help="update the knowledge database file by downloading and processing new data from all available sources EXCEPT those specified"
    )
    parser.add_argument('-o', '--option', type=str, metavar=('source','optionstring'), nargs=2, action='append', default=None,
        help="additional option(s) to pass to the specified source loader module, in the format 'option=value[,option2=value2[,...]]'"
    ) # e.g. --option dbsnp roles=yes
    parser.add_argument('-r', '--force-update', action='store_true',
        help="update all sources even if their source data has not changed since the last update"
    )
    parser.add_argument('-f', '--finalize', action='store_true',
        help="finalize the knowledge database file"
    )
    parser.add_argument('--no-optimize', action='store_true',
        help="do not optimize the knowledge database file after updating"
    )
    parser.add_argument('-v', '--verbose', action='store_true',
        help="print warnings and log messages (default)"
    )
    parser.add_argument('-q', '--quiet', action='store_true',
        help="suppress warnings and log messages"
    )
    parser.add_argument('-t', '--test-data', action='store_true',
        help="Load testing data only"
    )

    # if no arguments, print usage and exit
    if len(sys.argv) < 2:
        print(version)
        print()  # blank separator line (a bare `print` is a no-op in Python 3)
        parser.print_usage()
        print()
        print("Use -h for details.")
        sys.exit(2)

    # parse arguments
    args = parser.parse_args()

    # # parse memory allotment, if any
    # memLimit = 64*1024*1024 # default 64mb for sqlite (+ ~1gb for updater itself)
    # if args.memory:
    #     m = args.memory.upper()
    #     if m.endswith('B'):
    #         m = m[:-1]
    #     if m.endswith('T'):
    #         m = float(m[:-1]) * 1024 * 1024 * 1024 * 1024
    #     elif m.endswith('G'):
    #         m = float(m[:-1]) * 1024 * 1024 * 1024
    #     elif m.endswith('M'):
    #         m = float(m[:-1]) * 1024 * 1024
    #     elif m.endswith('K'):
    #         m = float(m[:-1]) * 1024
    #     else:
    #         m = float(m)
    #     m = long(m)
    #     if m < 1024*1024*1024:
    #         print "WARNING: ignoring '%s' memory allotment, the updater requires ~1gb at minimum" % args.memory
    #     else:
    #         print "using ~%1.1fMB of memory" % (m / (1024 * 1024))
    #         memLimit = max(memLimit, m - 1024*1024*1024)
    # #if args.memory

    # set $TMPDIR so sqlite will use it for vacuum etc.
    if args.temp_directory:
        if not os.path.isdir(args.temp_directory):
            # include the offending path in the message
            print("ERROR: '%s' is not a directory" % args.temp_directory)
            sys.exit(1)
        os.environ['TMPDIR'] = os.path.abspath(args.temp_directory)

    # instantiate database object
    db = loki_db.Database(testing=args.test_data, updating=True)
    # verbose unless --quiet was given without --verbose
    db.setVerbose(args.verbose or (not args.quiet))
    db.attachDatabaseFile(args.knowledge)

    # list sources?
    if args.list_sources is not None:
        # each -l occurrence contributes one list of names; merge them all
        srcSet = set().union(*args.list_sources)
        if (not srcSet) or ('+' in srcSet):
            print("available source loaders:")
            # an empty set is passed to the database to request all sources
            srcSet = set()
        else:
            print("source loader options:")
        moduleVersions = db.getSourceModuleVersions(srcSet)
        moduleOptions = db.getSourceModuleOptions(srcSet)
        for srcName in sorted(moduleOptions):
            print(" %s : %s" % (srcName, moduleVersions[srcName]))
            srcOptions = moduleOptions[srcName]
            if srcOptions:
                for srcOption in sorted(srcOptions):
                    print(" %s = %s" % (srcOption, srcOptions[srcOption]))
            elif srcSet:
                # only report missing options when specific sources were requested
                print(" <no options>")

    # pass options?
    userOptions = {}
    if args.option is not None:
        for srcName, optionString in args.option:
            srcOptions = userOptions.setdefault(srcName, {})
            for optString in optionString.split(','):
                opt, sep, val = optString.partition('=')
                if not sep:
                    # fail with a clear message instead of an unpacking traceback
                    print("ERROR: invalid option '%s' for source '%s'; expected 'option=value'" % (optString, srcName))
                    sys.exit(1)
                srcOptions[opt] = val
    # the updater receives None rather than an empty dict when no options were given
    userOptions = userOptions or None

    # parse requested update sources
    srcSet = None
    if args.update is not None:
        srcSet = set().union(*args.update)
    notSet = None
    if args.update_except is not None:
        notSet = set().union(*args.update_except)

    # update?
    updateOK = True
    if (srcSet is not None) or (notSet is not None):
        db.testDatabaseWriteable()
        if db.getDatabaseSetting('finalized', int):
            print("ERROR: cannot update a finalized database")
            sys.exit(1)
        if srcSet and '+' in srcSet:
            # '+' means "all sources", which is handled by the fallback below
            srcSet = set()
        srcSet = (srcSet or set(db.getSourceModules())) - (notSet or set())

        # create temp directory and unpack input archive, if any
        startDir = os.getcwd()
        fromArchive = args.from_archive or args.archive
        toArchive = args.to_archive or args.archive
        cacheDir = os.path.abspath(tempfile.mkdtemp(prefix='loki_update_cache.', dir=args.temp_directory))
        if args.temp_directory:
            print("using temporary directory '%s'" % cacheDir)

        def rmtree_error(func, path, exc):
            """Report a file that shutil.rmtree could not remove, without aborting cleanup."""
            print("WARNING: unable to remove temporary file '%s': %s\n" % (path, exc))

        # try/finally to make sure we clean up the cache dir at the end
        try:
            if fromArchive:
                if os.path.exists(fromArchive) and tarfile.is_tarfile(fromArchive):
                    print("unpacking archived source data files from '%s' ..." % fromArchive)
                    with tarfile.open(name=fromArchive, mode='r:*') as archive:
                        archive.errorlevel = 2
                        # the archive should only contain directories named after sources,
                        # so we can filter members by their normalized top-level directory
                        for member in archive:
                            srcName = posixpath.normpath(member.name).split('/', 1)[0]
                            # skips absolute paths (empty first component), hidden
                            # entries, and '..' paths that would leave the cache dir
                            if (not srcName) or srcName.startswith('.'):
                                continue
                            # if we're not writing an output archive, we only have to extract
                            # the directories for the sources we need
                            if (not toArchive) and (srcName not in srcSet):
                                continue
                            archive.extractall(cacheDir, [member])
                    #with archive
                    print("... OK")
                else:
                    print("source data archive '%s' not found, starting fresh" % fromArchive)
            #if fromArchive

            # run the update from inside the cache directory, and always return
            # to the starting directory so that relative archive paths resolve
            # correctly and the cache directory is not the cwd when it is removed
            os.chdir(cacheDir)
            try:
                updateOK = db.updateDatabase(srcSet, userOptions, args.cache_only, args.force_update)
            finally:
                os.chdir(startDir)

            # create output archive, if requested
            if toArchive and not args.cache_only:
                print("archiving source data files in '%s' ..." % toArchive)
                with tarfile.open(name=toArchive, mode='w:gz') as archive:
                    archive.errorlevel = 2
                    for filename in sorted(os.listdir(cacheDir)):
                        archive.add(os.path.join(cacheDir, filename), arcname=filename)
                print("... OK")
        finally:
            # clean up cache directory
            shutil.rmtree(cacheDir, onerror=rmtree_error)
    #update

    if args.knowledge:
        # finalize?
        if args.finalize and (not db.getDatabaseSetting('finalized', int)):
            if not updateOK:
                print("WARNING: errors encountered during knowledge database update; skipping finalization step")
            else:
                db.testDatabaseWriteable()
                db.finalizeDatabase()

        # optimize?
        if (not args.no_optimize) and (not db.getDatabaseSetting('optimized', int)):
            if not updateOK:
                print("WARNING: errors encountered during knowledge database update; skipping optimization step")
            else:
                db.testDatabaseWriteable()
                db.optimizeDatabase()
    #if knowledge
#__main__