-
Notifications
You must be signed in to change notification settings - Fork 4
/
variants.py
691 lines (562 loc) · 28.9 KB
/
variants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
""" generate variants from pdb files """
import argparse
import hashlib
import itertools
import math
import time
from os.path import join, basename, isfile
from collections import Counter
import random
from typing import Optional, Sequence, Union
import pandas as pd
from Bio.SeqIO.PdbIO import AtomIterator
from Bio.PDB import PDBParser
import numpy as np
import sqlalchemy as sqla
# silence warnings when reading PDB files generated from Rosetta (which have comments which aren't parsed by my
# approach for getting sequences from PDB files w/ Bio.SeqIO...
import warnings
import utils
warnings.filterwarnings("ignore", message="Ignoring unrecognized record ")
def gen_all_variants(base_seq, num_subs, chars, seq_idxs):
""" generates all possible variants of base_seq with the given number of substitutions
using the given available chars and valid sequence idxs for substitution"""
# positions is a tuple of (pos(1), pos(2), ... pos(num_subs))
for positions in itertools.combinations(seq_idxs, num_subs):
# new_aas is a tuple of (aa(1), aa(2), ... aa(num_subs))
for new_aas in itertools.product(chars, repeat=num_subs):
if np.all([base_seq[pos] != new_aa for pos, new_aa in zip(positions, new_aas)]):
# note the pos+1 for 1-based indexing
variant = ",".join(["{}{}{}".format(base_seq[pos], pos+1, new_aa) for pos, new_aa in zip(positions, new_aas)])
# should be in sorted order already, but just in case, sort it here again
variant = utils.sort_variant_mutations(variant)
yield variant
def gen_sample(base_seq, num_mutants, num_subs, chars, seq_idxs, rng):
""" generates a random sample of variants with the given number of substitutions """
# using a set and a list to maintain the order
# this is slower and uses 2x the memory, but the final variant list will be orderd
mutants = set()
mutant_list = []
for mut_num in range(num_mutants):
found_valid_mutant = False
while not found_valid_mutant:
# choose the positions to mutate and sort them in ascending order
positions = rng.choice(seq_idxs, num_subs, replace=False)
positions.sort()
# choose new amino acids for each of the selected positions
subs = []
for pos in positions:
base_aa = base_seq[pos]
new_aa = rng.choice([c for c in chars if c != base_aa])
# note the pos+1 for 1-based indexing
sub = "{}{}{}".format(base_aa, pos+1, new_aa)
subs.append(sub)
# generate the mutant string
mutant = ",".join(subs)
if mutant not in mutants:
found_valid_mutant = True
mutants.add(mutant)
mutant_list.append(mutant)
# this variant list should already be in sorted order (we sort the positions above)
# but just in case, sort it again here. we need variants in sorted order to avoid accidental dupes.
mutant_list = utils.sort_variant_mutations(mutant_list)
return mutant_list
def max_possible_variants(seq_len, num_subs, num_chars):
return math.comb(seq_len, num_subs) * ((num_chars - 1) ** num_subs)
def distribute_into_buckets(n, num_buckets, bucket_sizes):
""" distributes n items evenly into num_buckets with sizes bucket_sizes
i'm sure there's a better way to do this, but this works and was quick to figure out """
# first check if the buckets have enough capacity to store all n items
if n > sum(bucket_sizes):
raise ValueError("buckets do not have enough capacity to store all n items")
# all buckets start with zero items
buckets = [0] * num_buckets
# keep track of which buckets have free space
free_bucket_idxs = list(range(num_buckets))
while True:
# remaining items left to distribute
remaining = n - sum(buckets)
# if we evenly split remaining number of items, how much goes into each bucket?
amount_per_bucket = remaining // len(free_bucket_idxs)
if amount_per_bucket == 0:
# no more to distribute (either distributed all of them, or an odd number left)
break
# can the free buckets accommodate extra items?
has_capacity = [(buckets[bi] + amount_per_bucket) <= bucket_sizes[bi] for bi in free_bucket_idxs]
if all(has_capacity):
# there is capacity, so distribute those items
for bi in free_bucket_idxs:
buckets[bi] += amount_per_bucket
else:
# one of the buckets doesn't have the capacity
fbi = has_capacity.index(False) # index into free_bucket_idxs (not buckets)
bi = free_bucket_idxs[fbi] # index into buckets
buckets[bi] = bucket_sizes[bi]
free_bucket_idxs.remove(bi)
# distribute any stragglers
num_stragglers = n - sum(buckets)
free_bucket_idxs = [i for i in range(num_buckets) if buckets[i] < bucket_sizes[i]]
# at most there will be 1 extra item to place in each free bucket so no need to loop this
for bi in free_bucket_idxs:
if num_stragglers > 0:
buckets[bi] += 1
num_stragglers -= 1
# final sanity check
over_capacity = [buckets[bi] > bucket_sizes[bi] for bi in range(num_buckets)]
if any(over_capacity):
raise AssertionError("There are buckets over capacity but that should be impossible. This function"
" needs to be fixed")
return buckets
def single_pdb_local_variants(seq, target_num, num_subs_list, chars, seq_idxs, rng):
""" generate local variants for a single PDB file.
given the target number of variants, and the max number of substitutions,
this function tries to generate an equal number of variants for each possible number of substitutions """
# print out some info
print("aa sequence: {}".format(seq))
print("aa seq len: {}".format(len(seq)))
# want to distribute number of target seqs evenly across range(max_subs)
# single mutants probably not have enough possible variants
max_variants = [max_possible_variants(len(seq_idxs), num_subs, len(chars)) for num_subs in num_subs_list]
# print("max variants: {}".format(max_variants))
# distribute the target_num variants to the range of substitutions
variants_per_num_subs = distribute_into_buckets(target_num, len(num_subs_list), max_variants)
# now generate the actual variants
variants = []
for num_subs, num_v, max_v in zip(num_subs_list, variants_per_num_subs, max_variants):
# print("getting sample: {} subs, {} variants".format(num_subs, num_v))
if num_v == max_v:
print("num_subs: {} num_v: {} max_v: {} approach: gen all".format(num_subs, num_v, max_v))
variants += list(gen_all_variants(seq, num_subs, chars, seq_idxs))
elif num_v / max_v > 0.4:
print("num_subs: {} num_v: {} max_v: {} approach: gen all, then sample".format(num_subs, num_v, max_v))
# gen_sample could be slow if we are generating a sample approaching the max number of variants
# in that case it would be much faster to just generate all and select a sample from the pre-generated ones
all_variants = list(gen_all_variants(seq, num_subs, chars, seq_idxs))
# variants += random.sample(all_variants, num_v)
variants += rng.choice(all_variants, num_v, replace=False).tolist()
else:
print("num_subs: {} num_v: {} max_v: {} approach: sample".format(num_subs, num_v, max_v))
variants += gen_sample(seq, num_v, num_subs, chars, seq_idxs, rng)
return variants
def get_subvariants(variant, num_subs):
# num_subs must be less than number of substitutions in the given variant
if num_subs >= len(variant.split(",")):
raise ValueError("num_subs must be less than the number of substitutions in the given variant ({})".format(
len(variant.split(","))))
sv = [",".join(muts) for muts in list(itertools.combinations(variant.split(","), num_subs))]
# should be in sorted order if the given main variant is in sorted order, but sort here just in case
sv = utils.sort_variant_mutations(sv)
return sv
def load_db_variants(db_fn: str, pdb_fn: str) -> set:
# todo: can this be sped up using connectorX?
print("Loading existing database variants for pdb file: {}...".format(basename(pdb_fn)))
start = time.time()
engine = sqla.create_engine('sqlite:///{}'.format(db_fn))
conn = engine.connect().execution_options(stream_results=True)
query = "SELECT mutations FROM variant WHERE `pdb_fn` == \"{}\"".format(basename(pdb_fn))
db = set(pd.read_sql_query(query, conn, coerce_float=False)["mutations"])
conn.close()
engine.dispose()
print("Loaded existing database variants in {}".format(time.time() - start))
return db
def gen_subvariants_vlist(seq: str,
target_num: int,
min_num_subs: int,
max_num_subs: int,
chars: Union[list[str], tuple[str, ...]],
seq_idxs: Sequence[int],
rng: np.random.Generator,
db_pdb_fn: str,
db_fn: Optional[str] = None):
# max_num_subs determines the maximum number of substitutions for the main variants
# min_num_subs determines the minimum number of substitutions for subvariants
# so for example, if min_num_subs is 2, then this function won't generate subvariants with 1 substitution
# target_num is the number of variants to generate (approximate)
# If db_fn is specified, this function will check to see if the generated variants exists in the DB already,
# and if so, it won't return them from this function. note it only some of the subvariants are in the db,
# then this will still return the ones that aren't in the DB.
db = None
if db_fn is not None:
db = load_db_variants(db_fn, db_pdb_fn)
# using a set and a list to maintain the order
# this is slower and uses 2x the memory, but the final variant list will be ordered
variants_set = set()
variants_list = []
while len(variants_list) < target_num:
# generate a variant with max_num_subs substitutions
main_v = gen_sample(seq, num_mutants=1, num_subs=max_num_subs, chars=chars, seq_idxs=seq_idxs, rng=rng)[0]
# if the variant has already been generated, continue to next one
if main_v in variants_set:
continue
# now generate all subvariants for this variant
# generating subvariants for all number of substitutions down to single variants
av = [main_v]
for i in reversed(range(min_num_subs, max_num_subs)):
av += get_subvariants(main_v, i)
# now add this variant and all subvariants to the main list (as long as they are not already there)
for v in av:
# check if the variant is already in the set or already in the DB
variant_in_set = v in variants_set
if variant_in_set:
print("Generated variant already in set: {}".format(v))
variant_in_db = False
if db_fn is not None:
if v in db:
variant_in_db = True
print("Generated variant already in database: {}".format(v))
if not variant_in_set and not variant_in_db:
# only add variant to master list if it's not already in the set and it's not in the db
variants_set.add(v)
variants_list.append(v)
return variants_list
def gen_subvariants_sample(db_fn: str,
db_pdb_fn: str,
target_num: int,
min_num_subs: int,
max_num_subs: int,
rng: np.random.Generator):
"""
Generate a subvariants sample of an existing database...
Will only include variants that exist in the given database,
but will sample those variants using a subvariants approach
"""
# load all the variants for the given pdb_fn from the database
db_variants_set = load_db_variants(db_fn, db_pdb_fn)
db_variants = list(db_variants_set)
df = pd.DataFrame({"variant": db_variants, "num_mutations": [len(v.split(",")) for v in db_variants]})
# iteratively sample variants with max_num_subs
df_max_subs = df[df["num_mutations"] == max_num_subs]
# using a set and a list to maintain the order
variants_set = set()
variants_list = []
# create the list of max_subs variants to sample from, basically just shuffle df_max_subs
df_max_subs = df_max_subs.sample(frac=1, random_state=rng.bit_generator).reset_index(drop=True)
main_v_index = 0
while len(variants_list) < target_num:
# sample the next variant from df_max_subs
if main_v_index > len(df_max_subs) - 1:
# there aren't enough df_max_subs variants to sample from to put together the target number of variants
raise ValueError("Not enough {}-variants to sample from".format(max_num_subs))
main_v = df_max_subs.iloc[main_v_index]["variant"]
main_v_index += 1
# generate all subvariants for this variant
av = [main_v]
for i in reversed(range(min_num_subs, max_num_subs)):
av += get_subvariants(main_v, i)
# add this variant and subvariants to the main list
# but first check to ensure they are not already in the list
# and also check to make sure they ARE in the database (because we are sampling the existing db)
# note the main_v is guaranteed to be in the database because that's where we sampled it from,
# but it's just easier to include it with all the subvariants
for v in av:
variant_in_set = v in variants_set
if variant_in_set:
print("Generated variant already in set: {}".format(v))
variant_in_db = True
if v not in db_variants_set:
variant_in_db = False
print("Generated subvariant does NOT exist in database, skipping: {}".format(v))
if not variant_in_set and variant_in_db:
variants_set.add(v)
variants_list.append(v)
return variants_list
def human_format(num):
"""https://stackoverflow.com/questions/579310/formatting-long-numbers-as-strings-in-python"""
num = float('{:.3g}'.format(num))
magnitude = 0
while abs(num) >= 1000:
magnitude += 1
num /= 1000.0
return '{}{}'.format('{:f}'.format(num).rstrip('0').rstrip('.'), ['', 'K', 'M', 'B', 'T'][magnitude])
def gen_random_main(pdb_fn, seq, seq_idxs, chars, target_num, num_subs_list, num_replicates, seed, out_dir):
out_fn = "{}_random_TN-{}_NR-{}_NS-{}_RS-{}.txt".format(basename(pdb_fn)[:-4],
human_format(target_num), num_replicates,
",".join(map(str, num_subs_list)), seed)
out_fn = join(out_dir, out_fn)
if isfile(out_fn):
raise FileExistsError("Output file already exists: {}".format(out_fn))
print("Output file will be {}".format(out_fn))
# create a random number generator for this call
rng = np.random.default_rng(seed=seed)
# generate the variants
variants = single_pdb_local_variants(seq, target_num, num_subs_list, chars, seq_idxs, rng)
# multiply number of variants for variance testing
variants *= num_replicates
print_variant_info(variants)
with open(out_fn, "w") as f:
for v in variants:
f.write("{} {}\n".format(basename(pdb_fn), v))
def gen_all_main(pdb_fn: str,
seq: str,
seq_idxs: Sequence[int],
chars: list[str],
num_subs_list: list[int],
out_dir: Optional[str],
db_fn: Optional[str] = None,
db_mode: Optional[str] = None,
db_pdb_fn: Optional[str] = None,
ignore_existing_out_file: bool = False):
"""
Generate all variants for a single PDB file
db_mode: if 'filter', exclude variants that are in the given database
if 'sample', only include variants that are in the given database
"""
if (db_mode is None) ^ (db_fn is None):
raise ValueError("Both db_fn and db_mode should be specified or left as None")
# error checking -- db_mode must be "filter" or "sample"
if db_mode not in [None, "filter", "sample"]:
raise ValueError("db_mode must be None, 'filter' or 'sample'")
# if db_pdb_fn is None, set it equal to pdb_fn
# note db_pdb_fn will only be used if db_mode is "filter" or "sample"
if db_pdb_fn is None:
db_pdb_fn = pdb_fn
# if db_fn is specified, we need to have a hash of the database in the filename
db_hash = hash_db(db_fn)
# determine the output filename
if db_mode == "sample":
# only sampling variants from the given database
out_fn = "{}_all_NS-{}_sampled-DB-{}-{}.txt".format(basename(pdb_fn)[:-4],
",".join(map(str, num_subs_list)),
db_hash,
basename(db_pdb_fn)[:-4])
elif db_mode == "filter":
# excluding variants that are in the database
out_fn = "{}_all_NS-{}_filtered-DB-{}-{}.txt".format(basename(pdb_fn)[:-4],
",".join(map(str, num_subs_list)),
db_hash,
basename(db_pdb_fn)[:-4])
else:
# no database specified, just generate all variants
out_fn = "{}_all_NS-{}.txt".format(basename(pdb_fn)[:-4], ",".join(map(str, num_subs_list)))
out_fn = join(out_dir, out_fn)
# output file already exists
if isfile(out_fn):
if ignore_existing_out_file:
new_out_fn = out_fn
i = 1
while isfile(new_out_fn):
new_out_fn = "{}_{}.txt".format(out_fn[:-4], i)
i += 1
out_fn = new_out_fn
else:
raise FileExistsError("Output file already exists: {}".format(out_fn))
print("Output file will be {}".format(out_fn))
for i in num_subs_list:
mp = max_possible_variants(len(seq_idxs), i, len(chars))
print("Generating {} {}-mutation variants".format(mp, i))
variants = []
for i in num_subs_list:
variants += list(gen_all_variants(seq, i, chars, seq_idxs))
if db_mode == "sample":
# database sample mode, only include variants that are in the database
db_variants = load_db_variants(db_fn, db_pdb_fn)
variants = [v for v in variants if v in db_variants]
# database filter mode, exclude any variants that are in the database
elif db_mode == "filter":
# filter out variants already in the database if one is provided
db_variants = load_db_variants(db_fn, db_pdb_fn)
variants = [v for v in variants if v not in db_variants]
print_variant_info(variants)
with open(out_fn, "w") as f:
for v in variants:
f.write("{} {}\n".format(basename(pdb_fn), v))
def hash_db(db_fn):
db_hash = 0
if db_fn is not None:
print("Hashing database...")
start = time.time()
hash_obj = hashlib.shake_128()
with open(db_fn, "rb") as f:
for byte_block in iter(lambda: f.read(4096), b""):
hash_obj.update(byte_block)
db_hash = hash_obj.hexdigest(4)
print("Hashing database finished in {}".format(time.time() - start))
return db_hash
def gen_subvariants_main(pdb_fn: str,
seq: str,
seq_idxs: Sequence[int],
chars: list[str],
target_num: int,
max_num_subs: int,
min_num_subs: int,
seed: int,
out_dir: str,
db_fn: Optional[str] = None,
db_mode: Optional[str] = None,
db_pdb_fn: Optional[str] = None):
if (db_mode is None) ^ (db_fn is None):
raise ValueError("Both db_fn and db_mode should be specified or left as None")
if db_mode is not None and db_mode not in ["filter", "sample"]:
raise ValueError("db_mode must be None, 'filter' or 'sample'")
# db_pdb_fn is used to query the database for database modes 'filter' and 'sample'
# if None, then use the same PDB file for which we are generating variants
if db_pdb_fn is None:
db_pdb_fn = pdb_fn
# if db_fn is specified, we need to have a hash of the database in the filename
db_hash = hash_db(db_fn)
# determine the output filename
# todo: hard for the filename can't communicate all the provenance...maybe have additional metadata file?
out_fn_template = "{}_subvariants_TN-{}_MAXS-{}_MINS-{}_{}-DB-{}-{}_RS-{}.txt"
out_fn_template_args = [
basename(pdb_fn).rsplit('.', 1)[0],
human_format(target_num),
max_num_subs,
min_num_subs,
"sampled" if db_mode == "sample" else "filtered",
db_hash,
basename(db_pdb_fn).rsplit('.', 1)[0],
seed
]
out_fn = out_fn_template.format(*out_fn_template_args)
out_fn = join(out_dir, out_fn)
if isfile(out_fn):
raise FileExistsError("Output file already exists: {}".format(out_fn))
print("Output file will be {}".format(out_fn))
# create a random number generator for this call
rng = np.random.default_rng(seed=seed)
# generate the variants
if db_mode == "sample":
# just a type hint because if db_mode is "sample" then the error checking ensures db_fn is str
db_fn: str
# sampling needs a special function that selects the main variant from the database
variants = gen_subvariants_sample(db_fn, db_pdb_fn, target_num, min_num_subs, max_num_subs, rng)
elif db_mode == "filter" or db_mode is None:
# this can handle db_fn being None or db_mode being "filter"
variants = gen_subvariants_vlist(seq, target_num, min_num_subs, max_num_subs, chars, seq_idxs, rng, db_pdb_fn, db_fn)
else:
raise ValueError("db_mode must be None, 'filter' or 'sample'")
print_variant_info(variants)
# save output to file
with open(out_fn, "w") as f:
for v in variants:
f.write("{} {}\n".format(basename(pdb_fn), v))
def print_variant_info(variants):
# print out info about the generated variants
print("Generated {} variants".format(len(variants)))
count = Counter([len(v.split(",")) for v in variants])
for k, v in count.items():
print("{}-mutants: {}".format(k, v))
def get_seq_idxs(seq: str,
seq_idxs_range_start: Optional[int],
seq_idxs_range_end: Optional[int]) -> np.ndarray:
range_start = seq_idxs_range_start
if range_start is None:
range_start = 0
range_end = seq_idxs_range_end
if range_end is None:
range_end = len(seq)
seq_idxs = np.arange(range_start, range_end)
return seq_idxs
def main(args):
chars = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
for pdb_fn in args.pdb_fn:
print("Generating variant list for {}".format(pdb_fn))
seq = utils.extract_seq_from_pdb(pdb_fn, chain_id=args.chain_id, error_on_multiple_chains=True)
seq_idxs = get_seq_idxs(seq, args.seq_idxs_range_start, args.seq_idxs_range_end)
# grab a random, random seed
seed = args.seed
if seed is None:
seed = random.randint(100000000, 999999999)
if args.method == "subvariants":
gen_subvariants_main(pdb_fn=pdb_fn,
seq=seq,
seq_idxs=seq_idxs,
chars=chars,
target_num=args.target_num,
max_num_subs=args.max_num_subs,
min_num_subs=args.min_num_subs,
seed=seed,
out_dir=args.out_dir,
db_fn=args.db_fn,
db_mode=args.db_mode,
db_pdb_fn=args.db_pdb_fn)
elif args.method == "random":
gen_random_main(pdb_fn, seq, seq_idxs, chars,
args.target_num, args.num_subs_list, args.num_replicates, seed, args.out_dir)
elif args.method == "all":
gen_all_main(pdb_fn=pdb_fn,
seq=seq,
seq_idxs=seq_idxs,
chars=chars,
num_subs_list=args.num_subs_list,
out_dir=args.out_dir,
db_fn=args.db_fn,
db_mode=args.db_mode,
db_pdb_fn=args.db_pdb_fn,
ignore_existing_out_file=args.ignore_existing_out_file)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
fromfile_prefix_chars="@")
parser.add_argument("method",
help="what method to use to generate variants",
type=str,
choices=["all", "random", "subvariants"])
# common args
parser.add_argument("--pdb_fn",
help="the PDB file from which to generate variants. can specify multiple.",
type=str,
nargs="+")
parser.add_argument("--chain_id",
help="the chain ID to use for the PDB file",
type=str,
default=None)
parser.add_argument("--seq_idxs_range_start",
help="the start of the range where to mutate the pdb_fn sequence. 0-based indexing.",
type=int,
default=None)
parser.add_argument("--seq_idxs_range_end",
help="the end of the range where to mutate the pdb_fn sequence, EXCLUSIVE. 0-based indexing",
type=int,
default=None)
parser.add_argument("--target_num",
type=int,
help="target number of variants per pdb_fn")
parser.add_argument("--seed",
type=int,
help="random seed, None for a random random seed",
default=None)
parser.add_argument("--out_dir",
type=str,
help="output directory for variant lists",
default="variant_lists")
parser.add_argument("--db_fn",
type=str,
help="database filename, if specified, will not generate variants already in database",
default=None)
parser.add_argument("--db_mode",
type=str,
help="if 'filter', exclude variants that are in the given database. "
"if 'sample', only include variants that are in the given database",
default=None,
choices=["filter", "sample"])
parser.add_argument("--db_pdb_fn",
type=str,
help="the PDB file to use for the database. if None, use the same PDB file as the one "
"being used to generate variants",
default=None)
parser.add_argument("--ignore_existing_out_file",
action="store_true",
default=False,
help="ignore existing filename, create a new one with appended number")
# random args
parser.add_argument("--num_subs_list",
type=int,
help="for random and 'all' method, numbers of substitutions for variants",
nargs="+",
default=[1, 2])
parser.add_argument("--num_replicates",
type=int,
help="for random method, the maximum number of replicates of each variant",
default=1)
# subvariants args
parser.add_argument("--max_num_subs",
type=int,
help="for subvariants method, the maximum number of substitutions for a variant",
default=5)
parser.add_argument("--min_num_subs",
type=int,
help="for subvariants method, the minimum number of substitutions for a variant",
default=1)
main(parser.parse_args())