-
Notifications
You must be signed in to change notification settings - Fork 2
/
domesticator3
executable file
·177 lines (144 loc) · 7.6 KB
/
domesticator3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/net/software/lab/domesticator_3/env/michaelangeneo/bin/python3.9
# -*- coding: utf-8 -*-
###### input handling ######
#this happens first so it fails quickly if bad inputs are given
import argparse
parser = argparse.ArgumentParser(prog='domesticator3', description='A sophisticated codon optimizer for the discerning protein designer')
parser.add_argument("proteins", type=str, nargs="+", help="Either one or more fasta files containing one or more protein sequences or one or more pdb files")
parser.add_argument("vector", type=str, help="A genbank (.gb) file containing annotations in the domesticator format to control domesticator function")
parser.add_argument("--single_protein_fasta", action="store_true",help="Assign increasing chain letters in the order that they appear in the file. Lettering resets between files. This is useful for multiple insertions with a fasta file")
parser.add_argument("--nstruct", type=int, default=10, help="number of times to repeat optimization before picking one to return. Default: %(default)d")
parser.add_argument("--max_tries", type=int, default=3, help="number of times to restart optimization before giving up if no solution is found. Default: %(default)d")
parser.add_argument("--no_idt",action="store_true",help="Turn off complexity checking using IDT's API")
#parser.add_argument("--idt_credentials_dir",type=str,default = "/home/rdkibler/projects/dom_dev/lab_shared_idt_creds", help="A path to the place to search for you stored IDT API credentials. If no info.json file is found, then you will be prompted to enter new ones and they will be stored there")
parser.add_argument("--idt_credentials_dir",type=str,default = "~/idt_credentials", help="A path to the place to search for you stored IDT API credentials. If no info.json file is found, then you will be prompted to enter new ones and they will be stored there")
parser.add_argument("--idt_threshold",type=float,default=7,help="automatically accept the first solution with IDT score under this threshold")
parser.add_argument('--idt_kind', type=str, help='kind of sequence to query', default='gene', choices = ['gene','gblock','gblock_hifi','eblock','old'])
parser.add_argument("--no_opt", action="store_true",help="bypass the gene optimization step. Useful for debugging new vectors")
parser.add_argument("--ramp_kmers_boost", type=float, default=0, help="increase the boost of the kmers objective after each failed optimization. Default: %(default)f")
parser.add_argument('--version', action='version', version='%(prog)s alpha 1.0')
args = parser.parse_args()
###### finish imports ######
# Standard library imports
import copy
# Third party imports
from dnachisel import DnaOptimizationProblem, NoSolutionError
from dnachisel import DEFAULT_SPECIFICATIONS_DICT
import Bio
from Bio.Seq import Seq
import numpy as np
import pandas as pd
# Local application imports
from tools import input_parsing, product_analysis, idt
from specifications.MinimizeNumKmers import MinimizeNumKmers
DEFAULT_SPECIFICATIONS_DICT['MinimizeNumKmers'] = MinimizeNumKmers
###### load files ######
if not args.no_idt:
user_info_file = idt.use_dir(args.idt_credentials_dir)
idt_user_info = idt.get_user_info(user_info_file)
base_vector_record = input_parsing.load_vector_record(args.vector)
naive_vector_records = input_parsing.make_naive_vector_records(base_vector_record,args.proteins,args.single_protein_fasta)
###### optimize ######
optimized_vector_solutions = []
for i,record in enumerate(naive_vector_records):
print("-"*40 + f" {i+1}/{len(naive_vector_records)} " + "-"*40)
print(f"Attempting optimization of {record.name}")
initial_problem = DnaOptimizationProblem.from_record(record)
if args.no_opt:
optimized_vector_solutions.append(initial_problem)
continue
solutions = []
idt_scores = []
solution_found = False
for iteration in range(args.nstruct):
if solution_found:
break
trial = 0
while trial < args.max_tries:
try:
print(f"iteration {iteration + 1}/{args.nstruct}")
problem = copy.deepcopy(initial_problem)
problem.resolve_constraints_by_random_mutations()
problem.optimize()
problem.resolve_constraints(final_check=True)
solutions.append(problem)
break
except NoSolutionError:
print("no solution found. Retrying...")
initial_problem.max_random_iters += 1000
trial += 1
continue
if trial == args.max_tries:
print("Skipping...")
continue
if not args.no_idt:
print("querying IDT")
i = len(solutions)
solution = solutions[-1]
sol_record = solution.to_record()
seq = ""
for feature in sol_record.features:
if feature.type == "domesticator" and feature.qualifiers['label'] == ["synthesize"]:
seq = feature.extract(sol_record.seq)
break
assert seq != ""
response = idt.query_complexity(seq, idt_user_info, kind=args.idt_kind, gene_name=record.name)
# if len(response[0]) == 0:
# print("no issue!")
score_sum = 0
print(f"SOLUTION {i+1}:")
for issue in response[0]:
print(issue["Score"],issue["Name"])
score_sum += issue["Score"]
print(f"Total Score: {score_sum}")
print()
idt_scores.append(score_sum)
if score_sum < args.idt_threshold:
print("accept")
solution_found = True
break
#find the avoid_kmers objective
for objective in initial_problem.objectives:
if type(objective) == MinimizeNumKmers:
objective.boost = objective.boost + args.ramp_kmers_boost
print(f"DEBUG! boosting {str(objective)} by {args.ramp_kmers_boost}. Value is now {objective.boost}")
break
if not args.no_idt:
best_idx = np.argmin(idt_scores)
else:
scores = [solution.objectives_evaluations().scores_sum() for solution in solutions]
best_idx = np.argmin(scores)
best_solution = solutions[best_idx]
optimized_vector_solutions.append(best_solution)
###### create reports and outputs ######
print("REPORT")
records_to_synthesize = []
all_protein_params = []
for optimized_vector_solution in optimized_vector_solutions:
#optimized_vector_solution.record stores the NAIVE RECORD, so we need to use to_record()
#however I think the annotations to the generated record (from to_record()) are bad, so instead let's just transplant the sequence
optimized_vector_record = optimized_vector_solution.record
optimized_vector_record.seq = Seq(optimized_vector_solution.sequence)
Bio.SeqIO.write(optimized_vector_record, optimized_vector_record.name + ".gb","genbank")
fragment_to_synthesize = None
for feature in optimized_vector_record.features:
if feature.type == "domesticator" and feature.qualifiers['label'] == ["synthesize"]:
fragment_to_synthesize = feature.extract(optimized_vector_record.seq)
break
assert fragment_to_synthesize != ""
record_to_synthesize = Bio.SeqRecord.SeqRecord(seq=fragment_to_synthesize,id=optimized_vector_record.name,name=optimized_vector_record.name,description="")
records_to_synthesize.append(record_to_synthesize)
with open(optimized_vector_record.name + ".log",'w') as f:
f.write(optimized_vector_solution.constraints_text_summary() + "\n")
f.write(optimized_vector_solution.objectives_text_summary() + "\n")
polypeptides = product_analysis.find_polypeptides(optimized_vector_record)
protein_params = product_analysis.get_params(polypeptides)
all_protein_params.append(protein_params)
#TODO: Add support for detection of protease cleavage sites and printing of the fragment params
pd.concat(all_protein_params).to_csv("translated_proteins.params")
Bio.SeqIO.write(records_to_synthesize, "order.dna.fasta","fasta")
# ## start debug block -- gives user the steering wheel
# import code
# print("now entering interactive console. Press Ctrl-D to return to the script")
# code.interact(local=locals())
# ## end debug block