domesticator3

#!/net/software/lab/domesticator_3/env/michaelangeneo/bin/python3.9
# -*- coding: utf-8 -*-

###### input handling ######
#this happens first so it fails quickly if bad inputs are given
import argparse

parser = argparse.ArgumentParser(prog='domesticator3', description='A sophisticated codon optimizer for the discerning protein designer')

parser.add_argument("proteins", type=str, nargs="+", help="Either one or more fasta files containing one or more protein sequences or one or more pdb files")
parser.add_argument("vector", type=str, help="A genbank (.gb) file containing annotations in the domesticator format to control domesticator function")

parser.add_argument("--single_protein_fasta", action="store_true",help="Assign increasing chain letters in the order that they appear in the file. Lettering resets between files. This is useful for multiple insertions with a fasta file")

parser.add_argument("--nstruct", type=int, default=10, help="number of times to repeat optimization before picking one to return. Default: %(default)d")
parser.add_argument("--max_tries", type=int, default=3, help="number of times to restart optimization before giving up if no solution is found. Default: %(default)d")
parser.add_argument("--no_idt",action="store_true",help="Turn off complexity checking using IDT's API")
#parser.add_argument("--idt_credentials_dir",type=str,default = "/home/rdkibler/projects/dom_dev/lab_shared_idt_creds", help="A path to the place to search for you stored IDT API credentials. If no info.json file is found, then you will be prompted to enter new ones and they will be stored there")
parser.add_argument("--idt_credentials_dir",type=str,default = "~/idt_credentials", help="A path to the place to search for you stored IDT API credentials. If no info.json file is found, then you will be prompted to enter new ones and they will be stored there")
parser.add_argument("--idt_threshold",type=float,default=7,help="automatically accept the first solution with IDT score under this threshold")
parser.add_argument('--idt_kind', type=str, help='kind of sequence to query', default='gene', choices = ['gene','gblock','gblock_hifi','eblock','old'])
parser.add_argument("--no_opt", action="store_true",help="bypass the gene optimization step. Useful for debugging new vectors")
parser.add_argument("--ramp_kmers_boost", type=float, default=0, help="increase the boost of the kmers objective after each failed optimization. Default: %(default)f")


parser.add_argument('--version', action='version', version='%(prog)s alpha 1.0')

args = parser.parse_args()


###### finish imports ######
# Standard library imports
import copy

# Third party imports
from dnachisel import DnaOptimizationProblem, NoSolutionError
from dnachisel import DEFAULT_SPECIFICATIONS_DICT
import Bio
from Bio.Seq import Seq
import numpy as np
import pandas as pd

# Local application imports
from tools import input_parsing, product_analysis, idt
from specifications.MinimizeNumKmers import MinimizeNumKmers

DEFAULT_SPECIFICATIONS_DICT['MinimizeNumKmers'] = MinimizeNumKmers

###### load files ######
if not args.no_idt:
	user_info_file = idt.use_dir(args.idt_credentials_dir)
	idt_user_info = idt.get_user_info(user_info_file)

base_vector_record = input_parsing.load_vector_record(args.vector)
naive_vector_records = input_parsing.make_naive_vector_records(base_vector_record,args.proteins,args.single_protein_fasta)

###### optimize ######
optimized_vector_solutions = []
for i,record in enumerate(naive_vector_records):
	print("-"*40 + f" {i+1}/{len(naive_vector_records)} " + "-"*40)
	print(f"Attempting optimization of {record.name}")
	initial_problem = DnaOptimizationProblem.from_record(record)
	if args.no_opt:
		optimized_vector_solutions.append(initial_problem)
		continue
	solutions = []
	idt_scores = []
	solution_found = False
	for iteration in range(args.nstruct):
		if solution_found:
			break
		trial = 0
		while trial < args.max_tries:
			try:
				print(f"iteration {iteration + 1}/{args.nstruct}")
				problem = copy.deepcopy(initial_problem)
				problem.resolve_constraints_by_random_mutations()
				problem.optimize()
				problem.resolve_constraints(final_check=True)
				solutions.append(problem)
				break
			except NoSolutionError:
				print("no solution found. Retrying...")
				initial_problem.max_random_iters += 1000
				trial += 1
				continue

		if trial == args.max_tries:
			print("Skipping...")
			continue

		if not args.no_idt:
			print("querying IDT")
			i = len(solutions)
			solution = solutions[-1]
			sol_record = solution.to_record()
			seq = ""
			for feature in sol_record.features:
				if feature.type == "domesticator" and feature.qualifiers['label'] == ["synthesize"]:
					seq = feature.extract(sol_record.seq)
					break
			assert seq != ""
			response = idt.query_complexity(seq, idt_user_info, kind=args.idt_kind, gene_name=record.name)
			# if len(response[0]) == 0:
			#       print("no issue!")
			score_sum = 0
			print(f"SOLUTION {i+1}:")
			for issue in response[0]:
					print(issue["Score"],issue["Name"])
					score_sum += issue["Score"]
			print(f"Total Score: {score_sum}")
			print()
			idt_scores.append(score_sum)
			if score_sum < args.idt_threshold:
				print("accept")
				solution_found = True
				break
		
		#find the avoid_kmers objective
		for objective in initial_problem.objectives:
			if type(objective) == MinimizeNumKmers:
				objective.boost = objective.boost + args.ramp_kmers_boost
				print(f"DEBUG! boosting {str(objective)} by {args.ramp_kmers_boost}. Value is now {objective.boost}")
				break


	if not args.no_idt:
		best_idx = np.argmin(idt_scores)
	else:
		scores = [solution.objectives_evaluations().scores_sum() for solution in solutions]
		best_idx = np.argmin(scores)

	best_solution = solutions[best_idx]
	optimized_vector_solutions.append(best_solution)

###### create reports and outputs ######
print("REPORT")
records_to_synthesize = []
all_protein_params = []
for optimized_vector_solution in optimized_vector_solutions:
	#optimized_vector_solution.record stores the NAIVE RECORD, so we need to use to_record()
	#however I think the annotations to the generated record (from to_record()) are bad, so instead let's just transplant the sequence
	optimized_vector_record = optimized_vector_solution.record
	optimized_vector_record.seq = Seq(optimized_vector_solution.sequence)

	Bio.SeqIO.write(optimized_vector_record, optimized_vector_record.name + ".gb","genbank")

	fragment_to_synthesize = None
	for feature in optimized_vector_record.features:
			if feature.type == "domesticator" and feature.qualifiers['label'] == ["synthesize"]:
				fragment_to_synthesize = feature.extract(optimized_vector_record.seq)
				break
	assert fragment_to_synthesize != ""
	record_to_synthesize = Bio.SeqRecord.SeqRecord(seq=fragment_to_synthesize,id=optimized_vector_record.name,name=optimized_vector_record.name,description="")

	records_to_synthesize.append(record_to_synthesize)

	with open(optimized_vector_record.name + ".log",'w') as f:
		f.write(optimized_vector_solution.constraints_text_summary() + "\n")
		f.write(optimized_vector_solution.objectives_text_summary() + "\n")

	polypeptides = product_analysis.find_polypeptides(optimized_vector_record)
	protein_params = product_analysis.get_params(polypeptides)
	all_protein_params.append(protein_params)
	#TODO: Add support for detection of protease cleavage sites and printing of the fragment params

pd.concat(all_protein_params).to_csv("translated_proteins.params")
Bio.SeqIO.write(records_to_synthesize, "order.dna.fasta","fasta")


# ## start debug block -- gives user the steering wheel
# import code
# print("now entering interactive console. Press Ctrl-D to return to the script")
# code.interact(local=locals())
# ## end debug block