From 27e6f14f12aac62140504adbaa24bb97d610ba6a Mon Sep 17 00:00:00 2001 From: Maohua Yang Date: Fri, 18 Aug 2023 18:26:37 +0800 Subject: [PATCH] validate and repair for ligand (#41) * fix: scan bug * version 0.1.5 * fix: add validate ligand tool * fix: add validate ligand tool * feat: add ligand prepare * fix: add forcefield check * feat: add a new tool unigbsa-validate * fix: add protprep for lauching * version 0.1.6 --- launching/app.py | 26 +++++- setup.py | 3 +- unigbsa/CLI.py | 20 +++++ unigbsa/pipeline.py | 17 ++-- unigbsa/simulation/utils.py | 163 +++++++++++++++++++++++++++++++++++- unigbsa/version.py | 2 +- 6 files changed, 219 insertions(+), 12 deletions(-) diff --git a/launching/app.py b/launching/app.py index bb1f18b..2c19867 100644 --- a/launching/app.py +++ b/launching/app.py @@ -179,6 +179,22 @@ class GBSAModel( ... +def prep_protein(pdbfile): + outfile = pdbfile[:-4] + '_prep.pdb' + args = [ + '-i', pdbfile, + '-o', outfile, + '-mode', 'gromacs', + ] + print(args) + argstr = ' '.join(args) + cmd = '/bin/bash -c "source ~/.bashrc; conda activate protprep; unisp-prot %s"'%argstr + RC = os.system(cmd) + if RC != 0: + return -2 + return outfile + + def gbsa_runner(opts: GBSAModel) -> int: status = 0 try: @@ -217,6 +233,9 @@ def gbsa_runner(opts: GBSAModel) -> int: } } input_protein = opts.input_protein.get_path() + preped_protein = prep_protein(input_protein) + if preped_protein == -2: + raise Exception('ERROR prep protein file.') input_dir = os.path.dirname(input_protein) output_dir = str(opts.output_dir) if not os.path.exists(output_dir): @@ -228,9 +247,10 @@ def gbsa_runner(opts: GBSAModel) -> int: cmd = sh.Command('unigbsa-pipeline') args = [ - '-i', opts.input_protein.get_path(), - '-l', " ".join(opts.input_ligands), - '-c', configfile, + '-i', preped_protein, + '-l', *opts.input_ligands, + '-validate', + '-c', str(configfile), '-o', csvoutfile ] diff --git a/setup.py b/setup.py index 48b2ad1..c1d771c 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,8 @@ def get_readme(rel_path): 'unigbsa-buildsys = unigbsa.CLI:simulation_builder', 'unigbsa-md = unigbsa.CLI:simulation_run', 'unigbsa-plots = unigbsa.CLI:mmpbsa_plot', - 'unigbsa-scan = unigbsa.scanparas.scan:main' + 'unigbsa-scan = unigbsa.scanparas.scan:main', + 'unigbsa-validate = unigbsa.CLI:ligand_check' ]}, include_package_data=True ) diff --git a/unigbsa/CLI.py b/unigbsa/CLI.py index 9352602..c83eb08 100644 --- a/unigbsa/CLI.py +++ b/unigbsa/CLI.py @@ -7,6 +7,7 @@ from .utils import generate_index_file, process_pbc from .simulation import topology, mdrun +from .simulation.utils import ligand_validate from .settings import logging from .version import __version__ @@ -292,3 +293,22 @@ def mmpbsa_plot(): func = funcdic[fname] logging.info('Analysis file: %s'%fname) func(datfile, outdir=oup) + + +def ligand_check(): + parser = argparse.ArgumentParser(description='Analysis and plot results for MMPBSA.') + parser.add_argument('-i', help='Ligand file to validate.', required=True) + parser.add_argument('-o', help='Ligand file after validate. default: None', default=None) + parser.add_argument('-v', '--version', action='version', version="{prog}s ({version})".format(prog="%(prog)", version=__version__)) + + args = parser.parse_args() + ligandfile, outfile = args.i, args.o + if not outfile: + outfile = ligand_validate(ligandfile, outfile) + if outfile == ligandfile: + print(f'{ligandfile} is OK as the input for unigbsa.') + else: + print(f'{ligandfile} is not OK as the input for unigbsa. But unigbsa-validate can repair it.') + else: + outfile = ligand_validate(ligandfile, outfile) + print(f'{outfile} is OK as the input for unigbsa.') diff --git a/unigbsa/pipeline.py b/unigbsa/pipeline.py index 1ab9587..12dd60f 100644 --- a/unigbsa/pipeline.py +++ b/unigbsa/pipeline.py @@ -10,6 +10,7 @@ from unigbsa.utils import generate_index_file, load_configue_file from unigbsa.simulation.mdrun import GMXEngine from unigbsa.simulation.topology import build_topol, build_protein +from unigbsa.simulation.utils import ligand_validate from unigbsa.settings import logging, DEFAULT_CONFIGURE_FILE, GMXEXE, set_OMP_NUM_THREADS from tqdm import tqdm @@ -52,7 +53,7 @@ def traj_pipeline(complexfile, trajfile, topolfile, indexfile, pbsaParas=None, m print('%6d %4s %18.4f '%(irow['Frames'], irow['mode'], irow['TOTAL'])) return detal_G -def base_pipeline(receptorfile, ligandfiles, paras, nt=1, mmpbsafile=None, outfile='BindingEnergy.csv', verbose=False): +def base_pipeline(receptorfile, ligandfiles, paras, nt=1, mmpbsafile=None, outfile='BindingEnergy.csv', validate=False, verbose=False): """ This function takes a receptorfile and ligandfile, and build a complex.pdb and complex.top file @@ -81,7 +82,8 @@ def base_pipeline(receptorfile, ligandfiles, paras, nt=1, mmpbsafile=None, outfi if not os.path.exists(ligandName): os.mkdir(ligandName) os.chdir(ligandName) - + if validate: + ligandfile = ligand_validate(ligandfile, ligandName+'.mol') grofile = 'complex.pdb' topfile = 'complex.top' logging.info('Build ligand topology: %s'%ligandName) @@ -118,7 +120,7 @@ def base_pipeline(receptorfile, ligandfiles, paras, nt=1, mmpbsafile=None, outfi def single(arg): - receptor, ligandfile, simParas, ligandfiles, mmpbsafile, nt, pbsaParas, verbose, receptorfile = arg + receptor, ligandfile, simParas, ligandfiles, mmpbsafile, nt, pbsaParas, validate, verbose, receptorfile = arg d1 = pd.DataFrame({'Frames': 1, 'mode':pbsaParas['modes'], 'complex':0.0,'receptor':0.0,'ligand':0.0,'Internal':0.0,'Van der Waals':0.0,'Electrostatic':0,'Polar Solvation':0.0,'Non-Polar Solvation':0.0,'Gas':0.0,'Solvation':0.0,'TOTAL':0.0}, index=[1]) cwd = os.getcwd() statu = 'S' @@ -127,6 +129,8 @@ def single(arg): if not os.path.exists(ligandName): os.mkdir(ligandName) os.chdir(ligandName) + if validate: + ligandfile = ligand_validate(ligandfile, ligandName+'.mol') grofile = 'complex.pdb' topfile = 'complex.top' if len(ligandfiles) == 1: @@ -179,7 +183,7 @@ def single(arg): d1['status'] = statu return d1 -def minim_pipeline(receptorfile, ligandfiles, paras, mmpbsafile=None, nt=1, outfile='BindingEnergy.csv', verbose=False): +def minim_pipeline(receptorfile, ligandfiles, paras, mmpbsafile=None, nt=1, outfile='BindingEnergy.csv', validate=False, verbose=False): """ It runs the simulation pipeline for each ligand. @@ -199,7 +203,7 @@ def minim_pipeline(receptorfile, ligandfiles, paras, mmpbsafile=None, nt=1, outf args = [(receptor, ligandfile, simParas, ligandfiles, mmpbsafile, 1, - pbsaParas, verbose, receptorfile) for ligandfile in sorted(ligandfiles)] + pbsaParas, validate, verbose, receptorfile) for ligandfile in sorted(ligandfiles)] if len(args) == 1: df = single(args[0]) else: @@ -285,6 +289,7 @@ def main(args=None): parser.add_argument('-d', dest='ligdir', help='Floder contains many ligand files. file format: .mol or .sdf', default=None) parser.add_argument('-f', dest='pbsafile', help='gmx_MMPBSA input file. default=None', default=None) parser.add_argument('-o', dest='outfile', help='Output file.', default='BindingEnergy.csv') + parser.add_argument('-validate', help='Validate the ligand file. default: False', action='store_true', default=False) parser.add_argument('-nt', dest='thread', help='Set number of thread to run this program.', type=int, default=multiprocessing.cpu_count()) parser.add_argument('--decomp', help='Decompose the free energy. default:False', action='store_true', default=False) parser.add_argument('--verbose', help='Keep all the files.', action='store_true', default=False) @@ -323,7 +328,7 @@ def main(args=None): paras['GBSA']['modes'] = gbtype if paras['simulation']['mode'] == 'em': - minim_pipeline(receptorfile=receptor, ligandfiles=ligands, paras=paras, outfile=outfile, mmpbsafile=mmpbsafile, verbose=verbose, nt=nt) + minim_pipeline(receptorfile=receptor, ligandfiles=ligands, paras=paras, outfile=outfile, mmpbsafile=mmpbsafile, validate=args.validate, verbose=verbose, nt=nt) elif paras['simulation']['mode'] == 'md': md_pipeline(receptorfile=receptor, ligandfiles=ligands, paras=paras, outfile=outfile, mmpbsafile=mmpbsafile, verbose=verbose, nt=nt) elif paras['simulation']['mode'] == 'input': diff --git a/unigbsa/simulation/utils.py b/unigbsa/simulation/utils.py index e6b5fdb..7415e72 100644 --- a/unigbsa/simulation/utils.py +++ b/unigbsa/simulation/utils.py @@ -1,5 +1,10 @@ import os -from unigbsa.settings import GMXEXE +import uuid +import shutil +from pathlib import Path +from unigbsa.settings import GMXEXE, logging + + def convert_format(inputfile, filetype, outfile=None, outtype='mol'): """ Convert a file of type filetype to mol2 format @@ -335,3 +340,159 @@ def obtain_net_charge(sdfile): # Calculate the net charge of the molecule net_charge = mol.GetTotalCharge() return net_charge + + +def check_forcefield(sdfile): + sqm_key = "grms_tol=0.005,qm_theory='AM1',scfconv=1.d-5,ndiis_attempts=700,maxcyc=0" + tmpdir = Path('/tmp') / str(uuid.uuid4()) + sdfile = Path(sdfile).absolute() + net_charge = obtain_net_charge(str(sdfile)) + tmpdir.mkdir(exist_ok=True) + cwd = os.getcwd() + os.chdir(tmpdir) + cmd = f'export OMP_NUM_THREADS=1;acpype -i {sdfile} -b MOL -a gaff -c bcc -n {net_charge} -k "{sqm_key}" >acpype.log 2>&1 ' + rc = os.system(cmd) + os.chdir(cwd) + shutil.rmtree(tmpdir) + if rc != 0: + return False + else: + return True + + +def check_element(sdfile): + from openbabel import openbabel + ext = Path(sdfile).suffix[1:] + obConversion = openbabel.OBConversion() + obConversion.SetInAndOutFormats(ext, ext) + mol = openbabel.OBMol() + if not obConversion.ReadFile(mol, str(sdfile)): + return -1 + + acceptable_elements = {6, 7, 8, 16, 15, 1, 9, 17, 35, 53} + for atom in openbabel.OBMolAtomIter(mol): + element = atom.GetAtomicNum() + if element not in acceptable_elements: + return 1 + return 0 + + +def get_total_valence_electrons(sdfile): + from openbabel import openbabel + extin = Path(sdfile).suffix[1:] + obConversion = openbabel.OBConversion() + obConversion.SetInAndOutFormats(extin, extin) + mol = openbabel.OBMol() + obConversion.ReadFile(mol, str(sdfile)) + total_valence_electrons = 0 + for atom in openbabel.OBMolAtomIter(mol): + total_valence_electrons += atom.GetAtomicNum() - atom.GetFormalCharge() + return total_valence_electrons + + +def get_electronegativity(atomic_number): + electronegativity_table = { + 1: 2.20, # Hydrogen (H) + 6: 2.55, # Carbon (C) + 7: 3.04, # Nitrogen (N) + 8: 3.44, # Oxygen (O) + 15: 2.19, # Phosphorus (P) + 16: 2.58, # Sulfur (S) + 9: 3.98, # Fluorine (F) + 17: 3.16, # Chlorine (Cl) + 35: 2.96, # Bromine (Br) + 53: 2.66, # Iodine (I) + } + return electronegativity_table.get(atomic_number, 0) + + +def adjust_charge_based_on_electronegativity(ob_mol): + from openbabel import openbabel + max_electronegativity = 0 + target_atom = None + + for atom in openbabel.OBMolAtomIter(ob_mol): + if atom.GetFormalCharge() != 0: + electronegativity = get_electronegativity(atom.GetAtomicNum()) + if electronegativity > max_electronegativity: + max_electronegativity = electronegativity + target_atom = atom + + if target_atom is not None: + if target_atom.GetFormalCharge() > 0: + target_atom.SetFormalCharge(target_atom.GetFormalCharge() - 1) + elif target_atom.GetFormalCharge() < 0: + target_atom.SetFormalCharge(target_atom.GetFormalCharge() + 1) + + +def add_hydrogen(sdfile, outfile=None): + from openbabel import openbabel + extin = extout = Path(sdfile).suffix[1:] + obConversion = openbabel.OBConversion() + obConversion.SetInAndOutFormats(extin, extout) + mol = openbabel.OBMol() + obConversion.ReadFile(mol, str(sdfile)) + mol.AddHydrogens() + mol_string = obConversion.WriteString(mol) + if outfile: + with open(outfile, 'w') as fw: + fw.write(mol_string) + else: + return mol + + +def repair_ligand(sdfile, outfile=None): + from openbabel import openbabel + extin = Path(sdfile).suffix[1:] + if outfile is None: + fname = str(uuid.uuid4()) + '.' + extin + outfile = Path('/tmp') / fname + extout = Path(outfile).suffix[1:] + obConversion = openbabel.OBConversion() + obConversion.SetInAndOutFormats(extin, extout) + mol = openbabel.OBMol() + obConversion.ReadFile(mol, str(sdfile)) + mol.AddHydrogens() + mol.DeleteHydrogens() + # Correct valence information + pH = 7.4 + charge_model = openbabel.OBChargeModel.FindType("mmff94") + charge_model.ComputeCharges(mol, str(pH)) + mol.CorrectForPH() + mol.PerceiveBondOrders() + total_valence_electrons = sum([atom.GetAtomicNum() - atom.GetFormalCharge() for atom in openbabel.OBMolAtomIter(mol)]) + if total_valence_electrons % 2 == 1: + adjust_charge_based_on_electronegativity(mol) + mol_string = obConversion.WriteString(mol) + new_string = [] + for i, line in enumerate(mol_string.split('\n')): + if i >= 4 and len(line) >= 30 and line.startswith(' '): + line = line[:42] + ' 0 0 0 0 0 0 0 0 0' + new_string.append(line) + mol_string = '\n'.join(new_string) + with open(outfile, 'w') as fw: + fw.write(mol_string) + add_hydrogen(outfile, outfile) + return outfile + + +def ligand_validate(sdfile, outfile=None): + rc = check_element(sdfile) + if rc == -1: + logging.error(f'Failed to load {sdfile}, please check your input {sdfile}.') + exit(256) + elif rc == 1: + logging.error(f'Ligand file only accept C N O S P H F Cl Br I, please check your input {sdfile}.') + exit(257) + total_valence_electrons = get_total_valence_electrons(sdfile) + if total_valence_electrons % 2 == 1 or not check_forcefield(sdfile): + logging.warning(f'The total valence electrons of your ligand is odd({total_valence_electrons}) or forcefield check error, try to repair input ligand.') + outfile = repair_ligand(sdfile, outfile=outfile) + total_valence_electrons = get_total_valence_electrons(outfile) + if total_valence_electrons % 2 == 1 or not check_forcefield(outfile): + logging.error(f'The total valence electrons of your ligand is stil odd({total_valence_electrons}) or forcefield check error after repair ligand, please check your input {sdfile}.') + exit(257) + else: + return outfile + else: + return sdfile diff --git a/unigbsa/version.py b/unigbsa/version.py index 182e8f5..4301ba4 100644 --- a/unigbsa/version.py +++ b/unigbsa/version.py @@ -1,2 +1,2 @@ -__version__="0.1.5" +__version__="0.1.6"