From 27e6f14f12aac62140504adbaa24bb97d610ba6a Mon Sep 17 00:00:00 2001
From: Maohua Yang <maohuay@hotmail.com>
Date: Fri, 18 Aug 2023 18:26:37 +0800
Subject: [PATCH] validate and repair for ligand (#41)

* fix: scan bug

* version 0.1.5

* fix: add validate ligand tool

* fix: add validate ligand tool

* feat: add ligand prepare

* fix: add forcefield check

* feat: add a new tool unigbsa-validate

* fix: add protprep for lauching

* version 0.1.6
---
 launching/app.py            |  26 +++++-
 setup.py                    |   3 +-
 unigbsa/CLI.py              |  20 +++++
 unigbsa/pipeline.py         |  17 ++--
 unigbsa/simulation/utils.py | 163 +++++++++++++++++++++++++++++++++++-
 unigbsa/version.py          |   2 +-
 6 files changed, 219 insertions(+), 12 deletions(-)

diff --git a/launching/app.py b/launching/app.py
index bb1f18b..2c19867 100644
--- a/launching/app.py
+++ b/launching/app.py
@@ -179,6 +179,22 @@ class GBSAModel(
     ...
 
 
+def prep_protein(pdbfile):
+    outfile = pdbfile[:-4] + '_prep.pdb'
+    args = [
+        '-i', pdbfile,
+        '-o', outfile,
+        '-mode', 'gromacs',
+        ]
+    print(args)
+    argstr = ' '.join(args)
+    cmd = '/bin/bash -c "source ~/.bashrc; conda activate protprep; unisp-prot %s"'%argstr
+    RC = os.system(cmd)
+    if RC != 0:
+        return -2
+    return outfile
+
+
 def gbsa_runner(opts: GBSAModel) -> int:
     status = 0
     try:
@@ -217,6 +233,9 @@ def gbsa_runner(opts: GBSAModel) -> int:
                 }
             }
         input_protein = opts.input_protein.get_path()
+        preped_protein = prep_protein(input_protein)
+        if preped_protein == -2:
+            raise Exception('ERROR prep protein file.')
         input_dir = os.path.dirname(input_protein)
         output_dir = str(opts.output_dir)
         if not os.path.exists(output_dir):
@@ -228,9 +247,10 @@ def gbsa_runner(opts: GBSAModel) -> int:
 
         cmd = sh.Command('unigbsa-pipeline')
         args = [
-            '-i', opts.input_protein.get_path(),
-            '-l', " ".join(opts.input_ligands),
-            '-c', configfile,
+            '-i', preped_protein,
+            '-l', *opts.input_ligands,
+            '-validate',
+            '-c', str(configfile),
             '-o', csvoutfile
         ]
 
diff --git a/setup.py b/setup.py
index 48b2ad1..c1d771c 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,8 @@ def get_readme(rel_path):
          'unigbsa-buildsys = unigbsa.CLI:simulation_builder',
          'unigbsa-md = unigbsa.CLI:simulation_run',
          'unigbsa-plots = unigbsa.CLI:mmpbsa_plot',
-         'unigbsa-scan = unigbsa.scanparas.scan:main'
+         'unigbsa-scan = unigbsa.scanparas.scan:main',
+         'unigbsa-validate = unigbsa.CLI:ligand_check'
      ]},
     include_package_data=True
 )
diff --git a/unigbsa/CLI.py b/unigbsa/CLI.py
index 9352602..c83eb08 100644
--- a/unigbsa/CLI.py
+++ b/unigbsa/CLI.py
@@ -7,6 +7,7 @@
 from .utils import generate_index_file, process_pbc
 
 from .simulation import topology, mdrun
+from .simulation.utils import ligand_validate
 from .settings import logging
 from .version import __version__
 
@@ -292,3 +293,22 @@ def mmpbsa_plot():
             func = funcdic[fname]
             logging.info('Analysis file: %s'%fname)
             func(datfile, outdir=oup)
+
+
+def ligand_check():
+    parser = argparse.ArgumentParser(description='Analysis and plot results for MMPBSA.')
+    parser.add_argument('-i', help='Ligand file to validate.', required=True)
+    parser.add_argument('-o', help='Ligand file after validate. default: None', default=None)
+    parser.add_argument('-v', '--version', action='version', version="{prog}s ({version})".format(prog="%(prog)", version=__version__))    
+
+    args = parser.parse_args()
+    ligandfile, outfile = args.i, args.o
+    if not outfile:
+        outfile = ligand_validate(ligandfile, outfile)
+        if outfile == ligandfile:
+            print(f'{ligandfile} is OK as the input for unigbsa.')
+        else:
+            print(f'{ligandfile} is not OK as the input for unigbsa. But unigbsa-validate can repair it.')
+    else:
+        outfile = ligand_validate(ligandfile, outfile)
+        print(f'{outfile} is OK as the input for unigbsa.')
diff --git a/unigbsa/pipeline.py b/unigbsa/pipeline.py
index 1ab9587..12dd60f 100644
--- a/unigbsa/pipeline.py
+++ b/unigbsa/pipeline.py
@@ -10,6 +10,7 @@
 from unigbsa.utils import generate_index_file, load_configue_file
 from unigbsa.simulation.mdrun import GMXEngine
 from unigbsa.simulation.topology import build_topol, build_protein
+from unigbsa.simulation.utils import ligand_validate
 from unigbsa.settings import logging, DEFAULT_CONFIGURE_FILE, GMXEXE, set_OMP_NUM_THREADS
 
 from tqdm import tqdm
@@ -52,7 +53,7 @@ def traj_pipeline(complexfile, trajfile, topolfile, indexfile, pbsaParas=None, m
         print('%6d    %4s    %18.4f  '%(irow['Frames'], irow['mode'], irow['TOTAL']))
     return detal_G
 
-def base_pipeline(receptorfile, ligandfiles, paras, nt=1, mmpbsafile=None, outfile='BindingEnergy.csv', verbose=False):
+def base_pipeline(receptorfile, ligandfiles, paras, nt=1, mmpbsafile=None, outfile='BindingEnergy.csv', validate=False, verbose=False):
     """
     This function takes a receptorfile and ligandfile, and build a complex.pdb and complex.top file
     
@@ -81,7 +82,8 @@ def base_pipeline(receptorfile, ligandfiles, paras, nt=1, mmpbsafile=None, outfi
         if not os.path.exists(ligandName):
             os.mkdir(ligandName)
         os.chdir(ligandName)
-
+        if validate:
+            ligandfile = ligand_validate(ligandfile, ligandName+'.mol')
         grofile = 'complex.pdb'
         topfile = 'complex.top'
         logging.info('Build ligand topology: %s'%ligandName)
@@ -118,7 +120,7 @@ def base_pipeline(receptorfile, ligandfiles, paras, nt=1, mmpbsafile=None, outfi
 
 
 def single(arg):
-    receptor, ligandfile, simParas, ligandfiles, mmpbsafile, nt, pbsaParas, verbose, receptorfile = arg
+    receptor, ligandfile, simParas, ligandfiles, mmpbsafile, nt, pbsaParas, validate, verbose, receptorfile = arg
     d1 = pd.DataFrame({'Frames': 1, 'mode':pbsaParas['modes'], 'complex':0.0,'receptor':0.0,'ligand':0.0,'Internal':0.0,'Van der Waals':0.0,'Electrostatic':0,'Polar Solvation':0.0,'Non-Polar Solvation':0.0,'Gas':0.0,'Solvation':0.0,'TOTAL':0.0}, index=[1])
     cwd = os.getcwd()
     statu = 'S'
@@ -127,6 +129,8 @@ def single(arg):
     if not os.path.exists(ligandName):
         os.mkdir(ligandName)
     os.chdir(ligandName)
+    if validate:
+        ligandfile = ligand_validate(ligandfile, ligandName+'.mol')
     grofile = 'complex.pdb'
     topfile = 'complex.top'
     if len(ligandfiles) == 1:
@@ -179,7 +183,7 @@ def single(arg):
     d1['status'] = statu
     return d1
 
-def minim_pipeline(receptorfile, ligandfiles, paras, mmpbsafile=None, nt=1, outfile='BindingEnergy.csv', verbose=False):
+def minim_pipeline(receptorfile, ligandfiles, paras, mmpbsafile=None, nt=1, outfile='BindingEnergy.csv', validate=False, verbose=False):
     """
     It runs the simulation pipeline for each ligand.
     
@@ -199,7 +203,7 @@ def minim_pipeline(receptorfile, ligandfiles, paras, mmpbsafile=None, nt=1, outf
 
     args = [(receptor, ligandfile, simParas,
              ligandfiles, mmpbsafile, 1,
-             pbsaParas, verbose, receptorfile) for ligandfile in sorted(ligandfiles)]
+             pbsaParas, validate, verbose, receptorfile) for ligandfile in sorted(ligandfiles)]
     if len(args) == 1:
         df = single(args[0])
     else:
@@ -285,6 +289,7 @@ def main(args=None):
     parser.add_argument('-d', dest='ligdir', help='Floder contains many ligand files. file format: .mol or .sdf', default=None)
     parser.add_argument('-f', dest='pbsafile', help='gmx_MMPBSA input file. default=None', default=None)
     parser.add_argument('-o', dest='outfile', help='Output file.', default='BindingEnergy.csv')
+    parser.add_argument('-validate', help='Validate the ligand file. default: False', action='store_true', default=False)
     parser.add_argument('-nt', dest='thread', help='Set number of thread to run this program.', type=int, default=multiprocessing.cpu_count())
     parser.add_argument('--decomp', help='Decompose the free energy. default:False', action='store_true', default=False)
     parser.add_argument('--verbose', help='Keep all the files.', action='store_true', default=False)
@@ -323,7 +328,7 @@ def main(args=None):
         paras['GBSA']['modes'] = gbtype
 
     if paras['simulation']['mode'] == 'em':
-        minim_pipeline(receptorfile=receptor, ligandfiles=ligands, paras=paras, outfile=outfile, mmpbsafile=mmpbsafile, verbose=verbose, nt=nt)
+        minim_pipeline(receptorfile=receptor, ligandfiles=ligands, paras=paras, outfile=outfile, mmpbsafile=mmpbsafile, validate=args.validate, verbose=verbose, nt=nt)
     elif paras['simulation']['mode'] == 'md':
         md_pipeline(receptorfile=receptor, ligandfiles=ligands, paras=paras, outfile=outfile, mmpbsafile=mmpbsafile, verbose=verbose, nt=nt)
     elif paras['simulation']['mode'] == 'input':
diff --git a/unigbsa/simulation/utils.py b/unigbsa/simulation/utils.py
index e6b5fdb..7415e72 100644
--- a/unigbsa/simulation/utils.py
+++ b/unigbsa/simulation/utils.py
@@ -1,5 +1,10 @@
 import os
-from unigbsa.settings import GMXEXE
+import uuid
+import shutil
+from pathlib import Path
+from unigbsa.settings import GMXEXE, logging
+
+
 def convert_format(inputfile, filetype, outfile=None, outtype='mol'):
     """
     Convert a file of type filetype to mol2 format
@@ -335,3 +340,159 @@ def obtain_net_charge(sdfile):
     # Calculate the net charge of the molecule
     net_charge = mol.GetTotalCharge()
     return net_charge
+
+
+def check_forcefield(sdfile):
+    sqm_key = "grms_tol=0.005,qm_theory='AM1',scfconv=1.d-5,ndiis_attempts=700,maxcyc=0"
+    tmpdir = Path('/tmp') / str(uuid.uuid4())
+    sdfile = Path(sdfile).absolute()
+    net_charge = obtain_net_charge(str(sdfile))
+    tmpdir.mkdir(exist_ok=True)
+    cwd = os.getcwd()
+    os.chdir(tmpdir)
+    cmd = f'export OMP_NUM_THREADS=1;acpype -i {sdfile} -b MOL -a gaff -c bcc -n {net_charge} -k "{sqm_key}" >acpype.log 2>&1 '
+    rc = os.system(cmd)
+    os.chdir(cwd)
+    shutil.rmtree(tmpdir)
+    if rc != 0:
+        return False
+    else:
+        return True
+
+
+def check_element(sdfile):
+    from openbabel import openbabel
+    ext = Path(sdfile).suffix[1:]
+    obConversion = openbabel.OBConversion()
+    obConversion.SetInAndOutFormats(ext, ext)
+    mol = openbabel.OBMol()
+    if not obConversion.ReadFile(mol, str(sdfile)):
+        return -1
+
+    acceptable_elements = {6, 7, 8, 16, 15, 1, 9, 17, 35, 53}
+    for atom in openbabel.OBMolAtomIter(mol):
+        element = atom.GetAtomicNum()
+        if element not in acceptable_elements:
+            return 1
+    return 0
+
+
+def get_total_valence_electrons(sdfile):
+    from openbabel import openbabel
+    extin = Path(sdfile).suffix[1:]
+    obConversion = openbabel.OBConversion()
+    obConversion.SetInAndOutFormats(extin, extin)
+    mol = openbabel.OBMol()
+    obConversion.ReadFile(mol, str(sdfile))
+    total_valence_electrons = 0
+    for atom in openbabel.OBMolAtomIter(mol):
+        total_valence_electrons += atom.GetAtomicNum() - atom.GetFormalCharge()
+    return total_valence_electrons
+
+
+def get_electronegativity(atomic_number):
+    electronegativity_table = {
+        1: 2.20,  # Hydrogen (H)
+        6: 2.55,  # Carbon (C)
+        7: 3.04,  # Nitrogen (N)
+        8: 3.44,  # Oxygen (O)
+        15: 2.19,  # Phosphorus (P)
+        16: 2.58,  # Sulfur (S)
+        9: 3.98,  # Fluorine (F)
+        17: 3.16,  # Chlorine (Cl)
+        35: 2.96,  # Bromine (Br)
+        53: 2.66,  # Iodine (I)
+    }
+    return electronegativity_table.get(atomic_number, 0)
+
+
+def adjust_charge_based_on_electronegativity(ob_mol):
+    from openbabel import openbabel
+    max_electronegativity = 0
+    target_atom = None
+
+    for atom in openbabel.OBMolAtomIter(ob_mol):
+        if atom.GetFormalCharge() != 0:
+            electronegativity = get_electronegativity(atom.GetAtomicNum())
+            if electronegativity > max_electronegativity:
+                max_electronegativity = electronegativity
+                target_atom = atom
+
+    if target_atom is not None:
+        if target_atom.GetFormalCharge() > 0:
+            target_atom.SetFormalCharge(target_atom.GetFormalCharge() - 1)
+        elif target_atom.GetFormalCharge() < 0:
+            target_atom.SetFormalCharge(target_atom.GetFormalCharge() + 1)
+
+
+def add_hydrogen(sdfile, outfile=None):
+    from openbabel import openbabel
+    extin = extout = Path(sdfile).suffix[1:]
+    obConversion = openbabel.OBConversion()
+    obConversion.SetInAndOutFormats(extin, extout)
+    mol = openbabel.OBMol()
+    obConversion.ReadFile(mol, str(sdfile))
+    mol.AddHydrogens()
+    mol_string = obConversion.WriteString(mol)
+    if outfile:
+        with open(outfile, 'w') as fw:
+            fw.write(mol_string)
+    else:
+        return mol
+
+
+def repair_ligand(sdfile, outfile=None):
+    from openbabel import openbabel
+    extin = Path(sdfile).suffix[1:]
+    if outfile is None:
+        fname = str(uuid.uuid4()) + '.' + extin
+        outfile = Path('/tmp') / fname
+    extout = Path(outfile).suffix[1:]
+    obConversion = openbabel.OBConversion()
+    obConversion.SetInAndOutFormats(extin, extout)
+    mol = openbabel.OBMol()
+    obConversion.ReadFile(mol, str(sdfile))
+    mol.AddHydrogens()
+    mol.DeleteHydrogens()
+    # Correct valence information
+    pH = 7.4
+    charge_model = openbabel.OBChargeModel.FindType("mmff94")
+    charge_model.ComputeCharges(mol, str(pH))
+    mol.CorrectForPH()
+    mol.PerceiveBondOrders()
+    total_valence_electrons = sum([atom.GetAtomicNum() - atom.GetFormalCharge() for atom in openbabel.OBMolAtomIter(mol)])
+    if total_valence_electrons % 2 == 1:
+        adjust_charge_based_on_electronegativity(mol)
+    mol_string = obConversion.WriteString(mol)
+    new_string = []
+    for i, line in enumerate(mol_string.split('\n')):
+        if i >= 4 and len(line) >= 30 and line.startswith(' '):
+            line = line[:42] + '  0  0  0  0  0  0  0  0  0'
+        new_string.append(line)
+    mol_string = '\n'.join(new_string)
+    with open(outfile, 'w') as fw:
+        fw.write(mol_string)
+    add_hydrogen(outfile, outfile)
+    return outfile
+
+
+def ligand_validate(sdfile, outfile=None):
+    rc = check_element(sdfile)
+    if rc == -1:
+        logging.error(f'Failed to load {sdfile}, please check your input {sdfile}.')
+        exit(256)
+    elif rc == 1:
+        logging.error(f'Ligand file only accept C N O S P H F Cl Br I, please check your input {sdfile}.')
+        exit(257)
+    total_valence_electrons = get_total_valence_electrons(sdfile)
+    if total_valence_electrons % 2 == 1 or not check_forcefield(sdfile):
+        logging.warning(f'The total valence electrons of your ligand is odd({total_valence_electrons}) or forcefield check error, try to repair input ligand.')
+        outfile = repair_ligand(sdfile, outfile=outfile)
+        total_valence_electrons = get_total_valence_electrons(outfile)
+        if total_valence_electrons % 2 == 1 or not check_forcefield(outfile):
+            logging.error(f'The total valence electrons of your ligand is stil odd({total_valence_electrons}) or forcefield check error after repair ligand, please check your input {sdfile}.')
+            exit(257)
+        else:
+            return outfile
+    else:
+        return sdfile
diff --git a/unigbsa/version.py b/unigbsa/version.py
index 182e8f5..4301ba4 100644
--- a/unigbsa/version.py
+++ b/unigbsa/version.py
@@ -1,2 +1,2 @@
-__version__="0.1.5"
+__version__="0.1.6"