From 3ef4c1ddb9cc6578133990ae0dbe75d883fff68e Mon Sep 17 00:00:00 2001
From: Gaowei Chen02 <gwchen02@f100.hn.org>
Date: Wed, 7 Feb 2024 10:55:46 +0800
Subject: [PATCH] Add error messages that are more helpful for troubleshooting.
 1. Add check.py in utils: parse the input cv files with PLUMED before running
 the workflow. 2. Add assertions in label_stats.py and run_select.py to ensure
 that std_threshold and trust_lvl_1 are suitable.

---
 rid/entrypoint/submit.py |  4 ++-
 rid/op/label_stats.py    |  2 ++
 rid/op/run_select.py     |  4 +++
 rid/utils/__init__.py    |  3 ++-
 rid/utils/check.py       | 58 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 69 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 rid/entrypoint/submit.py
 create mode 100644 rid/utils/check.py

diff --git a/rid/entrypoint/submit.py b/rid/entrypoint/submit.py
old mode 100644
new mode 100755
index ab1b0c82..cbbfb311
--- a/rid/entrypoint/submit.py
+++ b/rid/entrypoint/submit.py
@@ -2,7 +2,7 @@
 import json
 from pathlib import Path
 from typing import List, Union, Optional
-from rid.utils import load_json
+from rid.utils import load_json, check_cv_file
 import os
 
 from dflow import (
@@ -224,6 +224,8 @@ def submit_rid(
     if len(cvfile_list) == 0:
         cv_file_artifact = None
     else:
+        Rsl, Rmsg = check_cv_file(cvfile_list)
+        assert Rsl, f"An error occurred while parsing cv_files:\n\n{Rmsg}"
         cv_file_artifact = upload_artifact([Path(p) for p in cvfile_list], archive=None)
         
     if len(dpfile_list) == 0:
diff --git a/rid/op/label_stats.py b/rid/op/label_stats.py
index 53b7e3a7..8d09d23a 100644
--- a/rid/op/label_stats.py
+++ b/rid/op/label_stats.py
@@ -91,6 +91,8 @@ def execute(
                     higher_index.add(i)
         higher_index_list = list(higher_index)
         print("higher index list", list(cv_forces_list[higher_index_list]))
+        assert len(higher_index_list) < len(mf_all_std_list), (  # otherwise np.delete below leaves nothing
+            f"All the std are higher than the std_threshold ({op_in['std_threshold']}), please raise the std_threshold.")
         mf_all_std_list_modified = np.delete(mf_all_std_list, higher_index_list, axis=0)
         cv_forces_list_modified = np.delete(cv_forces_list, higher_index_list, axis=0)
         assert len(mf_all_std_list_modified) == len(cv_forces_list_modified)
diff --git a/rid/op/run_select.py b/rid/op/run_select.py
index 24ab1e91..033663fc 100644
--- a/rid/op/run_select.py
+++ b/rid/op/run_select.py
@@ -110,6 +110,10 @@ def execute(
             else:
                 stds = make_std(cls_sel_data, models=op_in["models"])
                 save_txt("cls_"+model_devi_name, stds, fmt=model_devi_precision)
+                # Fail early with a hint: the check requires some deviation above trust_lvl_1.
+                assert max(stds) > trust_lvl_1, (
+                    f"The maximum deviation of the models ({max(stds)}) does not exceed trust_lvl_1 "
+                    f"({trust_lvl_1}), causing the selected indices to be empty. Please lower trust_lvl_1.")
                 _selected_idx = select_from_devi(stds, op_in["trust_lvl_1"])
             sel_idx = cls_sel_idx[_selected_idx]
             np.save(sel_ndx_name, sel_idx)
diff --git a/rid/utils/__init__.py b/rid/utils/__init__.py
index f3b6009b..8ef83e58 100644
--- a/rid/utils/__init__.py
+++ b/rid/utils/__init__.py
@@ -12,4 +12,5 @@
 from rid.utils.format import list_to_string
 from rid.utils.command import run_command
 from rid.utils.path import set_directory
-from rid.utils.set_config import init_executor, normalize_resources
\ No newline at end of file
+from rid.utils.set_config import init_executor, normalize_resources
+from rid.utils.check import check_cv_file
\ No newline at end of file
diff --git a/rid/utils/check.py b/rid/utils/check.py
new file mode 100644
index 00000000..441d2ff5
--- /dev/null
+++ b/rid/utils/check.py
@@ -0,0 +1,58 @@
+# Troubleshooting helpers for common errors during the workflow
+
+import os
+import subprocess
+import numpy as np
+from pathlib import Path
+
+def check_cv_file(file_list : list):
+    """Parse the cv files with PLUMED to check whether they are valid.
+
+    Parameters
+    ----------
+    file_list : list
+        Paths of the cv files; exactly one of them must be a .pdb file
+        (AssertionError otherwise), the rest are PLUMED input files.
+
+    Returns
+    -------
+    Rsl : bool
+        True if every file is parsed successfully, False otherwise.
+    Rmsg : str
+        Output of the last PLUMED run (the failing one if Rsl is False),
+        or the reason why the check could not be carried out.
+    """
+    cv_file_list = [f for f in file_list if not str(f).endswith(".pdb")]
+    strut_pdb = [f for f in file_list if str(f).endswith(".pdb")]
+    if len(strut_pdb) != 1:  # explicit raise: a bare assert vanishes under -O
+        raise AssertionError("There should be only one .pdb file in the cv files.")
+    # PLUMED is run inside the input directory through cwd= instead of a
+    # process-wide os.chdir, so the caller's working directory is untouched.
+    input_dir = Path(file_list[0]).resolve().parent
+    # Use the serial number of the last ATOM record as the atom count; the
+    # .pdb is read backwards in growing chunks capped at the file size.
+    natoms = None
+    with open(strut_pdb[0], "rb") as f:
+        size = f.seek(0, 2)
+        block = min(1024, size)
+        while natoms is None and block > 0:
+            f.seek(size - block)
+            atoms = [ln for ln in f.readlines() if ln.startswith(b'ATOM')]
+            natoms = int(atoms[-1][6:11]) if atoms else None
+            block = min(block * 2, size) if block < size else 0
+    if natoms is None:
+        return False, f"No ATOM record found in {strut_pdb[0]}."
+    Rsl, Rmsg = True, ''
+    for file in cv_file_list:
+        # Argument list, not a shell string, so odd file names are safe.
+        cmd = ["plumed", "driver", "--plumed", str(Path(file).resolve()),
+               "--parse-only", "--natoms", str(natoms)]
+        try:
+            proc = subprocess.run(cmd, cwd=input_dir, text=True,
+                                  stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        except OSError as err:  # e.g. the plumed executable is missing
+            return False, f"Failed to run PLUMED: {err}"
+        Rsl, Rmsg = proc.returncode == 0, proc.stdout.rstrip("\n")
+        if not Rsl:
+            break
+    return Rsl, Rmsg