Merge pull request #100 from choderalab/sprint-5

Updates for Sprint 5
choderalab · May 19, 2021 · 2b8de75 · 2b8de75
2 parents 9cbbfb4 + 049dd77
commit 2b8de75
Show file tree

Hide file tree

Showing 21 changed files with 1,033 additions and 187 deletions.
diff --git a/README.md b/README.md
@@ -18,7 +18,13 @@ Tools and infrastructure for automated compound discovery using Folding@home.
 2. Create a `conda` environment with the required dependencies:
 
     ``` sh
-    conda env create -n fah-xchem
+    conda env create -f environment.yml
+    ```
+
+    If the above process is slow, we recommend using [mamba](https://github.com/mamba-org/mamba.git) to speed up installation:
+
+    ```sh
+    mamba env create -f environment.yml
     ```
 
 3. Install `fah-xchem` in the environment using `pip`:
@@ -48,6 +54,7 @@ Generate representative snapshots, plots, PDF report, and static site HTML in `r
 fah-xchem generate-artifacts \
         --compound-series-analysis-file results/analysis.json \
         --config-file config.json \
+        --fragalysis-config fragalysis_config.json \
         --fah-projects-dir /path/to/projects/ \
         --fah-data-dir /path/to/data/SVR314342810/ \
         --output-dir results \
@@ -56,7 +63,6 @@ fah-xchem generate-artifacts \
         --num-procs 8
 ```
 
-
 ### Unit conventions
 
 Energies are represented in configuration and internally in units of `k T`, except when otherwise indicated. For energies in kilocalories per mole, the function or variable name should be suffixed with `_kcal`.
@@ -81,6 +87,53 @@ Some analysis options can be configured in a separate JSON file with schema give
 
 The JSON file is passed on the command line using the `--config-file` option.
 
+#### Upload to Fragalysis
+
+To upload sprint results to [Fragalysis](https://fragalysis.diamond.ac.uk/viewer/react/landing) a JSON config file may be supplied. For example,
+
+`fragalysis_config.json`
+```json
+{
+        "run": true,
+        "ligands_filename": "reliable-transformations-final-ligands.sdf",
+        "fragalysis_sdf_filename": "compound-set_foldingathome-sprint-X.sdf",
+        "ref_url": "https://url-link",
+        "ref_mols": "x00000",
+        "ref_pdb": "references.zip",
+        "target_name": "protein-target",
+        "submitter_name": "Folding@home",
+        "submitter_email": "[email protected]",
+        "submitter_institution": "institution-name",
+        "method": "Sprint X",
+        "upload_key": "upload-key",
+        "new_upload": true 
+}
+```
+
+The JSON file is passed on the command line using the `--fragalysis-config` option.
+
+Description of the JSON parameters:
+
+* `run`: specify whether to run the Fragalysis upload. If set to `false` the results will not be uploaded (even if the JSON is supplied via the `--fragalysis-config` option).
+* `ligands_filename`: the name of the SDF file to upload to Fragalysis.
+* `fragalysis_sdf_filename`: the name to use for the SDF Fragalysis upload. This will be a copy of `ligands_filename` but must be in the form `compound-set_name.sdf`.
+* `ref_url`: the url to the post that describes the work e.g. for [Sprint 5](https://discuss.postera.ai/t/folding-home-sprint-5/2423).
+* `ref_mol`: a comma separated list of the fragments that inspired the design of the new molecule (codes as they appear in fragalysis - e.g. x0104_0,x0692_0).
+* `ref_pdb`: 1) the name of the protein PDB zipped file to upload, this should be named `references.zip` (recommended) or 2) the code to the fragment pdb from fragalysis that should be used (e.g. x0692_0).
+* `target_name`: the name of the target protein.
+* `submitter_name`: the name of the submitter.
+* `submitter_email`: the email address of the submitter.
+* `submitter_institution`: the name of the institution that the submitter is associated with.
+* `method`: the method by which the results were obtained (e.g. Sprint 5).
+* `upload_key`: the unique upload key used to upload to Fragalysis.
+* `new_upload`: specifies whether to upload a new set (`true`) or to update an existing set (`false`).
+
+For more information on the upload format see this forum [post](https://discuss.postera.ai/t/providing-computed-poses-for-others-to-look-at/1155/8).
+
+A unique `upload_key` is needed to push to Fragalysis, this can be requested [here](https://fragalysis.diamond.ac.uk/viewer/cset_key/).
+
+For more information on the entire upload process see this forum [post](https://discuss.postera.ai/t/instructions-around-the-docking-results-category/1212/2).
+
 #### Server-specific configuration
 
 Paths to Folding@home project and data directories are passed on the command line. See usage examples above.
@@ -89,10 +142,10 @@ Paths to Folding@home project and data directories are passed on the command lin
 
 #### Conda
 
-This project uses [conda](https://github.com/conda/conda) to manage the environment. To set up a conda environment named `fah-xchem` with the required dependencies, run
+This project uses [conda](https://github.com/conda/conda) to manage the environment. To set up a conda environment named `fah-xchem` with the required dependencies, create the conda environment as described above. To install `fah-xchem` as `dev` run:
 
-``` sh
-conda env create -n fah-xchem
+```sh
+pip install -e .
 ```
 
 #### Running tests locally

diff --git a/environment.yml b/environment.yml
@@ -26,4 +26,7 @@ dependencies:
   - pip
   - pip:
       - git+https://github.com/openforcefield/[email protected]
+      - git+https://github.com/xchem/pandda_gemmi.git
+      - git+https://github.com/xchem/gemmi_pandda.git
+      - git+https://github.com/xchem/fragalysis-api.git
 
diff --git a/fah_xchem/analysis/__init__.py b/fah_xchem/analysis/__init__.py
@@ -18,13 +18,14 @@
     Transformation,
     TransformationAnalysis,
     WorkPair,
+    FragalysisConfig,
 )
 from .diffnet import combine_free_energies
 from .exceptions import AnalysisError, DataValidationError
 from .extract_work import extract_work_pair
 from .free_energy import compute_relative_free_energy
 from .plots import generate_plots
-from .report import generate_report
+from .report import generate_report, gens_are_consistent
 from .structures import generate_representative_snapshots
 from .website import generate_website
 
@@ -72,6 +73,7 @@ def analyze_transformation(
     projects: ProjectPair,
     server: FahConfig,
     config: AnalysisConfig,
+    filter_gen_consistency: Optional[bool] = True,
 ) -> TransformationAnalysis:
 
     analyze_phase_partial = partial(
@@ -80,16 +82,25 @@ def analyze_transformation(
 
     complex_phase = analyze_phase_partial(project=projects.complex_phase)
     solvent_phase = analyze_phase_partial(project=projects.solvent_phase)
+    binding_free_energy = (
+        complex_phase.free_energy.delta_f - solvent_phase.free_energy.delta_f
+    )
+
+    # Check for consistency across GENS, if requested
+    consistent_bool = None
+    if filter_gen_consistency:
+        consistent_bool = gens_are_consistent(
+            complex_phase=complex_phase, solvent_phase=solvent_phase, nsigma=3
+        )
 
     return TransformationAnalysis(
         transformation=transformation,
-        binding_free_energy=solvent_phase.free_energy.delta_f
-        - complex_phase.free_energy.delta_f,
+        reliable_transformation=consistent_bool,
+        binding_free_energy=binding_free_energy,
         complex_phase=complex_phase,
         solvent_phase=solvent_phase,
     )
 
-
 def analyze_transformation_or_warn(
     transformation: Transformation, **kwargs
 ) -> Optional[TransformationAnalysis]:
@@ -129,6 +140,14 @@ def analyze_compound_series(
             if result is not None
         ]
 
+    # Sort transformations by RUN
+    # transformations.sort(key=lambda transformation_analysis : transformation_analysis.transformation.run_id)
+    # Sort transformations by free energy difference
+    transformations.sort(
+        key=lambda transformation_analysis: transformation_analysis.binding_free_energy.point
+    )
+
+    # Warn about failures
     num_failed = len(series.transformations) - len(transformations)
     if num_failed > 0:
         logging.warning(
@@ -147,6 +166,7 @@ def analyze_compound_series(
 
 def generate_artifacts(
     series: CompoundSeriesAnalysis,
+    fragalysis_config: FragalysisConfig,
     timestamp: dt.datetime,
     projects_dir: str,
     data_dir: str,
@@ -159,6 +179,7 @@ def generate_artifacts(
     plots: bool = True,
     report: bool = True,
     website: bool = True,
+    overwrite: bool = False,
 ) -> None:
 
     complex_project_dir = os.path.join(
@@ -179,6 +200,7 @@ def generate_artifacts(
             max_binding_free_energy=config.max_binding_free_energy,
             cache_dir=cache_dir,
             num_procs=num_procs,
+            overwrite=overwrite,
         )
 
     if plots:
@@ -188,11 +210,17 @@ def generate_artifacts(
             timestamp=timestamp,
             output_dir=output_dir,
             num_procs=num_procs,
+            overwrite=overwrite,
         )
 
     if snapshots and report:
         logging.info("Generating pdf report")
-        generate_report(series=series, results_path=output_dir)
+        generate_report(
+            series=series,
+            results_path=output_dir,
+            max_binding_free_energy=config.max_binding_free_energy,
+            fragalysis_config=fragalysis_config,
+        )
 
     if website:
         logging.info("Generating website")

diff --git a/fah_xchem/analysis/constants.py b/fah_xchem/analysis/constants.py
@@ -1,5 +1,8 @@
 from simtk.openmm import unit
 from openmmtools.constants import kB
+import numpy as np
 
 # TODO: should be a configuration parameter
 KT_KCALMOL = kB * 300 * unit.kelvin / unit.kilocalories_per_mole
+KCALMOL_KT = 1.0 / KT_KCALMOL
+KT_PIC50 = np.log10(np.e)
diff --git a/fah_xchem/analysis/diffnet.py b/fah_xchem/analysis/diffnet.py
@@ -5,6 +5,7 @@
 import networkx as nx
 import numpy as np
 from scipy.special import logsumexp
+from .constants import KT_KCALMOL, KCALMOL_KT
 
 from ..schema import (
     Compound,
@@ -78,7 +79,7 @@ def get_compound_free_energy(microstates: List[MicrostateAnalysis]) -> PointEsti
     g_err = np.array([p[0].stderr for p in microstate_free_energies])
     s_err = np.array([p[1].stderr for p in microstate_free_energies])
 
-    gs = g + s + logsumexp(-s)
+    gs = g - (- s - logsumexp(-s))
 
     # TODO: check the error propagation below. It was written in a hurry!
     # Error propagation for gs
@@ -121,7 +122,7 @@ def _validate_inputs(
     )
 
     for microstate in compound_microstates - transformation_microstates:
-        logging.warning("No transformation data for microstate '%s'", microstate)
+        logging.info("No transformation data for microstate '%s'", microstate)
 
     missing_microstates = transformation_microstates - compound_microstates
     if missing_microstates:
@@ -285,16 +286,16 @@ def combine_free_energies(
 
         # Compute normalized microstate probabilities
         p_is = np.exp(-g_is - logsumexp(-g_is))
-
+        
         # Apportion compound K_a according to microstate probability
         Ka_is = p_is * np.exp(-g_exp_compound)
-
+        
         for (node, _), Ka in zip(valid_nodes, Ka_is):
             if node in supergraph:
                 supergraph.nodes[node]["g_exp"] = -np.log(Ka)
                 # NOTE: naming of uncertainty fixed by Arsenic convention
                 # TODO: remove hard-coded value
-                supergraph.nodes[node]["g_dexp"] = 0.1
+                supergraph.nodes[node]["g_dexp"] = 0.1*KCALMOL_KT
             else:
                 logging.warning(
                     "Compound microstate '%s' has experimental data, "
@@ -307,7 +308,7 @@ def combine_free_energies(
     # energies.
 
     for graph in valid_subgraphs:
-        gs, C = stats.mle(graph, factor="g_ij", node_factor="g_exp")
+        gs, C = stats.mle(graph, factor="g_ij", node_factor="g_exp")        
         errs = np.sqrt(np.diag(C))
         for node, g, g_err in zip(graph.nodes, gs, errs):
             graph.nodes[node]["g"] = g
@@ -343,7 +344,7 @@ def get_microstate_analysis(microstate: Microstate) -> MicrostateAnalysis:
         try:
             free_energy = get_compound_free_energy(microstates)
         except AnalysisError as exc:
-            logging.warning(
+            logging.info(
                 "Failed to estimate free energy for compound '%s': %s",
                 compound.metadata.compound_id,
                 exc,

diff --git a/fah_xchem/analysis/free_energy.py b/fah_xchem/analysis/free_energy.py
@@ -43,14 +43,19 @@ def _mask_outliers(
     """
     mask = np.abs(a) < max_value
     if len(a) >= min_sample_size:
-        mask &= np.abs(a - np.mean(a)) < max_n_devs * np.std(a)
+        import statsmodels.api as sm        
+        # Use a robust estimator of stddev that is robust to outliers
+        mean = np.median( a[mask] )
+        stddev = np.mean( np.abs(a[mask]-mean) )
+        stddev = 5.0 # DEBUG
+        mask &= np.abs(a - mean) < max_n_devs * stddev
     return mask
 
 
 def _filter_work_values(
     works: np.ndarray,
     max_value: float = 1e4,
-    max_n_devs: float = 5,
+    max_n_devs: float = 6,
     min_sample_size: int = 10,
 ) -> np.ndarray:
     """Remove pairs of works when either is determined to be an outlier.
@@ -74,7 +79,6 @@ def _filter_work_values(
         1-D array of filtered works.
         ``out.shape == (works.size, 1)``
     """
-
     mask_work_outliers = functools.partial(
         _mask_outliers,
         max_value=max_value,
@@ -127,12 +131,13 @@ def _get_bar_overlap(works: np.ndarray) -> float:
 
     from pymbar.mbar import MBAR
 
+    # TODO : Figure out why this is throwing warnings
+
     n = len(works)
     u_kn = np.block([[works["forward"], np.zeros(n)], [np.zeros(n), works["reverse"]]])
     N_k = np.array([n, n])
 
     mbar = MBAR(u_kn, N_k)
-
     return float(mbar.computeOverlap()["scalar"])
 
 
@@ -165,6 +170,7 @@ def compute_relative_free_energy(
         dtype=[("clone", int), ("forward", float), ("reverse", float)],
     )
 
+    # TODO: Flag problematic RUN/CLONE/GEN trajectories for further analysis and debugging
     works = _filter_work_values(all_works)
 
     if len(works) < (min_num_work_values or 1):