Skip to content

Commit

Permalink
Merge pull request #27 from drorlab/develop
Browse files Browse the repository at this point in the history
v0.2.5
  • Loading branch information
martinvoegele authored Sep 10, 2021
2 parents 0dbccf9 + 9adc9b8 commit 9269dfd
Show file tree
Hide file tree
Showing 18 changed files with 1,160 additions and 522 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ tests/test_data/MOR-*/.*.npz
*.ipynb
.DS_Store

# Files from workload manager
slurm*.out

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
14 changes: 7 additions & 7 deletions docs/tut-5-dimensionality.rst
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,12 @@ entire receptor, sorted by the PCs of the transmembrane region.
_ = sort_trajs_along_common_pc(sim_a_tmr_data['bb-torsions'],
sim_b_tmr_data['bb-torsions'],
feature_start_frame,
"traj/condition-a_receptor.gro",
"traj/condition-b_receptor.gro",
"traj/condition-a_receptor.xtc",
"traj/condition-b_receptor.xtc",
"pca/receptor_by_tmr",
num_pc=3)
num_pc=3, start_frame=feature_start_frame)
The above function deals with the special case of two input
trajectories. We also provide the functions for a single one (see
Expand All @@ -121,8 +120,9 @@ Here are the major steps of a PCA demonstrated for a single simulation.
.. code:: python
_, __ = sort_traj_along_pc(sim_a_tmr_data['bb-torsions'],
pca_a, feature_start_frame,
"traj/condition-a_receptor.gro",
"traj/condition-a_receptor.xtc",
"pca/condition-a_receptor_by_tmr", num_pc=3)
_, __, ___ = sort_traj_along_pc(sim_a_tmr_data['bb-torsions'],
"traj/condition-a_receptor.gro",
"traj/condition-a_receptor.xtc",
"pca/condition-a_receptor_by_tmr",
start_frame = feature_start_frame,
pca=pca_a, num_pc=3)
2 changes: 1 addition & 1 deletion pensa/comparison/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
from .relative_entropy import *
from .statespecific import *
from .visualization import *

from .metrics import *
222 changes: 222 additions & 0 deletions pensa/comparison/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
import numpy as np
from pensa import *
from pensa.comparison import *
from pensa.dimensionality import *
import random
import math


"""
Calculates the average and maximum Jensen-Shannon distance and the Kullback-Leibler divergences for each feature from two ensembles. Each of four functions uses the relative_entropy_analysis function with the same parameters.
Parameters
----------
features_a : list of str
Feature names of the first ensemble.
Can be obtained from features object via .describe().
features_b : list of str
Feature names of the first ensemble.
Can be obtained from features object via .describe().
Must be the same as features_a. Provided as a sanity check.
all_data_a : float array
Trajectory data from the first ensemble. Format: [frames,frame_data].
all_data_b : float array
Trajectory data from the second ensemble.
For kld functions, the second ensemble should be the reference ensemble.
Format: [frames,frame_data].
bin_width : float, default=None
Bin width for the axis to compare the distributions on.
If bin_width is None, bin_num (see below) bins are used and the width is determined from the common histogram.
bin_num : int, default=10
Number of bins for the axis to compare the distributions on (only if bin_width=None).
verbose : bool, default=True
Print intermediate results.
override_name_check : bool, default=False
Only check number of features, not their names.
Returns
-------
Each function returns one value.
average_jsd : float
Average Jensen-Shannon distance from two ensembles.
max_jsd : float
Maximum Jensen-Shannon distance from two ensembles.
average_kld : float
Average Kullback-Leibler divergence from two ensembles.
max_kld : float
Maximum Kullback-Leibler divergence from two ensembles.
"""

def average_jsd(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False):
_, data_jsdist, _, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check)
return np.mean(data_jsdist)


def max_jsd(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False):
_, data_jsdist, _, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check)
return np.max(data_jsdist)


def average_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False):
_, _, data_kld_ab, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check)
return np.mean(data_kld_ab)


def max_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False):
_, _, data_kld_ab, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check)
return np.max(data_kld_ab)


"""
Calculates the average and maximum Kolmogorov-Smirnov statistic for two distributions. Each of five functions uses the kolmogorov_smirnov_analysis function with the same parameters.
Parameters
----------
features_a : list of str
Feature names of the first ensemble.
Can be obtained from features object via .describe().
features_b : list of str
Feature names of the first ensemble.
Can be obtained from features object via .describe().
Must be the same as features_a. Provided as a sanity check.
all_data_a : float array
Trajectory data from the first ensemble. Format: [frames,frame_data].
all_data_b : float array
Trajectory data from the second ensemble. Format: [frames,frame_data].
verbose : bool, default=True
Print intermediate results.
override_name_check : bool, default=False
Only check number of features, not their names.
Returns
-------
Each function returns one value.
average_kss : float
Average Kolmogorov-Smirnov statistic for two distributions.
max_kss : float
Maximum Kolmogorov-Smirnov statistic for two distributions.
average_ksp : float
Average Kolmogorov-Smirnov p-value for two distributions.
max_ksp : float
Maximum Kolmogorov-Smirnov statistic for two distributions.
min_ksp : float
Minimum Kolmogorov-Smirnov statistic for two distributions.
"""


def average_kss(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False):
_, data_kss, _ = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check)
return np.mean(data_kss)


def max_kss(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False):
_, data_kss, _ = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check)
return np.max(data_kss)

def average_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False):
_, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check)
return np.mean(data_ksp)


def max_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False):
_, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check)
return np.max(data_ksp)

def min_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False):
_, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check)
return np.min(data_ksp)

"""
Calculates average and maximum State Specific Information statistic for a feature across two ensembles. Each of two functions uses the ssi_ensemble_analysis function with the same parameters.
Parameters
----------
features_a : list of str
Feature names of the first ensemble.
features_b : list of str
Feature names of the first ensemble.
Must be the same as features_a. Provided as a sanity check.
all_data_a : float array
Trajectory data from the first ensemble. Format: [frames,frame_data].
all_data_b : float array
Trajectory data from the second ensemble. Format: [frames,frame_data].
torsions : str
Torsion angles to use for SSI, including backbone - 'bb', and sidechain - 'sc'.
Default is None.
pocket_occupancy : bool, optional
Set to 'True' if the data input is pocket occupancy distribution.
The default is None.
pbc : bool, optional
If true, the apply periodic bounary corrections on angular distribution inputs.
The input for periodic correction must be radians. The default is True.
verbose : bool, default=True
Print intermediate results.
write_plots : bool, optional
If true, visualise the states over the raw distribution. The default is None.
override_name_check : bool, default=False
Only check number of features, not their names.
Returns
-------
Each function returns one value.
average_ssi : float
Average of State Specific Information for a feature across two ensembles.
max_ssi : float
Maximum of State Specific Information for a feature across two ensembles.
"""

def average_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, pocket_occupancy=None, pbc=True, verbose=True, write_plots=None, override_name_check=False):
_, data_ssi = ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, torsions=torsions, pocket_occupancy=pocket_occupancy, pbc=pbc, verbose=verbose, write_plots=write_plots, override_name_check=override_name_check)
return np.mean(data_ssi)


def max_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, pocket_occupancy=None, pbs=True, verbose=True, write_plots=None, override_name_check=False):
_, data_ssi = ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, torsions=torsions, pocket_occupancy=pocket_occupancy, pbc=pbc, verbose=verbose, write_plots=write_plots, override_name_check=override_name_check)
return np.max(data_ssi)


"""
Calculates the relative sampling efficiency of test data based on reference data.
Parameters
----------
ref_data : float array
Trajectory data from the reference ensemble. Format: [frames,frame_data].
test_data : float array
Trajectory data from the test ensemble. Format: [frames,frame_data].
num_pc : int
Number of principal components used.
Returns
-------
pca_se : float
Sampling efficiency of test data based on reference data.
"""

def pca_sampling_efficiency(ref_data, test_data, num_pc=2):
pca = calculate_pca(ref_data)

_, ref_components = get_components_pca(ref_data, num_pc, pca=pca)
_, test_components = get_components_pca(test_data, num_pc, pca=pca)

ref_var = np.var(ref_components, axis=0)
test_var = np.var(test_components, axis=0)

ref_vol = np.prod(ref_var)
test_vol = np.prod(test_var)

pca_se = test_vol / ref_vol

return pca_se








Loading

0 comments on commit 9269dfd

Please sign in to comment.