diff --git a/.gitignore b/.gitignore
index b575318ad..2df95e351 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,4 @@ Untitled*.ipynb
# generated from MANIFEST.in
MANIFEST
doc/source/api/generated/
+.ipynb_checkpoints
diff --git a/.project b/.project
index f01638c0e..a15375e53 100644
--- a/.project
+++ b/.project
@@ -10,6 +10,16 @@
+	<buildCommand>
+		<name>org.eclipse.ui.externaltools.ExternalToolBuilder</name>
+		<triggers>full,incremental,</triggers>
+		<arguments>
+			<dictionary>
+				<key>LaunchConfigHandle</key>
+				<value>&lt;project&gt;/.externalToolBuilders/build_extensions.launch</value>
+			</dictionary>
+		</arguments>
+	</buildCommand>
 	<nature>org.python.pydev.pythonNature</nature>
diff --git a/.travis.yml b/.travis.yml
index eae823a4a..6331f59ef 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,9 +6,9 @@ python:
env:
global:
- PATH=$HOME/miniconda/bin:$HOME/miniconda/envs/ci/bin/:$PATH
- - common_py_deps="pip nose cython jpype1"
+ - common_py_deps="pip nose cython jpype1 mdtraj scikit-learn"
- doc_deps="ipython sphinx matplotlib numpydoc pyzmq"
- - deps="scipy=0.11 numpy=1.7"
+ - deps="scipy numpy"
matrix:
secure: "byk9bmnGvP3qDfpYvPKX4909KeS6pJQtfW+GkSsuHy4vnp++gu2IsXC/CJeCB0r7hpoRp7Z+XlOtYmJLvb585LZmGaqIs5LKs1DimJniAg5anpywOnaXodspeFcz6UWtLlIAQQS3SvP9SXvvrlIiF8IwXqWfEewtoGbpiCj3dEo="
# - deps="scipy=0.11 numpy=1.7 cython"
@@ -23,7 +23,7 @@ before_install:
- deactivate # travis python venv
- wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O mc.sh -o /dev/null
- bash mc.sh -b
-- conda create -q --yes -n ci -c https://conda.binstar.org/marscher
+- conda create -q --yes -n ci -c https://conda.binstar.org/omnia
python=$TRAVIS_PYTHON_VERSION $deps $common_py_deps
- source activate ci
- python -c "import scipy; print scipy.__version__; print scipy.__numpy_version__"
@@ -33,6 +33,8 @@ install:
- python setup.py install
script:
+- printf "[Java]\nstartup=True" > ~/pyemma.cfg
+- cat ~/pyemma.cfg
- python setup.py test
after_success:
diff --git a/doc/source/api/coordinates.transform.tica.rst b/doc/source/api/coordinates.transform.tica.rst
deleted file mode 100644
index cbcaf38d6..000000000
--- a/doc/source/api/coordinates.transform.tica.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. automodule:: pyemma.coordinates.transform.tica
-
-.. toctree::
- :maxdepth: 1
-
diff --git a/doc/source/api/index.rst b/doc/source/api/index.rst
index 5b8198aa3..e3fc01730 100644
--- a/doc/source/api/index.rst
+++ b/doc/source/api/index.rst
@@ -2,8 +2,9 @@
Coordinates
===========
-The *coordinates* package implements common transformations used in
-Markov state modeling, like RMSD, TICA etc.
+The *coordinates* package contains tools to select features from MD trajectories
+and to assign them to a discrete state space, which is later used in Markov
+modeling.
.. toctree::
:maxdepth: 1
@@ -11,7 +12,6 @@ Markov state modeling, like RMSD, TICA etc.
coordinates.io
coordinates.transform
coordinates.clustering
- coordinates.tica
Markov State Models
===================
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 973a0fa16..f5ab96dc9 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -41,6 +41,17 @@
#'sphinx.ext.viewcode'
]
+# for edit on github button
+html_context = {
+ 'github_user': 'markovmodel',
+ 'display_github': True,
+ 'github_repo': 'PyEMMA',
+ 'github_version': 'devel',
+ 'conf_py_path': '/doc/',
+ 'html_logo': 'logo-200px.png',
+ 'base_url': 'http://pythonhosted.org/pyEMMA'
+}
+
# Add any paths that contain templates here, relative to this directory.
#templates_path = ['_templates']
@@ -286,11 +297,13 @@
# -----------------------------------------------------------------------------
import glob
-autosummary_generate = glob.glob("*.rst")
-autosummary_generate += glob.glob("api/*.rst")
+
+autosummary_generate = True
+autodoc_default_flags = ['members', 'inherited-members']
# see https://github.com/numpy/numpydoc/issues/5
numpydoc_class_members_toctree = False
+numpydoc_show_class_members = False
# intersphinx for linking to other api's
intersphinx_mapping = {
@@ -298,8 +311,8 @@
'http://docs.scipy.org/doc/numpy': None,
'http://docs.scipy.org/doc/scipy/reference/': None,
'http://matplotlib.sourceforge.net/': None,
+ 'http://mdtraj.org/latest/': None,
}
-#autodoc_default_flags = []
# todo list extension
todo_include_todos = True
diff --git a/meta.yaml b/meta.yaml
deleted file mode 100644
index 5efdd1c29..000000000
--- a/meta.yaml
+++ /dev/null
@@ -1,39 +0,0 @@
-package:
- name: pyemma
- version: !!str 1.0.1
-
-source:
- path: .
-build:
- preserve_egg_dir: True
- script: python setup.py install
-
-requirements:
- build:
- - python
- - setuptools
-# not needed actually, since we're deploying pre-cythonized sources
- - cython >=0.20
- - numpy >=1.6.0
- - scipy >=0.11
- - pystallone >=1.0.0b3
-
- run:
- - python
- - setuptools
- - numpy >=1.6.0
- - scipy >=0.11
- - pystallone >=1.0.0b3
- - argparse # [py26]
-
-test:
- imports:
- - pyemma
- commands:
- - mm_cluster -h
-about:
- home: http://github.com/markovmodel/PyEMMA
- license: BSD 2-Clause License
- summary: "EMMA: Emma's Markov Model Algorithms"
-
-
diff --git a/pyemma/__init__.py b/pyemma/__init__.py
index 837c87c7c..1d2a8dac9 100644
--- a/pyemma/__init__.py
+++ b/pyemma/__init__.py
@@ -1,7 +1,7 @@
r"""
-==========================================
- Emma2 - Emma's Markov Model Algorithms
-==========================================
+=======================================
+PyEMMA - Emma's Markov Model Algorithms
+=======================================
"""
from . import coordinates
from . import msm
@@ -9,4 +9,4 @@
from ._version import get_versions
__version__ = get_versions()['version']
-del get_versions
+del get_versions
\ No newline at end of file
diff --git a/pyemma/coordinates/__init__.py b/pyemma/coordinates/__init__.py
index 3fbbd3dc6..2bf65f4e3 100644
--- a/pyemma/coordinates/__init__.py
+++ b/pyemma/coordinates/__init__.py
@@ -12,6 +12,4 @@
:toctree: generated/
"""
-from . import transform
-from . import io
-from . import clustering
+from .api import *
\ No newline at end of file
diff --git a/pyemma/coordinates/acf.py b/pyemma/coordinates/acf.py
new file mode 100644
index 000000000..2591296d1
--- /dev/null
+++ b/pyemma/coordinates/acf.py
@@ -0,0 +1,106 @@
+import numpy as np
+import sys
+
+__author__ = 'Fabian Paul'
+__all__ = ['acf']
+
+
+def acf(trajs, stride=1, max_lag=None, subtract_mean=True, normalize=True, mean=None):
+ '''Computes the (combined) autocorrelation function of multiple trajectories.
+
+ Parameters
+ ----------
+ trajs : list of (*,N) ndarrays
+ the observable trajectories, N is the number of observables
+ stride : int (default = 1)
+        only use every stride-th frame of trajs
+ max_lag : int (default = maximum trajectory length / stride)
+ only compute acf up to this lag time
+ subtract_mean : bool (default = True)
+ subtract trajectory mean before computing acfs
+ normalize : bool (default = True)
+        divide acf by the variance such that acf[0,:]==1
+ mean : (N) ndarray (optional)
+        if subtract_mean is True, you can give the trajectory mean
+        so this function doesn't have to compute it again
+
+ Returns
+ -------
+ acf : (max_lag,N) ndarray
+ autocorrelation functions for all observables
+
+ Note
+ ----
+    The computation uses FFT (with zero-padding) and is done in memory (RAM).
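+
+    Examples
+    --------
+    A minimal sketch with a single, illustrative scalar observable trajectory:
+
+    >>> t = np.sin(np.linspace(0, 20, 1000))
+    >>> a = acf(t, max_lag=50)
+    >>> a.shape
+    (50, 1)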
+ '''
+ if not isinstance(trajs, list):
+ trajs = [trajs]
+
+ mytrajs = [None] * len(trajs)
+ for i in xrange(len(trajs)):
+ if trajs[i].ndim == 1:
+ mytrajs[i] = trajs[i].reshape((trajs[i].shape[0], 1))
+ elif trajs[i].ndim == 2:
+ mytrajs[i] = trajs[i]
+ else:
+ raise Exception(
+ 'Unexpected number of dimensions in trajectory number %d' % i)
+ trajs = mytrajs
+
+ assert stride > 0, 'stride must be > 0'
+ assert max_lag is None or max_lag > 0, 'max_lag must be > 0'
+
+ if subtract_mean and mean is None:
+ # compute mean over all trajectories
+ mean = trajs[0].sum(axis=0)
+ n_samples = trajs[0].shape[0]
+ for i, traj in enumerate(trajs[1:]):
+ if traj.shape[1] != mean.shape[0]:
+                raise Exception(('number of order parameters in trajectory number %d differs ' +
+                                 'from the number found in previous trajectories.') % (i + 1))
+ mean += traj.sum(axis=0)
+ n_samples += traj.shape[0]
+ mean /= n_samples
+
+ acf = np.array([[]])
+ # number of samples for every tau
+ N = np.array([])
+
+ for i, traj in enumerate(trajs):
+        data = traj[::stride]
+        if subtract_mean:
+            # use a copy to avoid modifying the caller's array in place
+            data = data - mean
+ # calc acfs
+ l = data.shape[0]
+ fft = np.fft.fft(data, n=2 ** int(np.ceil(np.log2(l * 2 - 1))), axis=0)
+ acftraj = np.fft.ifft(fft * np.conjugate(fft), axis=0).real
+ # throw away acf data for long lag times (and negative lag times)
+ if max_lag and max_lag < l:
+ acftraj = acftraj[:max_lag, :]
+ else:
+ acftraj = acftraj[:l, :]
+ if max_lag:
+ sys.stderr.write(
+ 'Warning: trajectory number %d is shorter than maximum lag.\n' % i)
+ # find number of samples used for every lag
+ Ntraj = np.linspace(l, l - acftraj.shape[0] + 1, acftraj.shape[0])
+ # adapt shape of acf: resize temporal dimension, additionally set
+ # number of order parameters of acf in the first step
+ if acf.shape[1] < acftraj.shape[1] and acf.shape[1] > 0:
+            raise Exception(('number of order parameters in trajectory number %d differs ' +
+                             'from the number found in previous trajectories.') % i)
+ if acf.shape[1] < acftraj.shape[1] or acf.shape[0] < acftraj.shape[0]:
+ acf.resize(acftraj.shape)
+ N.resize(acftraj.shape[0])
+ # update acf and number of samples
+ acf[0:acftraj.shape[0], :] += acftraj
+ N[0:acftraj.shape[0]] += Ntraj
+
+ # divide by number of samples
+ acf = np.transpose(np.transpose(acf) / N)
+
+ # normalize acfs
+ if normalize:
+ acf /= acf[0, :].copy()
+
+ return acf
diff --git a/pyemma/coordinates/api.py b/pyemma/coordinates/api.py
new file mode 100644
index 000000000..935144b28
--- /dev/null
+++ b/pyemma/coordinates/api.py
@@ -0,0 +1,211 @@
+"""
+API for coordinates package
+"""
+__author__ = 'noe'
+
+from discretizer import Discretizer
+# io
+from io.feature_reader import FeatureReader
+from io.data_in_memory import DataInMemory
+# transforms
+from transform.pca import PCA
+from transform.tica import TICA
+# clustering
+from clustering.kmeans import KmeansClustering
+from clustering.uniform_time import UniformTimeClustering
+from clustering.regspace import RegularSpaceClustering
+
+__all__ = ['discretizer',
+ 'feature_reader',
+ 'tica',
+ 'pca',
+ 'kmeans',
+ 'regspace',
+ 'uniform_time',
+ ]
+
+
+def discretizer(reader,
+ transform=None,
+ cluster=KmeansClustering(n_clusters=100)):
+ """
+ Constructs a discretizer
+
+
+ Parameters
+ ----------
+
+ reader : instance of FeatureReader
+ get input data from a FeatureReader
+
+ transform : instance of Transformer
+ an optional transform like PCA/TICA etc.
+
+ cluster : instance of Transformer
+ a cluster algorithm to discretize transformed data
+
+
+ Examples
+ --------
+
+ >>> reader = feature_reader(['traj01.xtc'], 'topology.pdb')
+ >>> transform = pca(dim=2)
+    >>> cluster = uniform_time(k=100)
+ >>> disc = discretizer(reader, transform, cluster)
+
+ """
+ return Discretizer(reader, transform, cluster)
+
+#==============================================================================
+#
+# READERS
+#
+#==============================================================================
+
+
+def feature_reader(trajfiles, topfile):
+ """
+ Constructs a feature reader
+
+    :param trajfiles:
+        list of paths to trajectory files
+    :param topfile:
+        path to topology file (e.g. pdb)
+    :return:
+        a FeatureReader object
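+
+    Example (file names are placeholders):
+
+    >>> reader = feature_reader(['traj01.xtc', 'traj02.xtc'], 'topology.pdb')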
+ """
+ return FeatureReader(trajfiles, topfile)
+
+
+def memory_reader(data):
+ """
+ Constructs a reader from an in-memory ndarray
+
+ :param data: (N,d) ndarray with N frames of d dimensions
+    :return:
+        a DataInMemory object
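+
+    Example with illustrative data:
+
+    >>> import numpy as np
+    >>> reader = memory_reader(np.random.random((100, 3)))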
+ """
+ return DataInMemory(data)
+
+
+#=========================================================================
+#
+# TRANSFORMATION ALGORITHMS
+#
+#=========================================================================
+
+
+def pca(data=None, dim=2):
+ """
+ Constructs a PCA object
+
+ :param data:
+ ndarray with the data, if available. When given, the PCA is immediately parametrized
+ :param dim:
+ the number of dimensions to project onto
+
+ :return:
+ a PCA transformation object
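+
+    Example, with illustrative in-memory data:
+
+    >>> import numpy as np
+    >>> p = pca(data=np.random.random((1000, 3)), dim=2)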
+ """
+ res = PCA(dim)
+ if data is not None:
+ inp = DataInMemory(data)
+ res.data_producer = inp
+ res.parametrize()
+ return res
+
+
+def tica(data=None, lag=10, dim=2):
+ """
+ Constructs a TICA object
+
+ Parameters
+ ----------
+ data : ndarray
+ array with the data, if available. When given, the TICA transformation
+ is immediately parametrized.
+ lag : int
+ the lag time, in multiples of the input time step
+ dim : int
+ the number of dimensions to project onto
+
+ Returns
+ -------
+ tica : a TICA transformation object
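+
+    Examples
+    --------
+    A sketch with illustrative in-memory data:
+
+    >>> import numpy as np
+    >>> t = tica(data=np.random.random((1000, 3)), lag=10, dim=2)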
+ """
+ res = TICA(lag, dim)
+ if data is not None:
+ inp = DataInMemory(data)
+ res.data_producer = inp
+ res.parametrize()
+ return res
+
+
+#=========================================================================
+#
+# CLUSTERING ALGORITHMS
+#
+#=========================================================================
+
+def kmeans(data=None, k=100, max_iter=1000):
+ """
+ Constructs a k-means clustering
+
+ Parameters
+ ----------
+ data: ndarray
+ input data, if available in memory
+    k: int
+        the number of cluster centers
+    max_iter: int
+        maximum number of k-means iterations
+
+ Returns
+ -------
+ kmeans : A KmeansClustering object
+
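+    Examples
+    --------
+    Illustrative, with random in-memory data:
+
+    >>> import numpy as np
+    >>> cl = kmeans(data=np.random.random((1000, 2)), k=10)
+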
+ """
+ res = KmeansClustering(n_clusters=k, max_iter=max_iter)
+ if data is not None:
+ inp = DataInMemory(data)
+ res.data_producer = inp
+ res.parametrize()
+ return res
+
+
+def uniform_time(data=None, k=100):
+ """
+ Constructs a uniform time clustering
+
+ :param data:
+ input data, if available in memory
+ :param k:
+ the number of cluster centers
+
+ :return:
+ A UniformTimeClustering object
+
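+    Example (illustrative):
+
+    >>> import numpy as np
+    >>> cl = uniform_time(data=np.random.random((1000, 2)), k=20)
+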
+ """
+ res = UniformTimeClustering(k)
+ if data is not None:
+ inp = DataInMemory(data)
+ res.data_producer = inp
+ res.parametrize()
+ return res
+
+
+def regspace(data=None, dmin=-1):
+ """
+ Constructs a regular space clustering
+
+ :param dmin:
+ the minimal distance between cluster centers
+ :param data:
+ input data, if available in memory
+
+ :return:
+ A RegularSpaceClustering object
+
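+    Example (illustrative; dmin is in the units of your data):
+
+    >>> import numpy as np
+    >>> cl = regspace(data=np.random.random((1000, 2)), dmin=0.3)
+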
+ """
+ if dmin == -1:
+ raise ValueError("provide a minimum distance for clustering")
+ res = RegularSpaceClustering(dmin)
+ if data is not None:
+ inp = DataInMemory(data)
+ res.data_producer = inp
+ res.parametrize()
+ return res
diff --git a/pyemma/coordinates/clustering/__init__.py b/pyemma/coordinates/clustering/__init__.py
index 9d4d20950..96f7493f4 100644
--- a/pyemma/coordinates/clustering/__init__.py
+++ b/pyemma/coordinates/clustering/__init__.py
@@ -1,20 +1,20 @@
r"""
-=======================================================================================
-clustering - java algorithms for data clustering (:mod:`pyemma.coordinates.clustering`)
-=======================================================================================
+===============================================================================
+clustering - Clustering algorithms (:mod:`pyemma.coordinates.clustering`)
+===============================================================================
-.. currentmodule:: pyemma.coordinates.clustering
-
-clustering
-==========
+.. currentmodule:: pyemma.coordinates.clustering
.. autosummary::
- :toctree: generated/
-
- kmeans
- regspace
- assign
+    :toctree: generated/
+
+ AssignCenters
+ KmeansClustering
+ RegularSpaceClustering
+ UniformTimeClustering
"""
-from .api import *
+from .assign import AssignCenters
+from .kmeans import KmeansClustering
+from .regspace import RegularSpaceClustering
+from .uniform_time import UniformTimeClustering
diff --git a/pyemma/coordinates/clustering/api.py b/pyemma/coordinates/clustering/api.py
deleted file mode 100644
index 596aa37a2..000000000
--- a/pyemma/coordinates/clustering/api.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# coding=utf-8
-r"""
-================================
-Clustering Coordinates API
-================================
-
-"""
-
-__docformat__ = "restructuredtext en"
-
-from pyemma.util.pystallone import jarray
-from pyemma.util import pystallone as stallone
-from . import clustering
-
-# shortcuts
-intseqNew = stallone.API.intseqNew
-intseq = stallone.API.intseq
-dataNew = stallone.API.dataNew
-data = stallone.API.data
-clusterNew = stallone.API.clusterNew
-cluster = stallone.API.cluster
-#
-Clustering = clustering.Clustering
-
-__author__ = "Martin Scherer, Frank Noe"
-__copyright__ = "Copyright 2014, Computational Molecular Biology Group, FU-Berlin"
-__credits__ = ["Martin Scherer", "Frank Noe"]
-__license__ = "FreeBSD"
-__version__ = "2.0.0"
-__maintainer__ = "Martin Scherer"
-__email__="m.scherer AT fu-berlin DOT de"
-
-__all__ = ['kmeans', 'regspace', 'assign']
-
-def kmeans(infiles, k, maxiter = 100):
- r"""Performs k-means in the input files using k clusters. Uses
- Euclidean metric
-
- Parameters
- ----------
- k : int
- the number of clusters
- maxiter : int
- the maximum number of k-means iterations
-
- Returns
- -------
- Clustering : A clustering object
-
- """
- if not isinstance(infiles, basestring):
- arr = jarray(infiles)
- infiles = stallone.API.str.toList(arr)
- input = dataNew.dataInput(infiles)
- return Clustering(cluster.kmeans(input, k, maxiter))
-
-
-def regspace(infiles, mindist, metric='Euclidean'):
- r"""Regular space clustering.
-
- Performs regular space clustering on the input files using the
- given minimal distance. Regspace clustering defines the first data
- point to be the first cluster center. The next cluster center is
- defined by the next data point whose minimal distance of all
- existing data points is at least mindist, and so on. The number of
- clusters is thus a function of mindist and the data and is hard to
- predict. If you want to have approximately uniformly spaced
- cluster centers with a controllable number of cluster centers, use
- kregspace.
-
- Returns
- -------
- Clustering : A clustering object
-
- References
- ----------
- J.-H. Prinz, H. Wu, M. Sarich, B. Keller, M. Senne, M. Held, J.D. Chodera,
- Ch. Schuette and F. Noe:
- Markov models of molecular kinetics: Generation and Validation.
- J. Chem. Phys. 134, 174105 (2011).
- """
- if not isinstance(infiles, basestring):
- arr = jarray(infiles)
- infiles = stallone.API.str.toList(arr)
- datainput = dataNew.dataInput(infiles)
- mindist = 1.0*mindist
- dim = datainput.dimension()
- # get metric
- if (str(metric).lower().startswith('euclid')):
- jmetric = clusterNew.metric(clusterNew.METRIC_EUCLIDEAN, dim)
- elif (str(metric).lower().endswith('rmsd')):
- jmetric = clusterNew.metric(clusterNew.METRIC_MINRMSD, dim/3)
- else:
- raise ValueError("Metric "+str(metric)+" not recognized. Use Euclidean or minRMSD")
- # do clustering
- jregspace = cluster.regspace(datainput, jmetric, mindist)
- return Clustering(jregspace)
-
-
-def kregspace(infiles, k):
- """
- Performs regular space clustering on the input files with (approximately)
- fixed number of clusters
- """
- raise NotImplementedError
-
-
-def assign(infiles, clustering, outfiles=None, return_discretization=True):
- r"""Assigns all input trajectories to discrete trajectories using
- the specified discretizer.
-
- Parameters
- ----------
- infiles : string or list of strings
- trajectory file names
- clustering : Clustering or an IDiscretization instance
- the clustering object used for discretizing the data
- outfiles : string or list of strings
- discrete trajectory file names. Will only be written if requested
- return_discretization : bool
- if true (default), will return the discrete trajectories.
- """
- # check input
- if (isinstance(clustering, Clustering)):
- idisc = clustering._jclustering
- elif isinstance(clustering, stallone.stallone.api.discretization.IDiscretization):
- idisc = clustering
- else:
- raise AttributeError("clustering is not an instance of Clustering or"
- " stallone.api.discretization.IClustering!")
- if (isinstance(infiles, str) and isinstance(outfiles, str)):
- infiles = [infiles]
- outfiles = [outfiles]
- singlefile = True
- elif (isinstance(infiles, list) and isinstance(outfiles, list)):
- singlefile = False
- else:
- raise AttributeError("input/output files must be either single filenames"
- " of equally sized lists of filenames, but not a mix.")
-
- # load input
- datainput = dataNew.dataInput(stallone.list_to_java_list(infiles))
- nseq = datainput.numberOfSequences()
- # assign data
- res = []
-
- for i in xrange(nseq):
- seq = datainput.getSequence(i)
- jdtraj = cluster.discretize(seq, idisc)
-
- # write to file if requested
- if (outfiles is not None):
- intseq.writeIntSequence(jdtraj, outfiles[i])
- # store return data if requested
- if (return_discretization):
- dtraj = stallone.stallone_array_to_ndarray(jdtraj)
- res.append(dtraj)
- # return discrete trajectories if requested
- if (return_discretization):
- if singlefile:
- return res[0]
- else:
- return res
diff --git a/pyemma/coordinates/clustering/assign.py b/pyemma/coordinates/clustering/assign.py
new file mode 100644
index 000000000..62de35890
--- /dev/null
+++ b/pyemma/coordinates/clustering/assign.py
@@ -0,0 +1,55 @@
+'''
+Created on 18.02.2015
+
+@author: marscher
+'''
+from pyemma.coordinates.clustering.interface import AbstractClustering
+from pyemma.msm.io import read_matrix
+import numpy as np
+
+
+class AssignCenters(AbstractClustering):
+
+ """Assigns given (precalculated) cluster centers. If you already have
+ cluster centers from somewhere, you use this class to assign your data to it.
+
+ Parameters
+ ----------
+ clustercenters : path to file (csv) or ndarray
+ cluster centers to use in assignment of data
+
+ Examples
+ --------
+ Assuming you have stored your centers in a CSV file:
+
+ >>> from pyemma.coordinates.clustering import AssignCenters
+ >>> from pyemma.coordinates import discretizer
+ >>> reader = ...
+ >>> assign = AssignCenters('my_centers.dat')
+ >>> disc = discretizer(reader, cluster=assign)
+ >>> disc.run()
+
+ """
+
+ def __init__(self, clustercenters):
+ super(AssignCenters, self).__init__()
+
+        if isinstance(clustercenters, str):
+            clustercenters = read_matrix(clustercenters)
+
+ self.clustercenters = clustercenters
+ assert isinstance(self.clustercenters, np.ndarray)
+
+ def param_add_data(self, X, itraj, t, first_chunk, last_chunk_in_traj,
+ last_chunk, ipass, Y=None):
+ # discretize all
+ if t == 0:
+ n = self.data_producer.trajectory_length(itraj)
+ self.dtrajs.append(np.empty(n, dtype=int))
+ L = np.shape(X)[0]
+ # TODO: optimize: assign one chunk at once
+ for i in xrange(L):
+ self.dtrajs[itraj][i + t] = self.map(X[i])
+
+ if last_chunk:
+ return True
diff --git a/pyemma/coordinates/clustering/clustering.py b/pyemma/coordinates/clustering/clustering.py
deleted file mode 100644
index 6b0c608e2..000000000
--- a/pyemma/coordinates/clustering/clustering.py
+++ /dev/null
@@ -1,63 +0,0 @@
-'''
-Created on Jan 5, 2014
-
-@author: noe
-'''
-
-import numpy as np
-import pyemma.util.pystallone as stallone
-
-
-class Clustering(object):
- r"""Wrapper to stallone clustering.
-
- """
-
- def __init__(self, jclustering):
- self._jclustering = jclustering
-
- def __jclustering(self):
- return self._jclustering
-
- def nclusters(self):
- r"""Returns the number of clusters.
-
- """
- return self._jclustering.getNumberOfClusters()
-
- def clustercenter(self, i):
- """
- returns the cluster centers
- """
- jclustercenters = self._jclustering.getClusterCenter(i)
- return stallone.stallone_array_to_ndarray(jclustercenters)
-
- def clustercenters(self):
- r"""Returns the cluster centers.
-
- """
- x0 = self.clustercenter(0)
- nc = self.nclusters()
- centers = np.ndarray(tuple([nc]) + np.shape(x0))
- for i in range(0,nc):
- centers[i] = self.clustercenter(i)
- return centers
-
- def clusters(self):
- r"""Returns the cluster indexes of the input data set.
-
- """
- jindexes = self._jclustering.getClusterIndexes()
- return stallone.stallone_array_to_ndarray(jindexes)
-
- def assign(self, X):
- r"""Assigns point X to a cluster and returns its index.
-
- Parameters
- ----------
- X : numpy ndarray
- coordinate set to be assigned
-
- """
- jX = stallone.ndarray_to_stallone_array(X)
- return self._jclustering.assign(jX)
diff --git a/pyemma/coordinates/clustering/interface.py b/pyemma/coordinates/clustering/interface.py
new file mode 100644
index 000000000..c4728c91c
--- /dev/null
+++ b/pyemma/coordinates/clustering/interface.py
@@ -0,0 +1,87 @@
+'''
+Created on 18.02.2015
+
+@author: marscher
+'''
+from pyemma.coordinates.transform.transformer import Transformer
+from pyemma.util.log import getLogger
+import numpy as np
+
+log = getLogger('Clustering')
+
+
+class AbstractClustering(Transformer):
+
+ """
+    Provides a common interface for clustering algorithms.
+ """
+
+ def __init__(self):
+ super(AbstractClustering, self).__init__()
+ self.clustercenters = None
+ self.dtrajs = []
+
+ def map(self, x):
+ """get closest index of point in :attr:`clustercenters` to x."""
+ d = self.data_producer.distances(x, self.clustercenters)
+ return np.argmin(d)
+
+ def save_dtrajs(self, trajfiles=None, prefix='', output_format='ascii',
+ extension='.dtraj'):
+ """saves calculated discrete trajectories. Filenames are taken from
+ given reader. If data comes from memory dtrajs are written to a default
+ filename.
+
+
+ Parameters
+ ----------
+ trajfiles : list of str (optional)
+            names of input trajectory files; used to generate output file names.
+ prefix : str
+ prepend prefix to filenames.
+
+ output_format : str
+ if format is 'ascii' dtrajs will be written as csv files, otherwise
+ they will be written as NumPy .npy files.
+
+ extension : str
+            file extension to append (e.g. '.itraj')
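+
+        Example (illustrative):
+
+        >>> clustering.save_dtrajs(['traj01.xtc'], prefix='run1')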
+ """
+ if extension[0] != '.':
+ extension = '.' + extension
+
+ # obtain filenames from input (if possible, reader is a featurereader)
+ if output_format == 'ascii':
+ from pyemma.msm.io import write_discrete_trajectory as write_dtraj
+ else:
+ from pyemma.msm.io import save_discrete_trajectory as write_dtraj
+ import os.path as path
+
+ dtrajs = self.dtrajs # clustering.dtrajs
+ output_files = []
+
+ if trajfiles is not None: # have filenames available?
+ for f in trajfiles:
+ p, n = path.split(f) # path and file
+ basename, _ = path.splitext(n)
+ name = path.join(p, prefix + '_' + basename + extension)
+ output_files.append(name)
+ else:
+            for i in xrange(len(dtrajs)):
+                if prefix != '':
+                    name = prefix + '_' + str(i) + extension
+                else:
+                    name = str(i) + extension
+ output_files.append(name)
+
+ assert len(dtrajs) == len(output_files)
+
+ for filename, dtraj in zip(output_files, dtrajs):
+ try:
+                if path.exists(filename):
+ # TODO: decide what to do if file already exists.
+ pass
+ write_dtraj(filename, dtraj)
+ except IOError:
+                log.exception(
+                    "Exception during writing dtraj to '%s'" % filename)
diff --git a/pyemma/coordinates/clustering/kmeans.py b/pyemma/coordinates/clustering/kmeans.py
new file mode 100644
index 000000000..5206043ef
--- /dev/null
+++ b/pyemma/coordinates/clustering/kmeans.py
@@ -0,0 +1,87 @@
+'''
+Created on 22.01.2015
+
+@author: marscher
+'''
+import numpy as np
+from sklearn.cluster import MiniBatchKMeans
+
+from pyemma.util.log import getLogger
+from pyemma.util.annotators import doc_inherit
+from pyemma.coordinates.clustering.interface import AbstractClustering
+
+log = getLogger('KmeansClustering')
+
+__all__ = ['KmeansClustering']
+
+
+class KmeansClustering(AbstractClustering):
+
+ r"""
+ Kmeans clustering
+
+ Parameters
+ ----------
+ n_clusters : int
+        the number of cluster centers
+ max_iter : int
+        maximum number of k-means iterations per chunk
+
+ """
+
+ def __init__(self, n_clusters, max_iter=1000):
+ super(KmeansClustering, self).__init__()
+ self.n_clusters = n_clusters
+ # TODO: if we do not set a random_state here (eg. a forced seed) we get slightly different cluster centers each run
+ self.algo = MiniBatchKMeans(n_clusters,
+ max_iter=max_iter,
+ batch_size=self.chunksize,
+ )
+
+ @doc_inherit
+ def describe(self):
+ return "[Kmeans, k=%i]" % self.n_clusters
+
+ def dimension(self):
+ return 1
+
+ @doc_inherit
+ def get_memory_per_frame(self):
+ return 1
+
+ @doc_inherit
+ def get_constant_memory(self):
+ return 1
+
+ def _ensure2d(self, X):
+ X = X.reshape((-1, 1))
+ return X
+
+ def param_add_data(self, X, itraj, t, first_chunk, last_chunk_in_traj,
+ last_chunk, ipass, Y=None):
+ if X.ndim == 1:
+ X = self._ensure2d(X)
+
+ if ipass == 0:
+ self.algo.partial_fit(X)
+ if ipass == 1:
+ # discretize all
+ if t == 0:
+ n = self.data_producer.trajectory_length(itraj)
+ self.dtrajs.append(np.empty(n, dtype=int))
+            assignment = self.algo.predict(X)
+            self.dtrajs[itraj][t:t + len(assignment)] = assignment
+
+ if last_chunk:
+ return True
+
+ @doc_inherit
+ def param_finish(self):
+ self.clustercenters = self.algo.cluster_centers_
+
+ @doc_inherit
+ def map(self, X):
+ if X.ndim == 1:
+ X = self._ensure2d(X)
+ d = self.algo.predict(X)
+ return d
diff --git a/pyemma/coordinates/clustering/regspace.py b/pyemma/coordinates/clustering/regspace.py
new file mode 100644
index 000000000..2e982ff10
--- /dev/null
+++ b/pyemma/coordinates/clustering/regspace.py
@@ -0,0 +1,114 @@
+'''
+Created on 26.01.2015
+
+@author: marscher
+'''
+
+from pyemma.util.log import getLogger
+from pyemma.util.annotators import doc_inherit
+from pyemma.coordinates.clustering.interface import AbstractClustering
+
+import numpy as np
+
+log = getLogger('RegSpaceClustering')
+__all__ = ['RegularSpaceClustering']
+
+
+class RegularSpaceClustering(AbstractClustering):
+
+ """Clusters data objects in such a way, that cluster centers are at least in
+ distance of dmin to each other according to the given metric.
+ The assignment of data objects to cluster centers is performed by
+ Voronoi paritioning. That means, that a data object is assigned to
+ that clusters center, which has the least distance [1] Senne et al.
+
+ Parameters
+ ----------
+ dmin : float
+ minimum distance between all clusters.
+
+ References
+ ----------
+ .. [1] Senne, Martin, et al. J. Chem Theory Comput. 8.7 (2012): 2223-2238
+
+ """
+
+ def __init__(self, dmin):
+ super(RegularSpaceClustering, self).__init__()
+
+ self.dmin = dmin
+ # temporary list to store cluster centers
+ self._clustercenters = []
+
+ @doc_inherit
+ def describe(self):
+ return "[RegularSpaceClustering dmin=%i]" % self.dmin
+
+ @doc_inherit
+ def map_to_memory(self):
+ # nothing to do, because memory-mapping of the discrete trajectories is
+ # done in parametrize
+ pass
+
+ def dimension(self):
+ return 1
+
+ @doc_inherit
+ def get_memory_per_frame(self):
+ # 4 bytes per frame for an integer index
+ return 4
+
+ @doc_inherit
+ def get_constant_memory(self):
+ # memory for cluster centers and discrete trajectories
+ return 4 * self.data_producer.dimension() + 4 * self.data_producer.n_frames_total()
+
+ def param_add_data(self, X, itraj, t, first_chunk, last_chunk_in_traj, last_chunk, ipass, Y=None):
+ """
+ first pass: calculate clustercenters
+ 1. choose first datapoint as centroid
+ 2. for all X: calc distances to all clustercenters
+ 3. add new centroid, if min(distance to all other clustercenters) >= dmin
+ second pass: assign data to discrete trajectories
+ """
+ log.debug("t=%i; itraj=%i" % (t, itraj))
+ if ipass == 0:
+ # add first point as first centroid
+ if first_chunk:
+ self._clustercenters.append(X[0])
+ log.info("Run regspace clustering with dmin=%f;"
+ " First centroid=%s" % (self.dmin, X[0]))
+ # TODO: optimize with cython, support different metrics
+ # see mixtape.libdistance
+ for x in X:
+ dist = np.fromiter((np.linalg.norm(x - c, 2)
+ for c in self._clustercenters), dtype=np.float32)
+
+ minIndex = np.argmin(dist)
+ if dist[minIndex] >= self.dmin:
+ self._clustercenters.append(x)
+
+ elif ipass == 1:
+ # discretize all
+ if t == 0:
+ assert len(self._clustercenters) >= 1
+ # create numpy array from clustercenters list
+ self.clustercenters = np.array(self._clustercenters)
+
+ log.debug("shape of clustercenters: %s" %
+ str(self.clustercenters.shape))
+ log.info("number of clustercenters: %i" %
+ len(self.clustercenters))
+ n = self.data_producer.trajectory_length(itraj)
+ self.dtrajs.append(np.empty(n, dtype=int))
+ L = np.shape(X)[0]
+ # TODO: optimize: assign one chunk at once
+ for i in xrange(L):
+ self.dtrajs[itraj][i + t] = self.map(X[i])
+ if last_chunk:
+ return True # finished!
+
+ return False
+
+ def param_finish(self):
+ del self._clustercenters # delete temporary
diff --git a/pyemma/coordinates/clustering/stalloneClustering.py b/pyemma/coordinates/clustering/stalloneClustering.py
deleted file mode 100644
index 006854f2c..000000000
--- a/pyemma/coordinates/clustering/stalloneClustering.py
+++ /dev/null
@@ -1,114 +0,0 @@
-'''
-Created on 20.11.2013
-
-@author: marscher
-'''
-from pyemma.util.pystallone import API, JavaException
-
-from pyemma.util.log import getLogger
-log = getLogger(__name__)
-
-__all__ = ['getDataSequenceLoader', 'getClusterAlgorithm', 'writeASCIIResults']
-
-
-def getDataSequenceLoader(files):
- r"""creates a stallone java instance of IDataSequenceLoader
- Parameters
- ----------
- files : list of strings
- list of filenames to wrap in loader
-
- Returns
- -------
- Instance of stallone.api.datasequence.IDataSequenceLoader
- """
- log.info('trying to build a SequenceLoader from this files: %s' % files)
- try:
- files = API.str.toList(files)
- return API.dataNew.multiSequenceLoader(files)
- except JavaException:
- log.exception('Java exception during file reading.')
- # reraise exception
- raise
- except Exception:
- log.exception('Unknown exception during file reading')
- raise
-
-
-def getClusterAlgorithm(data, size, **kwargs):
- """
- constructs the algorithm in stallone factory with given parameters
-
- Parameters
- ----------
- data : Stallone instance of IDataSequence
- algorithm : string
- algorithm to construct. Valid values are one of ['kcenter', 'kmeans',
- 'regularspatial']
- metric : string
- metric to use. Valid values are one of ['euclidian', 'minrmsd']
- k : int
- cluster centers in case of kcenter/kmeans
- dmin : float
- in case of regularspatical algorithm
- spacing : int
- in case of regulartemporal
- """
- log.debug("args to getClusterAlgorithm: %s" % kwargs)
- metric = kwargs['metric']
- algorithm = kwargs['algorithm']
-
- # TODO: ensure type data is either IDataInput or IDataSequence or else clusterfactory api methods will raise
- imetric = None
- if metric == 'euclidean':
- # TODO: set dimension of data
- imetric = API.clusterNew.metric(0, 0)
- elif metric == 'minrmsd':
- nrows = 0 # TODO: set to something useful
- imetric = API.clusterNew.metric(1, nrows)
- else:
- raise ValueError('no valid metric given')
-
- if algorithm == 'kcenter':
- k = kwargs['k']
- return API.clusterNew.kcenter(data, size, imetric, k)
- elif algorithm == 'density_based':
- k = kwargs['k']
- dmin = kwargs['dmin']
- if dmin:
- return API.clusterNew.densitybased(data, imetric, dmin, k)
- else:
- return API.clusterNew.densitybased(data, imetric, k)
- elif algorithm == 'kmeans':
- k = kwargs['k']
- maxIter = kwargs['maxiterations']
- if data is not None:
- return API.clusterNew.kmeans(data, size, imetric, k, maxIter)
- else:
- return API.clusterNew.kmeans(imetric, k, maxIter)
- elif algorithm == 'regularspatial':
- dmin = kwargs['dmin']
- return API.clusterNew.regspace(data, imetric, dmin)
- elif algorithm == 'regulartemporal':
- # TODO: copy impl from emma1 to stallone and map it here
- spacing = kwargs['spacing']
- raise NotImplemented('regular temporal not impled in stallone')
- else:
- raise ValueError('no valid algorithm (%s) given!')
-
-def writeASCIIResults(data, filename):
- writer = API.dataNew.writerASCII(filename,' ', '\n')
- writer.addAll(data)
-
-
-def checkFitIntoMemory(loader):
- from pyemma.util.pystallone import java
-
- mem_needed = loader.memorySizeTotal()
- mem_max = java.lang.Runtime.getRuntime().maxMemory() # in bytes
- log.info('max jvm memory: %s MB' % (mem_max / 1024**2))
- log.info('Memory needed for all data: %s MB' % (mem_needed / 1024**2))
- if mem_max - mem_needed > 0:
- return True
- else:
- return False
\ No newline at end of file
diff --git a/pyemma/coordinates/clustering/uniform_time.py b/pyemma/coordinates/clustering/uniform_time.py
new file mode 100644
index 000000000..979e89415
--- /dev/null
+++ b/pyemma/coordinates/clustering/uniform_time.py
@@ -0,0 +1,120 @@
+__author__ = 'noe'
+
+from pyemma.util.log import getLogger
+from pyemma.coordinates.clustering.interface import AbstractClustering
+
+import numpy as np
+
+log = getLogger('UniformTimeClustering')
+__all__ = ['UniformTimeClustering']
+
+
+class UniformTimeClustering(AbstractClustering):
+
+ """
+ Uniform time clustering
+
+ Parameters
+ ----------
+    k : int
+        the number of cluster centers
+ """
+
+ def __init__(self, k=2):
+ super(UniformTimeClustering, self).__init__()
+ self.k = k
+
+ def describe(self):
+ return "[Uniform time clustering, k = %i]" % self.k
+
+ def dimension(self):
+ return 1
+
+ def get_memory_per_frame(self):
+ """
+ Returns the memory requirements per frame, in bytes
+
+ :return:
+ """
+ # 4 bytes per frame for an integer index
+ return 0
+
+ def get_constant_memory(self):
+ """
+ Returns the constant memory requirements, in bytes
+
+ :return:
+ """
+ # memory for cluster centers and discrete trajectories
+ return self.k * 4 * self.data_producer.dimension() + 4 * self.data_producer.n_frames_total()
+
+ def param_init(self):
+ """
+ Initializes the parametrization.
+
+ :return:
+ """
+ log.info("Running uniform time clustering")
+ # initialize cluster centers
+ self.clustercenters = np.zeros(
+ (self.k, self.data_producer.dimension()), dtype=np.float32)
+ # initialize time counters
+ self.tprev = 0
+ self.ipass = 0
+ self.n = 0
+ self.stride = self.data_producer.n_frames_total() / self.k
+ self.nextt = self.stride / 2
+
+ def param_add_data(self, X, itraj, t, first_chunk, last_chunk_in_traj, last_chunk, ipass, Y=None):
+ """
+
+ :param X:
+ coordinates. axis 0: time, axes 1-..: coordinates
+ :param itraj:
+ index of the current trajectory
+ :param t:
+ time index of first frame within trajectory
+ :param first_chunk:
+ boolean. True if this is the first chunk globally.
+ :param last_chunk_in_traj:
+ boolean. True if this is the last chunk within the trajectory.
+ :param last_chunk:
+ boolean. True if this is the last chunk globally.
+ :param ipass:
+ number of pass through data
+ :param Y:
+ time-lagged data (if available)
+ :return:
+ """
+ log.debug("itraj = " + str(itraj) + ". t = " + str(t) + ". last_chunk_in_traj = " + str(last_chunk_in_traj)
+ + " last_chunk = " + str(last_chunk) + " ipass = " + str(ipass))
+
+ L = np.shape(X)[0]
+ if ipass == 0:
+ maxt = self.tprev + t + L
+ while (self.nextt < maxt):
+ i = self.nextt - self.tprev - t
+ # FIXME: n out of bounds
+ self.clustercenters[self.n] = X[i]
+ self.n += 1
+ self.nextt += self.stride
+ if last_chunk_in_traj:
+ self.tprev += self.data_producer.trajectory_length(itraj)
+ if ipass == 1:
+ # discretize all
+ if t == 0:
+ n = self.data_producer.trajectory_length(itraj)
+ self.dtrajs.append(np.zeros(n, dtype=int))
+ for i in xrange(L):
+ self.dtrajs[itraj][i + t] = self.map(X[i])
+ if last_chunk:
+ return True # done!
+
+ return False # not done yet.
+
+ def map_to_memory(self):
+ # nothing to do, because memory-mapping of the discrete trajectories is
+ # done in parametrize
+ pass
+
+ def get_discrete_trajectories(self):
+ return self.dtrajs
diff --git a/pyemma/coordinates/discretizer.py b/pyemma/coordinates/discretizer.py
new file mode 100644
index 000000000..f4eb73e18
--- /dev/null
+++ b/pyemma/coordinates/discretizer.py
@@ -0,0 +1,179 @@
+__author__ = 'noe'
+
+import psutil
+import numpy as np
+
+from pyemma.coordinates.clustering.interface import AbstractClustering
+from pyemma.coordinates.transform.transformer import Transformer
+from pyemma.coordinates.io.reader import ChunkedReader
+from pyemma.coordinates.io.feature_reader import FeatureReader
+from pyemma.coordinates.util.chaining import build_chain
+
+from pyemma.util.log import getLogger
+
+
+logger = getLogger('Discretizer')
+__all__ = ['Discretizer']
+
+
+class Discretizer(object):
+
+ """
+    A Discretizer gets a FeatureReader, which defines the features (distances,
+    angles etc.) of the given trajectory data, and passes this data in a
+    memory-efficient way through the given pipeline of a Transformer and a
+    clustering. The clustering object is responsible for assigning the data to
+    discrete states.
+
+ Currently the constructor will calculate everything instantly.
+
+
+ Parameters
+ ----------
+ reader : a FeatureReader object
+ reads trajectory data and selects features.
+    transform : a Transformer object (optional)
+        the Transformer will be used to e.g. reduce the dimensionality of the input.
+    cluster : a clustering object
+        used to assign input data to discrete states / discrete trajectories.
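+
+    Examples
+    --------
+    A sketch using the coordinates API (file names are placeholders):
+
+    >>> from pyemma.coordinates import feature_reader, tica, kmeans, discretizer
+    >>> reader = feature_reader(['traj01.xtc'], 'topology.pdb')
+    >>> disc = discretizer(reader, transform=tica(lag=10), cluster=kmeans(k=100))
+    >>> disc.run()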
+ """
+
+ def __init__(self, reader, transform=None, cluster=None, chunksize=None):
+ # check input
+ assert isinstance(reader, ChunkedReader), \
+ 'reader is not of the correct type'
+ if (transform is not None):
+ assert isinstance(transform, Transformer), \
+ 'transform is not of the correct type'
+ if cluster is None:
+ raise ValueError('Must specify a clustering algorithm!')
+ else:
+ assert isinstance(cluster, Transformer), \
+ 'cluster is not of the correct type'
+
+ if hasattr(reader, 'featurizer'): # reader is a FeatureReader
+ if reader.featurizer.dimension == 0:
+ logger.warning("no features selected!")
+
+ self.transformers = [reader]
+
+ if transform is not None:
+ self.transformers.append(transform)
+
+ self.transformers.append(cluster)
+
+ if chunksize is not None:
+ build_chain(self.transformers, chunksize)
+ self._chunksize = chunksize
+ else:
+ self._chunksize = None
+ build_chain(self.transformers)
+ self._estimate_chunksize_from_mem_requirement(reader)
+
+ self._parametrized = False
+
+ def run(self):
+ """
+ reads all data and discretizes it into discrete trajectories
+ """
+ for trans in self.transformers:
+ trans.parametrize()
+
+ self._parametrized = True
+
+ @property
+ def dtrajs(self):
+ """ get discrete trajectories """
+ if not self._parametrized:
+ logger.info("not yet parametrized, running now.")
+ self.run()
+ return self.transformers[-1].dtrajs
+
+ @property
+ def chunksize(self):
+ return self._chunksize
+
+ @chunksize.setter
+ def chunksize(self, cs):
+ self._chunksize = cs
+ # update transformers to use new chunksize
+ for trans in self.transformers:
+ trans.chunksize = cs
+
+ def save_dtrajs(self, prefix='', output_format='ascii', extension='.dtraj'):
+ """saves calculated discrete trajectories. Filenames are taken from
+ given reader. If data comes from memory dtrajs are written to a default
+ filename.
+
+
+ Parameters
+ ----------
+ prefix : str
+ prepend prefix to filenames.
+
+ output_format : str
+ if format is 'ascii' dtrajs will be written as csv files, otherwise
+ they will be written as NumPy .npy files.
+
+ extension : str
+            file extension to append (e.g. '.itraj')
+ """
+
+ clustering = self.transformers[-1]
+ reader = self.transformers[0]
+
+ assert isinstance(clustering, AbstractClustering)
+
+ trajfiles = None
+ if isinstance(reader, FeatureReader):
+ trajfiles = reader.trajfiles
+
+ clustering.save_dtrajs(trajfiles, prefix, output_format, extension)
+
+ def _estimate_chunksize_from_mem_requirement(self, reader):
+ """
+ estimate memory requirement from chain of transformers and sets a
+ chunksize accordingly
+ """
+ if not hasattr(reader, 'get_memory_per_frame'):
+ self.chunksize = 0
+ return
+
+ M = psutil.virtual_memory()[1] # available RAM in bytes
+ logger.info("available RAM: %i" % M)
+ const_mem = long(0)
+ mem_per_frame = long(0)
+
+ for trans in self.transformers:
+ mem_per_frame += trans.get_memory_per_frame()
+ const_mem += trans.get_constant_memory()
+ logger.info("per-frame memory requirements: %i" % mem_per_frame)
+
+ # maximum allowed chunk size
+ logger.info("const mem: %i" % const_mem)
+ chunksize = (M - const_mem) / mem_per_frame
+ if chunksize < 0:
+ raise MemoryError(
+ 'Not enough memory for desired transformation chain!')
+
+ # is this chunksize sufficient to store full trajectories?
+ chunksize = min(chunksize, np.max(reader.trajectory_lengths()))
+ logger.info("resulting chunk size: %i" % chunksize)
+
+ # set chunksize
+ self.chunksize = chunksize
+
+ # any memory unused? if yes, we can store results
+ Mfree = M - const_mem - chunksize * mem_per_frame
+ logger.info("free memory: %i" % Mfree)
+
+ # starting from the back of the pipeline, store outputs if possible
+ for trans in reversed(self.transformers):
+ mem_req_trans = trans.n_frames_total() * \
+ trans.get_memory_per_frame()
+ if Mfree > mem_req_trans:
+ Mfree -= mem_req_trans
+ # TODO: before we are allowed to call this method, we have to ensure all memory requirements are correct!
+ # trans.operate_in_memory()
+ logger.info("spending %i bytes to operate in main memory: %s "
+ % (mem_req_trans, trans.describe()))
diff --git a/pyemma/coordinates/io/__init__.py b/pyemma/coordinates/io/__init__.py
index bd1cf060b..1d80f7ea3 100644
--- a/pyemma/coordinates/io/__init__.py
+++ b/pyemma/coordinates/io/__init__.py
@@ -1,21 +1,29 @@
r"""
-==============================================================
-io - MD trajectory io functions (:mod:`pyemma.coordinates.io`)
-==============================================================
+===============================================================================
+io - IO Utilities (:mod:`pyemma.coordinates.io`)
+===============================================================================
-.. currentmodule:: pyemma.coordinates.io
+.. currentmodule:: pyemma.coordinates.io
-MD trajectory io
+Order parameters
================
.. autosummary::
- :toctree: generated/
+    :toctree: generated/
+
- reader - return reader object for MD trajectory
- read_traj - load full trajectory into memory
- writer - return writer object for MD trajectory
- write_traj - write full MD trajectory to file
+ MDFeaturizer - selects and computes features from MD trajectories
+ CustomFeature -
-"""
+Reader
+======
+
+.. autosummary::
+ :toctree: generated/
-from .api import *
+ FeatureReader - reads features via featurizer
+ DataInMemory - used if data is already available in mem
+
+"""
+from feature_reader import FeatureReader
+from featurizer import MDFeaturizer, CustomFeature
+from data_in_memory import DataInMemory
\ No newline at end of file
diff --git a/pyemma/coordinates/io/api.py b/pyemma/coordinates/io/api.py
deleted file mode 100644
index fc5254f0e..000000000
--- a/pyemma/coordinates/io/api.py
+++ /dev/null
@@ -1,154 +0,0 @@
-r"""
-
-=========================
-Pyemma coordinates.io API
-=========================
-
-"""
-
-__docformat__ = "restructuredtext en"
-
-"""python package imports"""
-import numpy as np
-
-"""emma intra package imports"""
-from datareader import DataReader
-from datawriter import DataWriter
-
-"""pystallone imports"""
-from pyemma.util.pystallone import API as sapi
-
-__author__ = "Martin Scherer, Frank Noe"
-__copyright__ = "Copyright 2014, Computational Molecular Biology Group, FU-Berlin"
-__credits__ = ["Martin Scherer", "Frank Noe"]
-__license__ = "FreeBSD"
-__version__ = "2.0.0"
-__maintainer__ = "Martin Scherer"
-__email__="m.scherer AT fu-berlin DOT de"
-
-__all__ = ['reader',
- 'read_traj',
- 'writer',
- 'write_traj']
-
-################################################################################
-# Molecular IO
-################################################################################
-
-# DONE: Map to stallone (Frank)
-def reader(filename):
- r"""Opens a trajectory reader to a given trajectory file
-
- Supports xtc and dcd. For these file types, trajectory frames are presented
- as Nx3 arrays.
- Supports ascii where coordinates are separated by white spaces. For this file
- type, trajectory frames are presented as one-dimensional arrays
-
- Parameters
- ----------
- filename : str
- The name of the trajectory file
-
- Returns
- -------
- DataReader :
- Object with access to the specified file
-
- """
- return DataReader(sapi.dataNew.reader(filename))
-
-
-# TODO: Map to moltools (Frank)
-#def read_molcrd(filename):
-# """
-# Loads a single molecular structure from a structure file
-#
-# Support pdb, gro and charmm crd (ascii and binary)
-# """
-
-# DONE: Map to stallone (Frank)
-def read_traj(filename, select = None, frames = None):
- r"""Load the given trajectory completely into memory
-
- Supports xtc and dcd. For these file types, trajectory frames are presented
- as Nx3 arrays.
- Supports ascii where coordinates are separated by white spaces. For this file
- type, trajectory frames are presented as one-dimensional arrays
-
- Parameters
- ----------
- filename : str
- The trajectory filename
-
- Returns
- -------
- traj : array
- array of size (L,N,3) for a trajectory with L frames and N atoms
-
- """
- reader = DataReader(sapi.dataNew.reader(filename))
- return reader.load(select=select, frames=frames)
-
-
-# DONE: Map to stallone (Frank)
-def writer(filename, nframes=None, natoms=None):
- r"""Opens a trajectory writer to a given trajectory file
-
- Supports dcd. For this file type, trajectory frames will be received
- as Nx3 arrays.
- Supports ascii where coordinates are separated by white spaces. For this file
- type, trajectory frames are presented as one-dimensional arrays
-
- Parameters
- ----------
- filename : string
- the file name to be written to
- natoms : int
- number of atoms for the writer. Must be given for molecular coordinate
- writers (e.g. dcd)
-
- Returns
- -------
- DataWriter : object
- Writer object for MD trajectory
-
- """
- if (str(filename).lower().endswith('dcd')):
- if (nframes is None) or (natoms is None):
- raise ValueError('To open a dcd writer, please specify nframes and natoms')
- ndim = natoms * 3
- return DataWriter(sapi.dataNew.writer(filename, nframes, ndim))
- else:
- return DataWriter(sapi.dataNew.writerASCII(filename))
-
-# DONE: Map to stallone (Frank)
-def write_traj(filename, traj):
- r"""Write complete trajectory to file.
-
- Supports xtc and dcd. For these file types, trajectory frames will be received
- as (N,3) arrays.
-
- Parameters
- ----------
- filename : string
- file name
- traj : ndarray with shape (F,N,3) or (F,d)
- Array containing the full trajectory
-
- """
- # number of frames to write
- nframes = np.shape(traj)[0]
- # number of dimensions
- ndim = np.shape(traj)[1]
- if (len(np.shape(traj)) > 2):
- ndim *= np.shape(traj)[2]
- # open java writer
- jwriter = sapi.dataNew.writer(filename, nframes, ndim)
- # wrap into python
- writer = DataWriter(jwriter)
- # write trajectory
- writer.addAll(traj)
- # close file
- writer.close()
-
-
diff --git a/pyemma/coordinates/io/data_in_memory.py b/pyemma/coordinates/io/data_in_memory.py
new file mode 100644
index 000000000..f2bb6efd8
--- /dev/null
+++ b/pyemma/coordinates/io/data_in_memory.py
@@ -0,0 +1,180 @@
+__author__ = 'noe'
+
+import numpy as np
+from pyemma.coordinates.io.reader import ChunkedReader
+from pyemma.util.log import getLogger
+
+logger = getLogger('DataInMemory')
+
+
+class DataInMemory(ChunkedReader):
+
+ """
+ multi-dimensional multi-trajectory data fully stored in memory
+ """
+
+ def __init__(self, _data):
+ """
+
+        :param _data:
+ ndarray of shape (nframe, ndim) or
+ list of ndarrays, each of shape (nframe_i, ndim)
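+
+        Example (illustrative):
+
+        >>> X = np.random.random((100, 2))
+        >>> reader = DataInMemory(X)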
+ """
+ ChunkedReader.__init__(self)
+
+ if isinstance(_data, np.ndarray):
+ self.data = [_data]
+ self.ntraj = 1
+ if _data.ndim == 1:
+ self.ndim = np.shape(_data)[0]
+ else:
+ self.ndim = np.shape(_data)[1]
+ self._lengths = [np.shape(_data)[0]]
+ elif isinstance(_data, list):
+ self.data = _data
+ self.ntraj = len(_data)
+ # ensure all trajs have same dim
+ ndims = np.fromiter(([np.shape(_data[i])[1]
+ for i in xrange(len(_data))]), dtype=int)
+            ndim_eq = ndims == np.shape(_data[0])[1]
+ if not np.all(ndim_eq):
+ raise ValueError("input data has different dimensions!"
+ " Indices not matching: %s"
+ % np.where(ndim_eq == False))
+ self.ndim = ndims[0]
+
+ self._lengths = [np.shape(_data[i])[0] for i in range(len(_data))]
+ else:
+ raise ValueError('input data is neither an ndarray '
+ 'nor a list of ndarrays!')
+
+ self.t = 0
+ self.itraj = 0
+ self.chunksize = 0
+
+ def number_of_trajectories(self):
+ """
+ Returns the number of trajectories
+
+ :return:
+ number of trajectories
+ """
+ return self.ntraj
+
+ def trajectory_length(self, itraj):
+ """
+ Returns the length of trajectory
+
+ :param itraj:
+ trajectory index
+
+ :return:
+ length of trajectory
+ """
+ return self._lengths[itraj]
+
+ def trajectory_lengths(self):
+ """
+ Returns the length of each trajectory
+
+ :return:
+ length of each trajectory
+ """
+ return self._lengths
+
+ def n_frames_total(self):
+ """
+ Returns the total number of frames, over all trajectories
+
+ :return:
+ the total number of frames, over all trajectories
+ """
+ return np.sum(self._lengths)
+
+ def dimension(self):
+ """
+ Returns the number of output dimensions
+
+ :return:
+ """
+ return self.ndim
+
+ def reset(self):
+ """Resets the data producer
+ """
+ self.itraj = 0
+ self.t = 0
+
+ def next_chunk(self, lag=0):
+ """
+
+ :param lag:
+ :return:
+ """
+ # finished once with all trajectories? so reset the pointer to allow
+ # multi-pass
+ if self.itraj >= self.ntraj:
+ self.reset()
+
+ traj_len = self._lengths[self.itraj]
+ traj = self.data[self.itraj]
+
+ # complete trajectory mode
+ if self.chunksize == 0:
+ if lag == 0:
+ X = traj
+ self.itraj += 1
+ return X
+ else:
+ X = traj[: -lag]
+ Y = traj[lag:traj_len]
+ self.itraj += 1
+ return (X, Y)
+ else:
+ #logger.debug("t=%i" % self.t)
+ # FIXME: if t + chunksize < traj_len, this selects wrong bounds. eg [100:40], which is empty
+ chunksize_bounds = min(self.t + self.chunksize, traj_len)
+ if lag == 0:
+ X = traj[self.t:chunksize_bounds]
+ self.t += np.shape(X)[0]
+ if self.t >= traj_len:
+ self.itraj += 1
+ return X
+ else:
+ logger.warning("chunked lagged access not debugged!")
+ X = traj[self.t: chunksize_bounds - lag]
+ assert np.shape(X)[0] > 0
+ #logger.debug("Y=traj[%i+%i : %i]" %
+ # (self.t, lag, chunksize_bounds))
+                # if we do not have enough data anymore for a chunk, pad it with zeros
+ Y = traj[self.t + lag: chunksize_bounds]
+# if np.shape(Y)[0] == 0:
+# assert False
+# Y = PaddedArray(np.zeros_like(X), X.shape)
+
+ assert np.shape(X) == np.shape(Y), "%s != %s" % (str(np.shape(X)), str(np.shape(Y)))
+ self.t += np.shape(X)[0]
+ assert np.shape(Y)[0] > 0
+ if self.t + lag >= traj_len:
+ self.itraj += 1
+ return (X, Y)
+
+ @staticmethod
+ def distance(x, y):
+ """
+
+ :param x:
+ :param y:
+        :return: Euclidean distance between x and y
+ """
+ return np.linalg.norm(x - y, 2)
+
+ @staticmethod
+ def distances(x, Y):
+ """
+
+ :param x: ndarray (n)
+        :param Y: ndarray (N x n)
+        :return: distances from x to each row of Y
+ """
+ return np.linalg.norm(Y - x, 2, axis=1)
\ No newline at end of file
diff --git a/pyemma/coordinates/io/datareader.py b/pyemma/coordinates/io/datareader.py
deleted file mode 100644
index 2eca33ba1..000000000
--- a/pyemma/coordinates/io/datareader.py
+++ /dev/null
@@ -1,122 +0,0 @@
-'''
-Created on Jan 3, 2014
-
-@author: noe
-'''
-import numpy as np
-from pyemma.util import pystallone as stallone
-
-
-# Reader interface
-class DataReader(object):
- """
- Class that accesses trajectory files and can read frames.
- Maps to stallone IDataReader and can currently only be initialized with
- java data readers.
- """
- def __init__(self, java_reader):
- """
- Initializes the reader
- """
- self._java_reader = java_reader
- self._selection = None
-
- def size(self):
- """
- Returns the number of data sets
- """
- return self._java_reader.size()
-
- def dimension(self):
- """
- Returns the dimension of each data set
- """
- return self._java_reader.dimension()
-
- def memory_size(self):
- """
- Returns the memory size needed when loading all the data
- """
- return self._java_reader.memorySize()
-
- def __select(self,selection = None):
- """
- Selection coordinates to be read
-
- By default (selection = None), all atoms / dimensions are read.
- When set otherwise, a call to get() will load only a subset of rows
- of each data set array. When the data is one-dimensional,
- the corresponding data elements are selected.
- For molecular data, instead of the full (N x 3) arrays, a (n x 3) subset
- will be returned.
-
- Parameters
- ----------
- select = None : list of integers
- atoms or dimension selection.
- """
- # when a change is made:
- if (not np.array_equal(selection, self._selection)):
- self._selection = selection
- if (selection is None):
- self._java_reader.select(None)
- else:
- self._java_reader.select(stallone.jarray(selection))
-
- def get(self, index, select=None):
- """
- loads and returns a single data set as an appropriately shaped numpy array
-
- Parameters
- ----------
- index : int
- the index of the requested data set must be in [0,size()-1]
- select = None : list of integers
- atoms or dimension selection. By default, all atoms / dimensions
- are read. When set, will load only a subset of rows of each data
- set array. When the data is one-dimensional, the corresponding
- data elements are selected. When the data is molecular data, i.e.
- (N x 3) arrays, a (n x 3) subset will be returned.
- """
- self.__select(select)
- #return stallone.mytrans(self._java_reader.get(index))
- return stallone.stallone_array_to_ndarray(self._java_reader.get(index))
-
- def load(self, select=None, frames=None):
- """
- loads the entire data set into a [K x shape] numpy array, where shape
- is the natural shape of a data set.
-
- **WARNING:** This is currently **slow** due to the inefficient
- conversion of JCC JArrays to numpy arrays via numpy.array(). This
- bottleneck can probably be avoided by constructing the data field on
- the python side, passing the pointer to it through the interface, and
- then filling it on the Java side. This would speed this function up by
- a factor of 20 or more.
-
- Parameters
- ----------
- select = None : list of integers
- atoms or dimension selection. By default, all atoms / dimensions
- are read. When set, will load only a subset of rows of each data
- set array. When the data is one-dimensional, the corresponding
- data elements are selected. When the data is molecular data, i.e.
- (N x 3) arrays, a (n x 3) subset will be returned.
- frames = None : list of integers
- frame selection. By default, all frames are read
- """
- self.__select(select)
- x0 = self.get(select=select)
- if frames is None:
- frames = range(self.size())
- data = np.ndarray(tuple([len(frames)]) + np.shape(x0))
- for i in range(len(frames)):
- data[i] = self.get(frames[i], select=select)
- return data
-
- def close(self):
- """
- Closes the file and returns the file handler. Further attempts to
- access the file via get(i) or load() will result in an error.
- """
- self._java_reader.close()
diff --git a/pyemma/coordinates/io/datawriter.py b/pyemma/coordinates/io/datawriter.py
deleted file mode 100644
index dab01ba42..000000000
--- a/pyemma/coordinates/io/datawriter.py
+++ /dev/null
@@ -1,25 +0,0 @@
-'''
-Created on Jan 5, 2014
-
-@author: noe
-'''
-
-
-class DataWriter(object):
-
- def __init__(self, jwriter):
- self._jwriter = jwriter
-
- def add(self, x):
- import pyemma.util.pystallone as stallone
- self._jwriter.add(stallone.ndarray_to_stallone_array(x))
-
- def addAll(self, X):
- for i in range(len(X)):
- self.add(X[i])
-
- def close(self):
- self._jwriter.close()
-
- def __del__(self):
- self.close()
diff --git a/pyemma/coordinates/io/feature_reader.py b/pyemma/coordinates/io/feature_reader.py
new file mode 100644
index 000000000..455d1d580
--- /dev/null
+++ b/pyemma/coordinates/io/feature_reader.py
@@ -0,0 +1,326 @@
+__author__ = 'noe'
+
+import mdtraj
+import numpy as np
+
+from mdtraj.core.trajectory import Trajectory
+from pyemma.coordinates.io.reader import ChunkedReader
+from pyemma.util.log import getLogger
+from featurizer import MDFeaturizer
+
+log = getLogger('FeatureReader')
+
+__all__ = ['FeatureReader']
+
+
+class FeatureReader(ChunkedReader):
+
+ """
+ Reads features from MD data.
+
+ To select a feature, access the :attr:`featurizer` and call a feature
+ selecting method (e.g) distances.
+
+ Parameters
+ ----------
+ trajectories: list of strings
+ paths to trajectory files
+
+ topologyfile: string
+ path to topology file (e.g. pdb)
+
+ Examples
+ --------
+ Iterator access:
+
+ >>> reader = FeatureReader('mytraj.xtc', 'my_structure.pdb')
+ >>> chunks = []
+ >>> for itraj, X in reader:
+    ...     chunks.append(X)
+
+ """
+
+ def __init__(self, trajectories, topologyfile):
+ # init with chunksize 100
+ ChunkedReader.__init__(self, 100)
+
+ # files
+ if isinstance(trajectories, str):
+ trajectories = [trajectories]
+ self.trajfiles = trajectories
+ self.topfile = topologyfile
+
+ # featurizer
+ self.featurizer = MDFeaturizer(topologyfile)
+
+ # _lengths
+ self._lengths = []
+ self._totlength = 0
+
+ # iteration
+ self.mditer = None
+ # current lag time
+ self.curr_lag = 0
+ # time lagged iterator
+ self.mditer2 = None
+
+ # cache size
+ self.in_memory = False
+ self.Y = None
+ # basic statistics
+ for traj in trajectories:
+ sum_frames = sum(t.n_frames for t in self._create_iter(traj))
+ self._lengths.append(sum_frames)
+
+ self._totlength = np.sum(self._lengths)
+
+ self.t = 0
+
+ def describe(self):
+ """
+        Returns a description of this feature reader
+
+        :return:
+        """
+        return "Feature reader, features = %s" % self.featurizer.describe()
+
+ def operate_in_memory(self):
+ """
+ If called, the output will be fully stored in memory
+
+ :return:
+ """
+ self.in_memory = True
+ # output data
+ self.Y = [np.empty((self.trajectory_length(itraj), self.dimension()))
+ for itraj in xrange(self.number_of_trajectories())]
+
+ def parametrize(self):
+ """
+ Parametrizes this transformer
+
+ :return:
+ """
+ if self.in_memory:
+ self.map_to_memory()
+
+ def number_of_trajectories(self):
+ """
+ Returns the number of trajectories
+
+ :return:
+ number of trajectories
+ """
+ return len(self.trajfiles)
+
+ def trajectory_length(self, itraj):
+ """
+ Returns the length of trajectory
+
+ Parameters
+ ----------
+ itraj : int
+
+ :return:
+ length of trajectory
+ """
+ return self._lengths[itraj]
+
+ def trajectory_lengths(self):
+ """
+ Returns the trajectory _lengths in a list
+ :return:
+ """
+ return self._lengths
+
+ def n_frames_total(self):
+ """
+ Returns the total number of frames, summed over all trajectories
+ :return:
+ """
+ return self._totlength
+
+ def dimension(self):
+ """
+ Returns the number of output dimensions
+
+ :return:
+ """
+ return self.featurizer.dimension()
+
+ def get_memory_per_frame(self):
+ """
+ Returns the memory requirements per frame, in bytes
+
+ :return:
+ """
+ return 4 * self.dimension()
+
+ def get_constant_memory(self):
+ """
+ Returns the constant memory requirements, in bytes
+
+ :return:
+ """
+ return 0
+
+ def map_to_memory(self):
+ self.reset()
+ # iterate over trajectories
+ last_chunk = False
+ itraj = 0
+ while not last_chunk:
+ last_chunk_in_traj = False
+ t = 0
+ while not last_chunk_in_traj:
+ y = self.next_chunk()
+ assert y is not None
+ L = np.shape(y)[0]
+ # last chunk in traj?
+ last_chunk_in_traj = (t + L >= self.trajectory_length(itraj))
+ # last chunk?
+ last_chunk = (
+ last_chunk_in_traj and itraj >= self.number_of_trajectories() - 1)
+ # write
+ self.Y[itraj][t:t + L] = y
+ # increment time
+ t += L
+ # increment trajectory
+ itraj += 1
+
+ def _create_iter(self, filename):
+ return mdtraj.iterload(filename, chunk=self.chunksize, top=self.topfile)
+
+ def _open_time_lagged(self):
+ log.debug("open time lagged iterator for traj %i" % self.curr_itraj)
+ if self.mditer2 is not None:
+ self.mditer2.close()
+ self.mditer2 = self._create_iter(self.trajfiles[self.curr_itraj])
+ self.skip_n = int(np.floor(1.0 * self.curr_lag / self.chunksize))
+ log.debug("trying to skip %i frames in advanced iterator" %
+ self.skip_n)
+ i = 0
+ for _ in xrange(self.skip_n):
+ try:
+ self.mditer2.next()
+ i += 1
+ except StopIteration:
+ log.debug("was able to increment %i times" % i)
+ break
+
+ def reset(self):
+ """
+ resets the chunk reader
+ """
+ self.curr_itraj = 0
+ self.curr_lag = 0
+ if len(self.trajfiles) >= 1:
+ self.t = 0
+ self.mditer = self._create_iter(self.trajfiles[0])
+
+ def next_chunk(self, lag=0):
+ """
+        gets the next chunk. If lag > 0, a second iterator with the same chunk
+        size is opened and advanced accordingly, the first time this method is
+        called with lag > 0.
+
+ :return: a feature mapped vector X, or (X, Y) if lag > 0
+ """
+ chunk = self.mditer.next()
+
+ if lag > 0:
+ if self.curr_lag == 0:
+ # lag time changed, so open lagged iterator
+ self.curr_lag = lag
+ self._open_time_lagged()
+ try:
+ self.last_advanced_chunk = self.mditer2.next()
+ except StopIteration:
+ log.debug(
+ "No more data in mditer2 during last_adv_chunk assignment. Padding with zeros")
+ lagged_xyz = np.zeros_like(chunk.xyz)
+ self.last_advanced_chunk = Trajectory(
+ lagged_xyz, chunk.topology)
+ try:
+ adv_chunk = self.mditer2.next()
+ except StopIteration:
+ # no more data available in mditer2, so we have to take data from
+                # the current chunk and pad it with zeros!
+ log.debug("No more data in mditer2. Padding with zeros."
+ " Data avail: %i" % chunk.xyz.shape[0])
+ lagged_xyz = np.zeros_like(chunk.xyz)
+ adv_chunk = Trajectory(lagged_xyz, chunk.topology)
+
+            # build the time-lagged Trajectory by concatenating the last
+            # advanced chunk and the current advanced chunk
+ i = lag - (self.chunksize * self.skip_n)
+ padding_length = max(0, chunk.xyz.shape[0]
+ - (self.last_advanced_chunk.xyz.shape[0] - i)
+ - adv_chunk.xyz.shape[0])
+ padding = np.zeros(
+ (padding_length, chunk.xyz.shape[1], chunk.xyz.shape[2]))
+ merged = Trajectory(np.concatenate(
+ (self.last_advanced_chunk.xyz,
+ adv_chunk.xyz, padding)), chunk.topology)
+ # assert merged.xyz.shape[0] >= chunk.xyz.shape[0]
+ # skip "lag" number of frames and truncate to chunksize
+ chunk_lagged = merged[i:][:chunk.xyz.shape[0]]
+
+ # remember last advanced chunk
+ self.last_advanced_chunk = adv_chunk
+
+ self.t += chunk.xyz.shape[0]
+
+ if (self.t >= self.trajectory_length(self.curr_itraj) and
+ self.curr_itraj < len(self.trajfiles) - 1):
+ log.debug('closing current trajectory "%s"'
+ % self.trajfiles[self.curr_itraj])
+ self.mditer.close()
+ self.t = 0
+ self.curr_itraj += 1
+ self.mditer = self._create_iter(self.trajfiles[self.curr_itraj])
+            # we open self.mditer2 only if requested by the lag parameter!
+ self.curr_lag = 0
+
+ # map data
+ if lag == 0:
+ return self.featurizer.map(chunk)
+ else:
+ X = self.featurizer.map(chunk)
+ Y = self.featurizer.map(chunk_lagged)
+ return X, Y
+
+ def __iter__(self):
+ self.reset()
+ return self
+
+ def next(self):
+ """ enable iteration over transformed data.
+
+ Returns
+ -------
+        (itraj, X) : (int, ndarray(n, m))
+ itraj corresponds to input sequence number (eg. trajectory index)
+ and X is the transformed data, n = chunksize or n < chunksize at end
+ of input.
+
+ """
+ # iterate over trajectories
+ if self.curr_itraj >= self.number_of_trajectories():
+ raise StopIteration
+
+ # next chunk already maps output
+ if self.lag == 0:
+ X = self.next_chunk()
+ else:
+ X, Y = self.next_chunk(self.lag)
+
+ last_itraj = self.curr_itraj
+ # note: t is incremented in next_chunk
+ if self.t >= self.trajectory_length(self.curr_itraj):
+ self.curr_itraj += 1
+ self.t = 0
+
+ if self.lag == 0:
+ return (last_itraj, X)
+
+ return (last_itraj, X, Y)
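For illustration, the time-lagged iteration implemented by next() could be driven like this (a sketch; the file names and the covariance accumulation are placeholders, not part of this change):

```python
import numpy as np
from pyemma.coordinates.io.feature_reader import FeatureReader

# 'traj.xtc' / 'top.pdb' are placeholder file names
reader = FeatureReader('traj.xtc', 'top.pdb')
reader.featurizer.backbone_torsions()  # select some features first
reader.lag = 10                        # request (X, Y) pairs

C_tau = np.zeros((reader.dimension(), reader.dimension()))
for itraj, X, Y in reader:
    # X holds frames t, Y holds frames t + lag (zero-padded near the end)
    C_tau += X.T.dot(Y)
```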
diff --git a/pyemma/coordinates/io/featurizer.py b/pyemma/coordinates/io/featurizer.py
new file mode 100644
index 000000000..f45a979ee
--- /dev/null
+++ b/pyemma/coordinates/io/featurizer.py
@@ -0,0 +1,353 @@
+__author__ = 'noe'
+
+import mdtraj
+import numpy as np
+
+__all__ = ['MDFeaturizer',
+ 'CustomFeature',
+ ]
+
+
+def _describe_atom(topology, index):
+ """
+ Returns a string describing the given atom
+
+ :param topology:
+ :param index:
+ :return:
+ """
+ at = topology.atom(index)
+ return "%s %i %s %i" % (at.residue.name, at.residue.index, at.name, at.index)
+
+
+class CustomFeature(object):
+
+    """
+    A CustomFeature is the base class for user-defined features. Compute your
+    quantity in the map method and return it as an ndarray.
+
+    If you simply want to call a function with arguments, you do not need to
+    subclass.
+
+
+ Parameters
+ ----------
+ func : function
+ will be invoked with given args and kwargs on mapping traj
+ args : list of positional args (optional)
+ kwargs : named arguments
+
+ Examples
+ --------
+ We use a FeatureReader to read md-trajectories and add a CustomFeature to
+ transform coordinates:
+
+ >>> reader = FeatureReader(...)
+ >>> my_feature = CustomFeature(lambda x: 1.0 / x**2)
+ >>> reader.featurizer.add_custom_feature(my_feature, output_dimension=3)
+
+    Now a pipeline using this reader will apply the :math:`1 / x^2` transform
+    on every frame being read.
+
+ """
+
+ def __init__(self, func=None, *args, **kwargs):
+ self._func = func
+ self._args = args
+ self._kwargs = kwargs
+
+ def describe(self):
+ return "override me to get proper description!"
+
+ def map(self, traj):
+        feature = self._func(traj, *self._args, **self._kwargs)
+ assert isinstance(feature, np.ndarray)
+ return feature
+
+
+class DistanceFeature(object):
+
+ def __init__(self, top, distance_indexes):
+ self.top = top
+ self.distance_indexes = distance_indexes
+ self.prefix_label = "DIST:"
+
+ def describe(self):
+ labels = []
+ for pair in self.distance_indexes:
+ labels.append("%s%s - %s" % (self.prefix_label,
+ _describe_atom(self.top, pair[0]),
+ _describe_atom(self.top, pair[1])))
+ return labels
+
+ def map(self, traj):
+ return mdtraj.compute_distances(traj, self.distance_indexes)
+
+
+class InverseDistanceFeature(DistanceFeature):
+
+ def __init__(self, top, distance_indexes):
+ DistanceFeature.__init__(self, top, distance_indexes)
+ self.prefix_label = "INVDIST:"
+
+ def map(self, traj):
+ return 1.0 / mdtraj.compute_distances(traj, self.distance_indexes)
+
+
+class BackboneTorsionFeature(object):
+
+ def __init__(self, topology):
+ self.topology = topology
+
+    def _has_atom(self, res_index, name):
+        # only consider atoms of the given residue, not the whole topology
+        for atom in self.topology.residue(res_index).atoms:
+            if atom.name.lower() == name.lower():
+                return True
+        return False
+
+ def _list_phis(self):
+ # phi: C-1, N, CA, C
+ phis = []
+ for ires in range(1, self.topology.n_residues):
+ if (self._has_atom(ires - 1, "C") and self._has_atom(ires, "N") and
+ self._has_atom(ires, "CA") and self._has_atom(ires, "C")):
+ phis.append(ires)
+ return phis
+
+ def _list_psis(self):
+ # psi: N, CA, C, N+1
+ psis = []
+ for ires in range(0, self.topology.n_residues - 1):
+ if (self._has_atom(ires, "N") and self._has_atom(ires, "CA") and
+ self._has_atom(ires, "C") and self._has_atom(ires + 1, "N")):
+ psis.append(ires)
+ return psis
+
+ def describe(self):
+ labels = []
+ phis = self._list_phis()
+ for ires in phis:
+ labels.append("PHI: %s %i" %
+ (self.topology.residue(ires).name, ires))
+ psis = self._list_psis()
+ for ires in psis:
+ labels.append("PSI: %s %i" %
+ (self.topology.residue(ires).name, ires))
+ return labels
+
+ def map(self, traj):
+ y1 = mdtraj.compute_phi(traj)[1].astype(np.float32)
+ y2 = mdtraj.compute_psi(traj)[1].astype(np.float32)
+ return np.hstack((y1, y2))
+
+
+class MDFeaturizer(object):
+
+ """extracts features from MD trajectories.
+
+ Parameters
+ ----------
+
+ topfile : str
+ a path to a topology file (pdb etc.)
+ """
+
+ def __init__(self, topfile):
+ self.topology = (mdtraj.load(topfile)).topology
+
+ self.distance_indexes = []
+ self.inv_distance_indexes = []
+ self.contact_indexes = []
+ self.angle_indexes = []
+
+ self.active_features = []
+ self._dim = 0
+
+ def describe(self):
+ """
+ Returns a list of strings, one for each feature selected,
+ with human-readable descriptions of the features.
+
+ :return:
+ """
+ labels = []
+
+ for f in self.active_features:
+ labels.append(f.describe())
+
+ return labels
+
+ def select(self, selstring):
+ """
+ Selects using the given selection string
+
+ :param selstring:
+ :return:
+ """
+ return self.topology.select(selstring)
+
+ def select_Ca(self):
+ """
+ Selects Ca atoms
+
+ :return:
+ """
+ return self.topology.select("name CA")
+
+ def select_Backbone(self):
+ """
+ Selects backbone C, CA and N atoms
+
+ :return:
+ """
+ return self.topology.select("name C CA N")
+
+ def select_Heavy(self):
+ """
+ Selects all heavy atoms
+
+ :return:
+ """
+ return self.topology.select("mass >= 2")
+
+ @staticmethod
+ def pairs(sel):
+ """
+        Creates all pairs between indexes, except for 1- and 2-neighbors
+
+ :return:
+ """
+ nsel = np.shape(sel)[0]
+ npair = ((nsel - 2) * (nsel - 3)) / 2
+        pairs = np.zeros((npair, 2), dtype=int)
+ s = 0
+ for i in range(0, nsel - 3):
+ d = nsel - 3 - i
+ pairs[s:s + d, 0] = sel[i]
+ pairs[s:s + d, 1] = sel[i + 3:nsel]
+ s += d
+
+ return pairs
+
+ def distances(self, atom_pairs):
+ """
+ Adds the set of distances to the feature list
+ :param atom_pairs:
+ """
+ # TODO: shall we instead append here?
+ self.distance_indexes = atom_pairs
+ f = DistanceFeature(self.topology, distance_indexes=atom_pairs)
+ self.active_features.append(f)
+ self._dim += np.shape(atom_pairs)[0]
+
+ def distancesCa(self):
+ """
+ Adds the set of Ca-distances to the feature list
+ """
+ self.distance_indexes = self.pairs(self.select_Ca())
+
+        f = DistanceFeature(self.topology, distance_indexes=self.distance_indexes)
+ self.active_features.append(f)
+ self._dim += np.shape(self.distance_indexes)[0]
+
+ def inverse_distances(self, atom_pairs):
+ """
+ Adds the set of inverse distances to the feature list
+
+ :param atom_pairs:
+ """
+ self.inv_distance_indexes = atom_pairs
+
+        f = InverseDistanceFeature(self.topology,
+                                   distance_indexes=self.inv_distance_indexes)
+        self.active_features.append(f)
+        self._dim += np.shape(atom_pairs)[0]
+
+ def contacts(self, atom_pairs):
+ """
+ Adds the set of contacts to the feature list
+ :param atom_pairs:
+ """
+        self.contact_indexes = atom_pairs
+        # mdtraj.compute_contacts returns a (distances, pairs) tuple,
+        # so unpack the distances before handing them on
+        f = CustomFeature(
+            lambda traj: mdtraj.compute_contacts(traj, contacts=atom_pairs)[0])
+
+ def describe():
+ labels = []
+ for pair in self.contact_indexes:
+ labels.append("CONTACT: %s - %s" % (_describe_atom(self.topology, pair[0]),
+ _describe_atom(self.topology, pair[1])))
+ return labels
+ f.describe = describe
+ f.topology = self.topology
+
+ self._dim += np.shape(atom_pairs)[0]
+ self.active_features.append(f)
+
+ def angles(self, indexes):
+ """
+ Adds the list of angles to the feature list
+
+ Parameters
+ ----------
+        indexes : np.ndarray, shape=(num_triples, 3), dtype=int
+        """
+        f = CustomFeature(mdtraj.compute_angles, angle_indices=indexes)
+
+ def describe():
+ labels = []
+ for triple in self.angle_indexes:
+ labels.append("ANGLE: %s - %s - %s " %
+ (_describe_atom(self.topology, triple[0]),
+ _describe_atom(self.topology, triple[1]),
+ _describe_atom(self.topology, triple[2])))
+
+ return labels
+
+ f.describe = describe
+ f.topology = self.topology
+
+ self.active_features.append(f)
+ self.angle_indexes = indexes
+ self._dim += np.shape(indexes)[0]
+
+ def backbone_torsions(self):
+ """
+ Adds all backbone torsions
+ """
+ f = BackboneTorsionFeature(self.topology)
+ self.active_features.append(f)
+ self._dim += 2 * self.topology.n_residues
+
+ def add_custom_feature(self, feature, output_dimension):
+ """
+ Parameters
+ ----------
+ feature : object
+ an object with interface like CustomFeature (map, describe methods)
+ output_dimension : int
+            dimension of the feature vectors returned by the feature's map method.
+ """
+ assert output_dimension > 0, "tried to add empty feature"
+ assert hasattr(feature, 'map'), "no map method in given feature"
+ assert hasattr(feature, 'describe')
+
+ self.active_features.append(feature)
+ self._dim += output_dimension
+
+ def dimension(self):
+ """ current dimension due to selected features """
+ return self._dim
+
+ def map(self, traj):
+ """
+ Computes the features for the given trajectory
+ :return:
+ """
+ # if there are no features selected, return given trajectory
+ if self._dim == 0:
+ return traj
+
+ # otherwise build feature vector.
+ feature_vec = []
+
+ for f in self.active_features:
+ feature_vec.append(f.map(traj).astype(np.float32))
+
+ return np.hstack(feature_vec)
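A short usage sketch for the featurizer (file names are placeholders; note that describe() returns one label list per active feature):

```python
import mdtraj
from pyemma.coordinates.io.featurizer import MDFeaturizer

feat = MDFeaturizer('top.pdb')           # placeholder topology file
ca_pairs = feat.pairs(feat.select_Ca())  # all Ca pairs, skipping 1-/2-neighbors
feat.distances(ca_pairs)
feat.backbone_torsions()

print(feat.dimension())                  # sum over all active features
print(feat.describe())                   # one label list per active feature

traj = mdtraj.load('traj.xtc', top='top.pdb')  # placeholder trajectory
Y = feat.map(traj)                       # ndarray (n_frames, feat.dimension())
```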
diff --git a/pyemma/coordinates/io/reader.py b/pyemma/coordinates/io/reader.py
new file mode 100644
index 000000000..627b654a6
--- /dev/null
+++ b/pyemma/coordinates/io/reader.py
@@ -0,0 +1,19 @@
+'''
+Created on 18.02.2015
+
+@author: marscher
+'''
+
+
+class ChunkedReader(object):
+
+ """
+ A chunked reader shall implement chunked data access from some data source.
+ Therefore it has to implement the following methods:
+
+ next_chunk(lag=0)
+ """
+
+ def __init__(self, chunksize=0, lag=0):
+ self.chunksize = chunksize
+        self.lag = lag
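A minimal conforming reader might look like this (an illustrative sketch over one in-memory array, not part of this change):

```python
import numpy as np
from pyemma.coordinates.io.reader import ChunkedReader

class ArrayReader(ChunkedReader):
    # illustrative ChunkedReader over a single in-memory (T, d) array
    def __init__(self, data, chunksize=100):
        ChunkedReader.__init__(self, chunksize)
        self.data = data
        self.t = 0

    def next_chunk(self, lag=0):
        upper = min(self.t + self.chunksize, len(self.data) - lag)
        X = self.data[self.t:upper]
        Y = self.data[self.t + lag:upper + lag]
        self.t = upper
        return X if lag == 0 else (X, Y)
```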
diff --git a/pyemma/coordinates/io/writer.py b/pyemma/coordinates/io/writer.py
new file mode 100644
index 000000000..749b557cd
--- /dev/null
+++ b/pyemma/coordinates/io/writer.py
@@ -0,0 +1,64 @@
+'''
+Created on 22.01.2015
+
+@author: marscher
+'''
+from pyemma.util.log import getLogger
+
+import numpy as np
+from pyemma.coordinates.transform.transformer import Transformer
+
+
+log = getLogger('WriterCSV')
+__all__ = ['WriterCSV']
+
+
+class WriterCSV(Transformer):
+
+ '''
+    writes the output of its data producer to a CSV (text) file
+ '''
+
+ def __init__(self, filename):
+ '''
+ Constructor
+ '''
+ super(WriterCSV, self).__init__()
+
+ # filename should be obtained from source trajectory filename,
+ # eg suffix it to given filename
+ self.filename = filename
+ self.last_frame = False
+
+ self.reset()
+
+ def describe(self):
+ return "[Writer filename='%s']" % self.filename
+
+ def get_constant_memory(self):
+ return 0
+
+ def dimension(self):
+ return self.data_producer.dimension()
+
+ def reset(self):
+ try:
+ self.fh.close()
+ log.debug('closed file')
+ except IOError:
+ log.exception('during close')
+ except AttributeError:
+ pass
+
+ try:
+ self.fh = open(self.filename, 'w')
+ except IOError:
+            log.exception('could not open file "%s" for writing.' % self.filename)
+ raise
+
+ def param_add_data(self, X, itraj, t, first_chunk, last_chunk_in_traj, last_chunk, ipass, Y=None):
+ np.savetxt(self.fh, X)
+ if last_chunk:
+ log.debug("closing file")
+ self.fh.close()
+ return True # finished
\ No newline at end of file
diff --git a/pyemma/coordinates/tests/__init__.py b/pyemma/coordinates/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/pyemma/coordinates/tests/data/test.pdb b/pyemma/coordinates/tests/data/test.pdb
new file mode 100644
index 000000000..723119130
--- /dev/null
+++ b/pyemma/coordinates/tests/data/test.pdb
@@ -0,0 +1,2 @@
+ATOM 3 CH3 ACE A 1 28.490 31.600 33.379 0.00 1.00
+ATOM 7 C ACE A 2 27.760 30.640 34.299 0.00 1.00
diff --git a/pyemma/coordinates/tests/test_acf.py b/pyemma/coordinates/tests/test_acf.py
new file mode 100644
index 000000000..49e462c50
--- /dev/null
+++ b/pyemma/coordinates/tests/test_acf.py
@@ -0,0 +1,25 @@
+import unittest
+import numpy as np
+
+from pyemma.coordinates.acf import acf
+
+class TestACF(unittest.TestCase):
+ def test(self):
+ # generate some data
+ data = np.random.rand(100,3)
+
+ testacf = acf(data)
+
+ # direct computation of acf (single trajectory, three observables)
+ N = data.shape[0]
+ refacf = np.zeros(data.shape)
+ meanfree = data - np.mean(data,axis=0)
+ padded = np.concatenate((meanfree,np.zeros(data.shape)),axis=0)
+ for tau in xrange(N):
+ refacf[tau] = (padded[0:N,:]*padded[tau:N+tau,:]).sum(axis=0)/(N-tau)
+ refacf /= refacf[0] # normalize
+
+ np.testing.assert_allclose(refacf,testacf)
+
+if __name__ == "__main__":
+ unittest.main()
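The reference loop in this test implements the standard autocorrelation estimate for mean-free data, normalized by the lag-zero value:

```latex
\operatorname{acf}(\tau) = \frac{\hat a(\tau)}{\hat a(0)},
\qquad
\hat a(\tau) = \frac{1}{N-\tau} \sum_{t=0}^{N-\tau-1} \tilde x_t\, \tilde x_{t+\tau},
\qquad
\tilde x_t = x_t - \bar x
```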
diff --git a/pyemma/coordinates/tests/test_datainmemory.py b/pyemma/coordinates/tests/test_datainmemory.py
new file mode 100644
index 000000000..51526abcb
--- /dev/null
+++ b/pyemma/coordinates/tests/test_datainmemory.py
@@ -0,0 +1,105 @@
+'''
+Created on 04.02.2015
+
+@author: marscher
+'''
+import unittest
+
+from pyemma.coordinates.io.data_in_memory import DataInMemory
+from pyemma.util.log import getLogger
+import numpy as np
+
+logger = getLogger('TestDataInMemory')
+
+
+class TestDataInMemory(unittest.TestCase):
+
+ def testListOfArrays(self):
+
+ frames_per_traj = 100
+ dim = 3
+ data = [np.random.random((frames_per_traj, dim)) for i in xrange(3)]
+
+ d = DataInMemory(data)
+
+ self.assertEqual(d.dimension(), dim)
+
+ self.assertEqual(
+ d.trajectory_lengths(), [frames_per_traj for i in xrange(3)])
+
+ def testDataArray(self):
+ frames_per_traj = 100
+ dim = 3
+
+ data = np.random.random((frames_per_traj, dim))
+ d = DataInMemory(data)
+
+ self.assertEqual(
+ d.trajectory_lengths(), [frames_per_traj for i in xrange(1)])
+
+ def testChunkedAccess(self):
+ frames_per_traj = 100
+ dim = 3
+ data = [np.random.random((frames_per_traj, dim)) for i in xrange(3)]
+
+ d = DataInMemory(data)
+ d.chunksize = 6
+ itraj = 0
+ last_chunk = False
+ while not last_chunk:
+ last_chunk_in_traj = False
+ t = 0
+ while not last_chunk_in_traj:
+ # iterate over times within trajectory
+ X = d.next_chunk()
+ L = np.shape(X)[0]
+ # last chunk in traj?
+ last_chunk_in_traj = (t + 0 + L >= d.trajectory_length(itraj))
+ # last chunk?
+ last_chunk = (
+ last_chunk_in_traj and itraj >= d.number_of_trajectories() - 1)
+ # increment time
+ t += L
+ # increment trajectory
+ itraj += 1
+
+ @unittest.skip("known to be broken.")
+ def testChunkedLaggedAccess(self):
+ frames_per_traj = 100
+ dim = 3
+ lag = 3
+ chunksize = 6
+ data = [np.random.random((frames_per_traj, dim)) for i in xrange(3)]
+
+ d = DataInMemory(data)
+ d.chunksize = chunksize
+ itraj = 0
+ last_chunk = False
+ ipass = 3
+ while ipass > 0:
+ logger.debug("ipass : %i " % ipass)
+ while not last_chunk:
+ last_chunk_in_traj = False
+ t = 0
+ while not last_chunk_in_traj:
+ # iterate over times within trajectory
+ X, Y = d.next_chunk(lag=lag)
+ assert X is not None
+ assert Y is not None
+ L = np.shape(X)[0]
+ K = np.shape(Y)[0]
+ assert L == K, "data in memory gave different chunksizes"
+ # last chunk in traj?
+ last_chunk_in_traj = (
+ t + lag + L >= d.trajectory_length(itraj))
+ # last chunk?
+ last_chunk = (
+ last_chunk_in_traj and itraj >= d.number_of_trajectories() - 1)
+ # increment time
+ t += L
+ # increment trajectory
+ itraj += 1
+ ipass -= 1
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/pyemma/coordinates/tests/test_discretizer.py b/pyemma/coordinates/tests/test_discretizer.py
new file mode 100644
index 000000000..3cebe23f4
--- /dev/null
+++ b/pyemma/coordinates/tests/test_discretizer.py
@@ -0,0 +1,104 @@
+'''
+Created on 19.01.2015
+
+@author: marscher
+'''
+import itertools
+import os
+import tempfile
+import unittest
+import mdtraj
+import numpy as np
+
+from mdtraj.core.trajectory import Trajectory
+from mdtraj.core.element import hydrogen, oxygen
+from mdtraj.core.topology import Topology
+
+from pyemma.coordinates.clustering.uniform_time import UniformTimeClustering
+from ..discretizer import Discretizer
+from ..io.feature_reader import FeatureReader
+from ..transform.pca import PCA
+
+
+def create_water_topology_on_disc(n):
+ topfile = tempfile.mkstemp('.pdb')[1]
+ top = Topology()
+ chain = top.add_chain()
+
+ for i in xrange(n):
+ res = top.add_residue('r%i' % i, chain)
+ h1 = top.add_atom('H', hydrogen, res)
+ o = top.add_atom('O', oxygen, res)
+ h2 = top.add_atom('H', hydrogen, res)
+ top.add_bond(h1, o)
+ top.add_bond(h2, o)
+
+ xyz = np.zeros((n * 3, 3))
+ Trajectory(xyz, top).save_pdb(topfile)
+ return topfile
+
+
+def create_traj_on_disc(topfile, n_frames, n_atoms):
+ fn = tempfile.mktemp('.xtc')
+ xyz = np.random.random((n_frames, n_atoms, 3))
+ t = mdtraj.load(topfile)
+ t.xyz = xyz
+ t.time = np.arange(n_frames)
+ t.save(fn)
+ return fn
+
+
+class TestDiscretizer(unittest.TestCase):
+
+ @classmethod
+ def setUpClass(cls):
+ c = super(TestDiscretizer, cls).setUpClass()
+        # create a fake water topology on disc and two trajectories with
+        # random coordinates.
+ cls.n_frames = 1000
+ cls.n_residues = 30
+ cls.topfile = create_water_topology_on_disc(cls.n_residues)
+
+ # create some trajectories
+ t1 = create_traj_on_disc(
+ cls.topfile, cls.n_frames, cls.n_residues * 3)
+
+ t2 = create_traj_on_disc(
+ cls.topfile, cls.n_frames, cls.n_residues * 3)
+
+ cls.trajfiles = [t1, t2]
+
+ return c
+
+ @classmethod
+ def tearDownClass(cls):
+ """delete temporary files"""
+ os.unlink(cls.topfile)
+ for f in cls.trajfiles:
+ os.unlink(f)
+
+ def test(self):
+ reader = FeatureReader(self.trajfiles, self.topfile)
+ # select all possible distances
+ pairs = np.array(
+ [x for x in itertools.combinations(range(self.n_residues), 2)])
+
+ reader.featurizer.distances(pairs)
+
+        pca = PCA(output_dimension=2)
+
+        n_clusters = 2
+        clustering = UniformTimeClustering(k=n_clusters)
+
+        D = Discretizer(reader, transform=pca, cluster=clustering)
+ D.run()
+
+ self.assertEqual(len(D.dtrajs), len(self.trajfiles))
+
+ for dtraj in clustering.dtrajs:
+ unique = np.unique(dtraj)
+ self.assertEqual(unique.shape[0], n_clusters)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/pyemma/coordinates/tests/test_featurereader.py b/pyemma/coordinates/tests/test_featurereader.py
new file mode 100644
index 000000000..fead9d84b
--- /dev/null
+++ b/pyemma/coordinates/tests/test_featurereader.py
@@ -0,0 +1,172 @@
+'''
+Created on 23.01.2015
+
+@author: marscher
+'''
+import mdtraj
+import os
+import tempfile
+import unittest
+
+from pyemma.coordinates.io.feature_reader import FeatureReader
+from pyemma.coordinates.transform.transformer import Transformer
+from pyemma.util.log import getLogger
+import pkg_resources
+
+from pyemma.coordinates.util.chaining import build_chain, run_chain
+import numpy as np
+import cProfile
+
+
+log = getLogger('TestFeatureReader')
+
+
+class TestFeatureReader(unittest.TestCase):
+
+ @classmethod
+ def setUpClass(cls):
+ c = super(TestFeatureReader, cls).setUpClass()
+ # create a fake trajectory which has 2 atoms and coordinates are just a range
+ # over all frames.
+ cls.trajfile = tempfile.mktemp('.xtc')
+ cls.n_frames = 1000
+ cls.xyz = np.arange(cls.n_frames * 2 * 3).reshape((cls.n_frames, 2, 3))
+ log.debug("shape traj: %s" % str(cls.xyz.shape))
+ cls.topfile = pkg_resources.resource_filename(
+ 'pyemma.coordinates.tests.test_featurereader', 'data/test.pdb')
+ t = mdtraj.load(cls.topfile)
+ t.xyz = cls.xyz
+ t.time = np.arange(cls.n_frames)
+ t.save(cls.trajfile)
+ return c
+
+ @classmethod
+ def tearDownClass(cls):
+ os.unlink(cls.trajfile)
+ super(TestFeatureReader, cls).tearDownClass()
+
+ def setUp(self):
+ self.pr = cProfile.Profile()
+ self.pr.enable()
+
+ def tearDown(self):
+ from pstats import Stats
+ p = Stats(self.pr)
+ p.sort_stats('cumtime')
+ # p.print_stats()
+ p.print_callers('heavy_function')
+
+ def testIteratorAccess(self):
+ reader = FeatureReader(self.trajfile, self.topfile)
+
+ frames = 0
+ data = []
+ for i, X in reader:
+ frames += X.xyz.shape[0]
+ data.append(X.xyz)
+
+ # restore shape of input
+ data = np.array(data).reshape(self.xyz.shape)
+
+ self.assertEqual(frames, reader.trajectory_lengths()[0])
+ np.testing.assert_equal(data, self.xyz)
+
+ def testIteratorAccess2(self):
+ reader = FeatureReader([self.trajfile, self.trajfile], self.topfile)
+ reader.chunksize = 100
+
+ frames = 0
+ data = []
+ for i, X in reader:
+ frames += X.xyz.shape[0]
+ data.append(X.xyz)
+ self.assertEqual(frames, reader.trajectory_lengths()[0] * 2)
+ # restore shape of input
+ data = np.array(
+ data[0:reader.trajectory_lengths()[0] / reader.chunksize]).reshape(self.xyz.shape)
+
+ np.testing.assert_equal(data, self.xyz)
+
+ def testTimeLaggedIterator(self):
+ lag = 10
+ reader = FeatureReader(self.trajfile, self.topfile)
+ reader.lag = lag
+ frames = 0
+ data = []
+ lagged = []
+ for _, X, Y in reader:
+ frames += X.xyz.shape[0]
+ data.append(X.xyz)
+ lagged.append(Y.xyz)
+
+ merged_lagged = np.array(lagged).reshape(self.xyz.shape)
+
+ # reproduce outcome
+ fake_lagged = np.empty_like(self.xyz)
+ fake_lagged[:-lag] = self.xyz[lag:]
+ fake_lagged[-lag:] = 0
+
+ np.testing.assert_equal(merged_lagged, fake_lagged)
+
+ # restore shape of input
+ data = np.array(data).reshape(self.xyz.shape)
+
+ self.assertEqual(frames, reader.trajectory_lengths()[0])
+ np.testing.assert_equal(data, self.xyz)
+
+ @unittest.skip("")
+ def testTimeLaggedAccess(self):
+ # each frame has 2 atoms with 3 coords = 6 coords per frame.
+ # coords are sequential through all frames and start with 0.
+
+ lags = [2, 200]
+
+ chunksizes = [1, 100]
+
+ for lag in lags:
+ for chunksize in chunksizes:
+ log.info("chunksize=%i\tlag=%i" % (chunksize, lag))
+
+ lagged_chunks = []
+ reader = FeatureReader(self.trajfile, self.topfile)
+ reader.chunksize = chunksize
+ reader.lag = lag
+ for _, _, y in reader:
+ lagged_chunks.append(y.xyz)
+
+ coords = self.xyz
+
+ for ii, c in enumerate(lagged_chunks[:-1]):
+ # all despite last chunk shall have chunksize
+ self.assertEqual(c.shape[0], chunksize)
+ # first lagged chunk should start at lag and stop at chunksize +
+ # lag
+ ind1 = ii * chunksize + lag
+ ind2 = ind1 + chunksize
+ #log.debug("coor slice[%i: %i]" % (ind1, ind2))
+ np.testing.assert_allclose(c, coords[ind1:ind2])
+
+ # TODO: check last lagged frame
+
+ # last lagged chunk should miss "lag" frames of input! e.g
+ # padded to maintain chunksize
+
+ last_chunk = lagged_chunks[-1]
+ # print last_chunk
+ # when is last_chunk padded?
+ # if
+ # how many zeros are going to be used?
+
+
+# expected = np.empty((chunksize, 2, 3))
+# for ii, c in enumerate(xrange(chunksize)):
+# c += 1
+# expected[ii] = coords[-c]
+# expected = np.array((coords[-2], coords[-1]))
+# print last_chunk
+# print "-"*10
+# print expected
+ #np.testing.assert_allclose(last_chunk, expected)
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/pyemma/coordinates/tests/test_kmeans.py b/pyemma/coordinates/tests/test_kmeans.py
new file mode 100644
index 000000000..3b35ad305
--- /dev/null
+++ b/pyemma/coordinates/tests/test_kmeans.py
@@ -0,0 +1,26 @@
+'''
+Created on 28.01.2015
+
+@author: marscher
+'''
+import unittest
+from pyemma.coordinates.clustering.kmeans import KmeansClustering
+from test_regspace import RandomDataSource
+
+
+class TestKmeans(unittest.TestCase):
+
+ def setUp(self):
+ self.k = 5
+ self.kmeans = KmeansClustering(n_clusters=self.k, max_iter=100)
+
+ self.kmeans.data_producer = RandomDataSource()
+
+ def testName(self):
+ self.kmeans.parametrize()
+
+ assert self.kmeans.dtrajs[0].dtype == int
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/pyemma/coordinates/tests/test_regspace.py b/pyemma/coordinates/tests/test_regspace.py
new file mode 100644
index 000000000..6bb1ba269
--- /dev/null
+++ b/pyemma/coordinates/tests/test_regspace.py
@@ -0,0 +1,108 @@
+'''
+Created on 26.01.2015
+
+@author: marscher
+'''
+import itertools
+import unittest
+import logging
+
+from pyemma.coordinates.clustering.regspace import RegularSpaceClustering, log
+import numpy as np
+import cProfile
+
+log.setLevel(logging.ERROR)
+
+
+class RandomDataSource:
+
+ def __init__(self, chunksize=1000, a=None, b=None):
+ self.n_samples = 5
+ self.data = np.random.random((self.n_samples, chunksize, 3))
+ if a is not None and b is not None:
+ self.data *= (b - a)
+ self.data += a
+ self.i = -1
+
+ def next_chunk(self, lag=0):
+ self.i += 1
+ return self.data[self.i]
+
+ def reset(self):
+ self.i = -1
+
+ def trajectory_length(self, itraj):
+ return self.data[itraj].shape[0]
+
+ def number_of_trajectories(self):
+ return self.data.shape[0]
+
+ @staticmethod
+ def distance(x, y):
+ return np.linalg.norm(x - y, 2)
+
+ @staticmethod
+ def distances(x, Y):
+ return np.linalg.norm(Y - x, 2, axis=1)
+
+
+class TestRegSpaceClustering(unittest.TestCase):
+
+ @classmethod
+ def setUpClass(cls):
+ super(TestRegSpaceClustering, cls).setUpClass()
+ np.random.seed(0)
+
+ def setUp(self):
+ self.dmin = 0.3
+ self.clustering = RegularSpaceClustering(dmin=self.dmin)
+ self.clustering.data_producer = RandomDataSource()
+ #self.pr = cProfile.Profile()
+ #self.pr.enable()
+ #print "*" * 80
+
+
+ def tearDown(self):
+ pass
+# from pstats import Stats
+# p = Stats(self.pr)
+# p.strip_dirs()
+#
+# p.sort_stats('cumtime')
+# p.print_stats()
+#
+# print "*" * 80
+
+ def testAlgo(self):
+ self.clustering.parametrize()
+
+ assert self.clustering.dtrajs[0].dtype == int
+
+ # assert distance for each centroid is at least dmin
+ for c in itertools.combinations(self.clustering.clustercenters, 2):
+ if np.allclose(c[0], c[1]): # skip equal pairs
+ continue
+
+ dist = np.linalg.norm(c[0] - c[1], 2)
+
+ self.assertGreaterEqual(dist, self.dmin, "centroid pair\n%s\n%s\n has smaller"
+ " distance than dmin(%f): %f" % (c[0], c[1], self.dmin, dist))
+
+ def testAssignment(self):
+ self.clustering.parametrize()
+
+ assert len(self.clustering.clustercenters) > 1
+
+ # num states == num _clustercenters?
+ self.assertEqual(len(np.unique(self.clustering.dtrajs)), len(
+ self.clustering.clustercenters), "number of unique states in dtrajs"
+ " should be equal.")
+
+ def testSpreadData(self):
+ self.clustering.data_producer = RandomDataSource(a=-2, b=2)
+ self.clustering.dmin = 2
+ self.clustering.parametrize()
+
+
+if __name__ == "__main__":
+ unittest.main()
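The invariant asserted in testAlgo follows from the regular space rule itself: a frame is made a new cluster center only if it lies farther than dmin from every existing center. A compact sketch of that rule (illustrative, not the class under test):

```python
import numpy as np

def regspace_centers(data, dmin):
    # greedy regular-space selection: keep a frame as a new center
    # iff its distance to every existing center exceeds dmin
    centers = [data[0]]
    for x in data[1:]:
        if np.all(np.linalg.norm(np.asarray(centers) - x, axis=1) > dmin):
            centers.append(x)
    return np.array(centers)

# by construction, all returned centers are pairwise more than dmin apart,
# which is exactly what testAlgo checks on the clustercenters
centers = regspace_centers(np.random.random((1000, 3)), dmin=0.3)
```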
diff --git a/pyemma/coordinates/tests/test_tica.py b/pyemma/coordinates/tests/test_tica.py
new file mode 100644
index 000000000..416d56f2b
--- /dev/null
+++ b/pyemma/coordinates/tests/test_tica.py
@@ -0,0 +1,89 @@
+'''
+Created on 02.02.2015
+
+@author: marscher
+'''
+import unittest
+import numpy as np
+
+from pyemma.coordinates.io.data_in_memory import DataInMemory
+from pyemma.coordinates.transform.tica import TICA
+from pyemma.util.log import getLogger
+
+logger = getLogger('TestTICA')
+
+
+class TestTICA(unittest.TestCase):
+
+ def test(self):
+ np.random.seed(0)
+
+ tica = TICA(lag=50, output_dimension=1)
+ data = np.random.randn(100, 10)
+ ds = DataInMemory(data)
+ tica.data_producer = ds
+
+ tica.parametrize()
+
+ Y = tica.map(data)
+
+ def test_duplicated_data(self):
+ tica = TICA(lag=1, output_dimension=1)
+
+ # make some data that has one column repeated twice
+ X = np.random.randn(100, 2)
+ X = np.hstack((X, X[:, 0, np.newaxis]))
+
+ d = DataInMemory(X)
+
+ tica.data_producer = d
+ tica.parametrize()
+
+ assert tica.eigenvectors.dtype == np.float64
+ assert tica.eigenvalues.dtype == np.float64
+
+ def test_singular_zeros(self):
+ tica = TICA(lag=1, output_dimension=1)
+
+ # make some data that has one column of all zeros
+ X = np.random.randn(100, 2)
+ X = np.hstack((X, np.zeros((100, 1))))
+
+ d = DataInMemory(X)
+
+ tica.data_producer = d
+ tica.parametrize()
+
+ assert tica.eigenvectors.dtype == np.float64
+ assert tica.eigenvalues.dtype == np.float64
+
+ @unittest.skip("datainmemory does currently not support chunking.")
+ def testChunksizeResultsTica(self):
+ chunk = 40
+ lag = 100
+ np.random.seed(0)
+ X = np.random.randn(23000, 3)
+
+ # un-chunked
+ d = DataInMemory(X)
+
+ tica = TICA(lag=lag, output_dimension=1)
+ tica.data_producer = d
+ tica.parametrize()
+
+ cov = tica.cov.copy()
+ mean = tica.mu.copy()
+
+ # ------- run again with new chunksize -------
+ d = DataInMemory(X)
+ d.chunksize = chunk
+ tica = TICA(lag=lag, output_dimension=1)
+ tica.data_producer = d
+
+ tica.parametrize()
+
+ np.testing.assert_allclose(tica.mu, mean)
+ np.testing.assert_allclose(tica.cov, cov)
+
+if __name__ == "__main__":
+ unittest.main()
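For context, the quantities checked here (mu, cov and the time-lagged covariance) enter TICA through a generalized eigenvalue problem on the mean-free coordinates:

```latex
C(\tau)\, v_i = \lambda_i\, C(0)\, v_i,
\qquad
C(\tau)_{jk} = \big\langle \tilde x_j(t)\, \tilde x_k(t+\tau) \big\rangle_t,
\qquad
\tilde x = x - \mu
```

The eigenvectors belonging to the largest eigenvalues define the slowest linear projections of the data.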
diff --git a/pyemma/coordinates/tests/test_writer.py b/pyemma/coordinates/tests/test_writer.py
new file mode 100644
index 000000000..cdc9c225b
--- /dev/null
+++ b/pyemma/coordinates/tests/test_writer.py
@@ -0,0 +1,36 @@
+'''
+Created on 22.01.2015
+
+@author: marscher
+'''
+import os
+import tempfile
+import unittest
+import numpy as np
+
+from pyemma.coordinates.io.writer import WriterCSV
+from pyemma.coordinates.io.data_in_memory import DataInMemory
+
+
+class TestWriterCSV(unittest.TestCase):
+
+ def setUp(self):
+ self.output_file = tempfile.mktemp('', 'test_writer_csv')
+
+ def tearDown(self):
+ os.unlink(self.output_file)
+
+ def testWriter(self):
+ writer = WriterCSV(self.output_file)
+ data = np.random.random((100, 3))
+ dm = DataInMemory(data)
+ writer.data_producer = dm
+
+ writer.parametrize()
+
+ # open file and compare data
+ output = np.loadtxt(self.output_file)
+ np.testing.assert_allclose(output, data)
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/pyemma/coordinates/transform/__init__.py b/pyemma/coordinates/transform/__init__.py
index fe31da680..8263b1c6a 100644
--- a/pyemma/coordinates/transform/__init__.py
+++ b/pyemma/coordinates/transform/__init__.py
@@ -1,35 +1,15 @@
r"""
-
===============================================================================
transform - Transformation Utilities (:mod:`pyemma.coordinates.transform`)
===============================================================================
-
.. currentmodule:: pyemma.coordinates.transform
-Order parameters
-================
-
-.. autosummary::
- :toctree: generated/
-
- createtransform_selection - select a given set of atoms
- createtransform_distances - compute intramolecular distances
- createtransform_angles - compute angles
- createtransform_dihedrals - compute dihedral
- createtransform_minrmsd - compute minrmsd distance
-
-
-Complex transformations
-========================
-
.. autosummary::
- :toctree: generated/
-
- pca - principal components
- tica - time independent components
- transform_file
- transform_trajectory
+ :toctree: generated/
+ PCA - principal components
+ TICA - time independent components
"""
-from .api import *
-from .util import *
+
+from .pca import *
+from .tica import *
diff --git a/pyemma/coordinates/transform/api.py b/pyemma/coordinates/transform/api.py
deleted file mode 100644
index a698ea3bc..000000000
--- a/pyemma/coordinates/transform/api.py
+++ /dev/null
@@ -1,283 +0,0 @@
-r"""
-
-=========================
-coordinates.transform API
-=========================
-
-Created on Dec 30, 2013
-
-@author: noe
-
-"""
-
-__docformat__ = "restructuredtext en"
-
-import numpy as np
-import pyemma.util.pystallone as stallone
-import general_transform
-import linear_transform
-
-# shortcuts to stallone API
-coorNew = stallone.API.coorNew
-coor = stallone.API.coor
-dataNew = stallone.API.dataNew
-data = stallone.API.data
-# shortcuts to python
-CoordinateTransform = general_transform.CoordinateTransform
-PCA = linear_transform.PCA
-TICA = linear_transform.TICA
-
-
-__author__ = "Martin Scherer, Fabian Paul, Frank Noe"
-__copyright__ = "Copyright 2014, Computational Molecular Biology Group, FU-Berlin"
-__credits__ = ["Martin Scherer", "Fabian Paul", "Frank Noe"]
-__license__ = "FreeBSD"
-__version__ = "2.0.0"
-__maintainer__ = "Martin Scherer"
-__email__="m.scherer AT fu-berlin DOT de"
-
-__all__=['createtransform_selection',
- 'createtransform_distances',
- 'createtransform_angles',
- 'createtransform_dihedrals',
- 'createtransform_minrmsd',
- 'pca',
- 'tica',
- 'transform_file',
- 'transform_trajectory']
-
-################################################################################
-# Compute Order parameters from coordinates
-################################################################################
-
-def createtransform_selection(selection):
- r"""Creates a transform that will select the given set of atoms.
-
- Parameters
- ----------
- selection : integer list
- list of selected atoms
-
- """
- jsel = stallone.jarray(selection)
- T = CoordinateTransform(coorNew.transform_selection(jsel))
- return T
-
-def createtransform_distances(selection, selection2=None):
- r"""Creates a transform that computes intramolecular distances
- between the selected atoms.
-
- By default, the distances between all atoms in the given selection
- will be computed. When one selection (N integers or N tuples of
- integers) is given, the N(N+1)/2 distances dij with i
- version 2013-02-22 feb */
-
-#include <Python.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <locale.h>
-
-#include <numpy/arrayobject.h>
-
-#define D(obj,i) (((double*)((obj)->data))[i])
-
-// TODO: move dcd reader impl to stallone (java)
-
-/* class for file readers */
-typedef struct reader_struct {
- int (*init)(struct reader_struct*, char*);
- int (*read_record)(struct reader_struct, double*, int);
- FILE *stream;
- int swap;
- unsigned charmm, charmm_4dims, charmm_extrablock, n;
- long endoffile;
-} reader;
-
-/* Split a line of numers into an array of doubles
- * stream: input stream
- * record: output array of doubles
- * size: input maximum space in record array
- * if zero, don't fill record, just count entries
- * returns the number of doubles read into record or 0 in case
- * of eof or -1 in case of stream read error
- */
-#define NUMBER 0
-#define SPACE 1
-int read_record_dat(reader r, double* record, int size) {
- int ch;
- int ocount, icount;
- int state;
- char buffer[0x100];
- char *bufptr;
- double d;
-
- if (ferror(r.stream))
- return -1;
- if (feof(r.stream))
- return 0;
-
- ocount = 0;
- icount = 0;
- ch = getc(r.stream);
- bufptr = buffer;
-
- if (isspace(ch))
- state = SPACE;
- else
- state = NUMBER;
-
- while (ch != '\n' && ch != -1) {
- switch (state) {
- case NUMBER:
- if (isdigit(ch) || ch == '.' || ch == '-' || ch == '+' || ch == 'e'
- || ch == 'E') {
- *bufptr = ch;
- bufptr++;
- if (bufptr - buffer >= 0x100)
- return -1;
- } else if (isspace(ch) || ch == ',') {
- *bufptr = '\0';
- d = atof(buffer);
- if (size > 0 && ocount >= size)
- return -1;
- if (size > 0)
- record[ocount] = d;
- ocount++;
- icount++;
- bufptr = buffer;
- state = SPACE;
- } else
- return -1;
- break;
- case SPACE:
- if (!isspace(ch)) {
- state = NUMBER;
- ungetc(ch, r.stream);
- }
- break;
- }
- ch = getc(r.stream);
- }
- if (bufptr != buffer) { /* last number was not converted */
- *bufptr = '\0';
- d = atof(buffer);
- if (size > 0 && ocount >= size)
- return -1;
- if (size > 0)
- record[ocount] = d;
- ocount++;
- }
-
- if (ferror(r.stream))
- return -1;
- else
- return ocount;
-}
-
-int init_dat(reader *r, char *fname) {
- int n;
- r->stream = fopen(fname, "r");
- if (!r->stream)
- return -1;
- n = read_record_dat(*r, NULL, 0);
- if (n < 0)
- return n;
- else {
- rewind(r->stream);
- return n;
- }
-}
-
-float reverse_float(const float x) {
- float result;
- char *input = (char*) &x;
- char *output = (char*) &result;
-
- output[0] = input[3];
- output[1] = input[2];
- output[2] = input[1];
- output[3] = input[0];
-
- return result;
-}
-
-unsigned long reverse_int(const unsigned long x) {
- unsigned long result;
- char *input = (char*) &x;
- char *output = (char*) &result;
-
- output[0] = input[3];
- output[1] = input[2];
- output[2] = input[1];
- output[3] = input[0];
-
- return result;
-}
-
-int read_int(unsigned int *value, FILE *stream, int swap) {
- int i;
- if (fread(&i, 4, 1, stream) != 1)
- return -1;
- if (swap)
- *value = reverse_int(i);
- else
- *value = i;
- return 1;
-}
-
-int read_float(double *value, FILE *stream, int swap) {
- float f;
- if (fread(&f, 4, 1, stream) != 1)
- return -1;
- if (swap)
- *value = (double) reverse_float(f);
- else
- *value = (double) f;
- return 1;
-}
-
-int init_dcd(reader *r, char *fname) {
- unsigned key, nset, istart, nsavc, i, newsize, numlines, namnf;
- char cord[4];
-
- printf("using dcd reader ["); // debug
-
- r->stream = fopen(fname, "r");
- if (!r->stream)
- return -1;
-
- if (fread(&key, 4, 1, r->stream) != 1)
- return -1;
- if (key != 84) {
- r->swap = 1;
- printf("byte order is reversed, "); // debug
- } else
- r->swap = 0;
-
- // Figure out how big the file is
- if (fseek(r->stream, 0, SEEK_END) != 0)
- return -1;
- r->endoffile = ftell(r->stream);
- if (r->endoffile == -1)
- return -1;
- if (fseek(r->stream, 4, SEEK_SET) != 0)
- return -1;
-
- // Read CORD
- if (fread(cord, 1, 4, r->stream) != 4)
- return -1;
- printf("sentinel %c%c%c%c, ", cord[0], cord[1], cord[2], cord[3]); // debug
- // Read NSET
- if (read_int(&nset, r->stream, r->swap) != 1)
- return -1;
- if (read_int(&istart, r->stream, r->swap) != 1)
- return -1;
- if (read_int(&nsavc, r->stream, r->swap) != 1)
- return -1;
-
- // Reposition to 40 from beginning; read namnf, number of free atoms
- printf("test namnf, "); // debug
- if (fseek(r->stream, 40, SEEK_SET) != 0)
- return -1;
- if (read_int(&namnf, r->stream, r->swap) != 1)
- return -1;
- if (namnf != 0) {
- fprintf(stderr, "namnf file format not supported.\n");
- return -1;
- }
-
- // Figure out if we're CHARMm or not
- if (fseek(r->stream, 84, SEEK_SET) != 0)
- return -1;
- if (read_int(&i, r->stream, r->swap) != 1)
- return -1;
- printf("charmm? %d, ", i != 0); // debug
- if (i == 0)
- r->charmm = 0;
- else {
- r->charmm = 1;
- r->charmm_extrablock = 0;
- r->charmm_4dims = 0;
- // check for extra block
- if (fseek(r->stream, 48, SEEK_SET) != 0)
- return -1;
- if (read_int(&i, r->stream, r->swap) != 1)
- return -1;
- if (i == 1)
- r->charmm_extrablock = 1;
- printf("extrablock %d, ", r->charmm_extrablock); // debug
- // check for 4 dims
- if (fseek(r->stream, 52, SEEK_SET) != 0)
- return -1;
- if (read_int(&i, r->stream, r->swap) != 1)
- return -1;
- if (i == 1)
- r->charmm_4dims = 1;
- printf("charmm_4dims %d, ", r->charmm_4dims); // debug
- }
-
- // Get the size of the next block, and skip it
- // This is the title
- if (fseek(r->stream, 92, SEEK_SET) != 0)
- return -1;
- if (read_int(&newsize, r->stream, r->swap) != 1)
- return -1;
- //printf("newsize %d, ", newsize); // debug
- if (read_int(&numlines, r->stream, r->swap) != 1)
- return -1;
- //printf("numlines %d, ", numlines); // debug
- if (fseek(r->stream, numlines * 80, SEEK_CUR) != 0)
- return -1;
- if (read_int(&newsize, r->stream, r->swap) != 1)
- return -1;
- //printf("newsize %d, ", newsize); // debug
-
- // Read in a 4, then the number of atoms, then another 4
- if (read_int(&i, r->stream, r->swap) != 1)
- return -1;
- if (read_int(&r->n, r->stream, r->swap) != 1)
- return -1;
- if (read_int(&i, r->stream, r->swap) != 1)
- return -1;
- printf("natoms %d ]\n", r->n); // debug
-
- return r->n * 3;
-}
-
-int read_record_dcd(reader r, double* record, int size) {
- unsigned blocksize;
- int i;
- long n;
-
- n = ftell(r.stream);
- //printf("we are at %d, end is at %d\n", n, r.endoffile); // debug
- if (n > r.endoffile) {
- fprintf(stderr, "overshot file end\n");
- return -1;
- }
- if (n == -1)
- return -1;
- if (n >= r.endoffile)
- return 0;
-
- // If this is a CHARMm file and contains an extra data block, we must skip it
- if (r.charmm && r.charmm_extrablock) {
- //printf("skipping block\n"); // debug
- if (read_int(&blocksize, r.stream, r.swap) != 1)
- return -1;
- if (fseek(r.stream, blocksize, SEEK_CUR) != 0)
- return -1;
- if (read_int(&blocksize, r.stream, r.swap) != 1)
- return -1;
- }
-
- // Get x coordinates
- if (read_int(&blocksize, r.stream, r.swap) != 1)
- return -1;
- //printf("read X %d\n", blocksize); // debug
- if (3 * blocksize / 4 > size)
- return -1;
- for (i = 0; i < blocksize / 4; i++)
- if (read_float(&record[3 * i], r.stream, r.swap) != 1)
- return -1;
- if (read_int(&blocksize, r.stream, r.swap) != 1)
- return -1;
- //printf("read X %d\n", blocksize); // debug
-
- // Get y coordinates
- if (read_int(&blocksize, r.stream, r.swap) != 1)
- return -1;
- //printf("read Y %d\n", blocksize); // debug
- if (3 * blocksize / 4 > size)
- return -1;
- for (i = 0; i < blocksize / 4; i++)
- if (read_float(&record[3 * i + 1], r.stream, r.swap) != 1)
- return -1;
- if (read_int(&blocksize, r.stream, r.swap) != 1)
- return -1;
- //printf("read Y %d\n", blocksize); // debug
-
- // Get z coordinates
- if (read_int(&blocksize, r.stream, r.swap) != 1)
- return -1;
- //printf("read Z %d\n", blocksize); // debug
- if (3 * blocksize / 4 > size)
- return -1;
- for (i = 0; i < blocksize / 4; i++)
- if (read_float(&record[3 * i + 2], r.stream, r.swap) != 1)
- return -1;
- if (read_int(&blocksize, r.stream, r.swap) != 1)
- return -1;
- //printf("read Z %d\n", blocksize); // debug
-
- // Skip the 4th dimension, if present
- if (r.charmm && r.charmm_4dims) {
- //printf("skip 4D\n"); // debug
- if (read_int(&blocksize, r.stream, r.swap) != 1)
- return -1;
- if (fseek(r.stream, blocksize, SEEK_CUR) != 0)
- return -1;
- if (read_int(&blocksize, r.stream, r.swap) != 1)
- return -1;
- }
-
- return 3 * blocksize / 4;
-}
-
-reader reader_by_fname(char *fname) {
- reader r;
- int l;
- l = strlen(fname);
- if (l >= 4 && fname[l - 4] == '.' && tolower(fname[l - 3]) == 'd'
- && tolower(fname[l - 2]) == 'c' && tolower(fname[l - 1]) == 'd') {
- r.init = init_dcd;
- r.read_record = read_record_dcd;
- } else {
- r.init = init_dat;
- r.read_record = read_record_dat;
- }
- return r;
-}
-
-/**
- * TODO: instead of using a dictionary, parameterize a java function
- * with a list of filenames, calculate everything on the java side and return a
- * matrix to python.
- */
-PyArrayObject* unpack_array(PyObject *obj, int dim1, int dim2) {
- PyArrayObject *cont_array;
- cont_array = NULL;
-
- if (!obj)
- goto error;
-
- // scalar
- if (dim1 == 1 && dim2 == 1) {
- cont_array = (PyArrayObject*) (PyArray_ContiguousFromAny(obj,
- PyArray_DOUBLE, 0, 0));
- if (!cont_array)
- goto error;
- } else { // vector
- if (dim2 == 1) {
- cont_array = (PyArrayObject*) (PyArray_ContiguousFromAny(obj,
- PyArray_DOUBLE, 1, 1));
- if (!cont_array)
- goto error;
- if (cont_array->dimensions[0] != dim1) {
- PyErr_SetString(PyExc_ValueError,
- "Number of order parameters and length of mean/std/weights differ. Forgot time column?");
- goto error;
- }
- } else { // matrix
- cont_array = (PyArrayObject*) (PyArray_ContiguousFromAny(obj,
- PyArray_DOUBLE, 2, 2));
- if (!cont_array)
- goto error;
- if (cont_array->dimensions[0] != dim1
- || cont_array->dimensions[1] != dim2) {
- PyErr_SetString(PyExc_ValueError,
- "Number of order parameters and length of mean/std/weights differ. Forgot time column?");
- goto error;
- }
- }
- }
- /* fall through */
-
- error: return cont_array;
-}
-
-PyArrayObject* new_array(int dim1, int dim2) {
- PyObject *array;
- PyArrayObject *cont_array;
- npy_intp dims[2];
- int i;
-
- array = NULL;
- cont_array = NULL;
- dims[0] = dim1;
- dims[1] = dim2;
- if (dim1 == 1 && dim2 == 1)
- array = PyArray_SimpleNew(0, dims, PyArray_DOUBLE);
- else if (dim2 == 1)
- array = PyArray_SimpleNew(1, dims, PyArray_DOUBLE);
- else
- array = PyArray_SimpleNew(2, dims, PyArray_DOUBLE);
- if (!array)
- goto error;
- cont_array = (PyArrayObject*) PyArray_ContiguousFromAny(array,
- PyArray_DOUBLE, 0, 2);
- if (!cont_array)
- goto error;
- Py_DECREF(array);
- /* initialize array content to zero */
- for (i = 0; i < dim1 * dim2; i++)
- D(cont_array,i) = 0.0;
- return cont_array;
-
- error: if (array)
- Py_DECREF(array);
- if (cont_array)
- Py_DECREF(cont_array);
- return NULL;
-}
-
-PyArrayObject* unpack_or_make(PyObject *dict, char *name, int dim1, int dim2) {
- PyObject *array;
- PyArrayObject *cont_array;
-
- cont_array = NULL;
- array = NULL;
-
- array = PyDict_GetItemString(dict, name); /*b*/
- if (!array) { /* make default object */
- cont_array = new_array(dim1, dim2);
- if (!cont_array)
- goto error;
- /* store a ref to the new data in the dict */
- if (PyDict_SetItemString(dict, name, (PyObject*) cont_array) != 0)
- goto error;
- } else { /* extract data from dict */
- cont_array = unpack_array(array, dim1, dim2);
- if (!cont_array)
- goto error;
- /* If unpack_array happened to copy the array to make it contiguous
- we need to store it again in the dict. We make our lives easier
- by always storing it again, regardless of what happened. */
- if (PyDict_SetItemString(dict, name, (PyObject*) cont_array) != 0)
- goto error;
- }
-
- return cont_array;
-
- error: if (cont_array)
- Py_DECREF(cont_array);
- return NULL;
-}
-
-void update_mean(const double *record, PyArrayObject* mean, int n) {
- int i;
- #pragma omp parallel for private(i)
- for (i = 0; i < n; i++)
- D(mean,i) += record[i];
-}
-
-void update_var(const double *record, PyArrayObject* var, int n) {
- int i;
- #pragma omp parallel for private(i)
- for (i = 0; i < n; i++)
- D(var,i) += record[i] * record[i];
-}
-
-void update_cov(const double *record, PyArrayObject* cov, int n) {
- int i, j;
- #pragma omp parallel for private(i,j)
- for (i = 0; i < n; i++) {
- for (j = i; j < n; j++) {
- D(cov,i*n+j) += record[i] * record[j];
- }
- }
-}
-
-void update_tcov(const double *record, PyArrayObject* tcov,
- PyArrayObject* samples, int n, int m, double *shift, int lag) {
- int i, j;
-
- if (m >= lag) {
- D(samples,0) += 1;
- #pragma omp parallel for private(i,j)
- for (i = 0; i < n; i++) {
- for (j = 0; j < n; j++) {
- D(tcov,i*n+j) += record[j]
- * shift[((m - lag) % lag) * n + i];
- }
- }
- }
- #pragma omp parallel for private(i)
- for (i = 0; i < n; i++)
- shift[(m % lag) * n + i] = record[i];
-}
-
-/**
- * calculates covariance matrices
- */
-static PyObject *run(PyObject *self, PyObject *args) {
- char *fname;
- int do_mean, do_var, do_cov, do_tcov;
- int lag;
- int i, j, n, c;
- PyObject *stats;
- PyArrayObject* mean, *var, *cov, *tcov, *samples, *tcov_samples;
- double *record, *shift;
- reader input;
-
- record = shift = NULL;
- mean = var = cov = tcov = samples = tcov_samples = NULL;
- input.stream = NULL;
- stats = NULL;
-
- if (!PyArg_ParseTuple(args, "sO!iiiii", &fname, &PyDict_Type, &stats,
- &do_mean, &do_var, &do_cov, &do_tcov, &lag))
- goto error;
-
- setlocale(LC_NUMERIC, "C");
- /* open date file and get record size */
- input = reader_by_fname(fname);
- n = input.init(&input, fname);
- if (n < 1) {
- PyErr_SetFromErrno(PyExc_IOError);
- goto error;
- }
-
- /* allocate memory for a row */
- record = (double*) malloc(n * sizeof(double));
- if (!record) {
- PyErr_NoMemory();
- goto error;
- }
-
- /* setup data structures */
- mean = unpack_or_make(stats, "mean", n, 1);
- if (!mean)
- goto error;
- if (do_var) {
- var = unpack_or_make(stats, "var", n, 1);
- if (!var)
- goto error;
- }
- if (do_cov) {
- cov = unpack_or_make(stats, "cov", n, n);
- if (!cov)
- goto error;
- }
- if (do_tcov) {
- if (lag < 1) {
- PyErr_SetString(PyExc_ValueError, "lag < 1 not allowed.");
- goto error;
- }
- tcov = unpack_or_make(stats, "tcov", n, n);
- if (!tcov)
- goto error;
- tcov_samples = unpack_or_make(stats, "tcov_samples", 1, 1);
- if (!tcov_samples)
- goto error;
- shift = (double*) malloc(n * lag * sizeof(double));
- if (!shift) {
- PyErr_NoMemory();
- goto error;
- }
- for (i = 0; i < n * lag; i++)
- shift[i] = 0;
- }
-
- samples = unpack_or_make(stats, "samples", 1, 1);
- if (!samples)
- goto error;
-
- /* iterate over file and calculate everything */
- j = 0;
- while (1) {
- if (ferror(input.stream)) {
- PyErr_SetFromErrno(PyExc_IOError);
- goto error;
- }
- c = input.read_record(input, record, n);
- if (c == 0)
- break; /* empty line or eof */
- if (c != n) {
- PyErr_SetString(PyExc_ValueError,
- "Unexpected line format or read error.");
- goto error;
- }
-
-		/* debug */
-		//for(i=0;i<n;i++) printf("%f ", record[i]);
-		if (max > m) {
-			PyErr_SetString(PyExc_ValueError,
-					"requested output record length (max) > input record length");
-			goto error;
-		}
-
- /* convert numeric parameters to arrays */
- mean = unpack_array(py_mean, m, 1);
- W = unpack_array(py_W, m, m);
- if (!mean || !W)
- goto error;
-
- /* allocate memory */
- record = (double*) malloc(n * sizeof(double));
- if (!record) {
- PyErr_NoMemory();
- goto error;
- }
-
- /* open output file */
- out = fopen(fout, "w");
- if (!out) {
- PyErr_SetFromErrno(PyExc_IOError);
- goto error;
- }
-
- /* iterate over input */
- while (1) {
- if (ferror(input.stream)) {
- PyErr_SetFromErrno(PyExc_IOError);
- goto error;
- }
- c = input.read_record(input, record, n);
- if (c == 0)
- break; /* empty line or eof */
- if (c != n) {
- PyErr_SetString(PyExc_ValueError,
- "Unexpected line format or read error.");
- goto error;
- }
-
- #pragma omp parallel for private(i)
- for (i = 0; i < m; i++)
- record[i] = record[i + time_column] - D(mean,i);
- for (j = 0; j < max; j++) {
- sum = 0;
- #pragma omp parallel for reduction(+:sum)
- for (i = 0; i < m; i++)
- sum += record[i] * D(W,m*i+j);
- fprintf(out, "%f", sum);
- if (j != max - 1)
- fprintf(out, " ");
- }
- fprintf(out, "\n");
- }
-
- res = Py_BuildValue(""); /* =None */
-
- error: if (input.stream)
- fclose(input.stream);
- if (out)
- fclose(out);
- if (record)
- free(record);
- if (mean)
- Py_DECREF(mean);
- if (W)
- Py_DECREF(W);
- return res;
-}
-
-#define RUN_USAGE \
- "Calculate mean, variance, correlation and time-lagged correlation.\n" \
- "run(fname, d, mean, var, cov, tcov, lag)\n\n" \
- "fname: name of file that contains tabulated data\n" \
- "d: a dictionary that holds the statistical quantities.\n" \
- " Can be empty at first or can contain partial results from parts\n" \
- " of the trajectory computed in a previous run.\n" \
- "mean: (bool) if true, calculate mean\n" \
- " if false, subtract dictionary entry \'mean\' from every\n" \
- " frame before calculation of other statistical quantities\n" \
- " Simultaneous calculation of mean and other quantities of\n" \
- " uncentered data is *not* possible.\n" \
-	"var: (bool) calculate variance?\n" \
- "cov: (bool) calculate covariance matrix?\n" \
- "tcov: (bool) calculate time-lagged covariance matrix?\n" \
- "The statistical quantities can then be accessed using the dictionary\n" \
- "keys \'mean\', \'var\', \'cov\', \'tcov\', \'samples\' and\n" \
- " \'tcov_samples\'.\n" \
- "Note that statistical estimates are not normalized. To calculate \n" \
- "the actual mean, you need to compute d[\'mean\']/d[\'samples\'] and so on."
-
-#define PROJECT_USAGE \
- "Open file with tabulated data. Center and project each line.\n" \
- "project(fin, fout, mean, weights, max_proj, time_column)\n\n" \
- "fin: name of input file\n" \
- "fout: name of output file (for projected data)\n" \
- "mean: list of means; is subtracted from every line\n" \
- "weights: n x n square matrix of weights W; the ith entry of each new\n" \
- " line is y_i = Sum_k W_ki x_k, where x_k are the entries of\n" \
- " the input line after centering.\n" \
- "max_proj: only write y_0 through y_{max_proj-1} to the output file\n" \
-	"time_column: (bool) input data contains time column?" \
-
-static PyMethodDef cocoMethods[] = {
-	{ "run", run, METH_VARARGS, RUN_USAGE },
-	{ "project", project, METH_VARARGS, PROJECT_USAGE },
-	{ NULL, NULL, 0, NULL }
-};
-
-PyMODINIT_FUNC initcocovar(void) {
- (void) Py_InitModule("cocovar", cocoMethods);
-	import_array();
-}
diff --git a/pyemma/coordinates/transform/general_transform.py b/pyemma/coordinates/transform/general_transform.py
deleted file mode 100644
index be4999c99..000000000
--- a/pyemma/coordinates/transform/general_transform.py
+++ /dev/null
@@ -1,43 +0,0 @@
-'''
-Created on Jan 5, 2014
-
-@author: noe
-'''
-from pyemma.util import pystallone as stallone
-
-
-class CoordinateTransform(object):
- """
- Wrapper class to stallone coordinate transform
- """
-
- def __init__(self, jcoordinatetransform):
- self._jcoordinatetransform = jcoordinatetransform
-
- def jtransform(self):
- return self._jcoordinatetransform
-
- def dimension(self):
- """
-        Returns the dimension of the transformed output
- """
- return self._jcoordinatetransform.dimension()
-
- def transform(self, x):
- """
- Transforms input data x
-
- WARNING: This is generally inefficient due to the current python-java
-        interface - so using this might slow your code down. The preferred
- way of doing transforms is through the mass transform functions of
- the transform api (e.g. transform_file).
- Subclasses of CoordinateTransform might have an efficient implementation.
- """
- y = self._jcoordinatetransform.transform(stallone.ndarray_to_stallone_array(x))
- return stallone.stallone_array_to_ndarray(y)
-
- def has_efficient_transform(self):
- """
-        Returns True if this object has an efficient implementation of the transform method
- """
- return False
diff --git a/pyemma/coordinates/transform/linear_transform.py b/pyemma/coordinates/transform/linear_transform.py
deleted file mode 100644
index ecb9d07f2..000000000
--- a/pyemma/coordinates/transform/linear_transform.py
+++ /dev/null
@@ -1,129 +0,0 @@
-'''
-Created on Jan 5, 2014
-
-@author: noe
-'''
-
-from pyemma.util import pystallone as stallone
-from . import util
-from . import general_transform
-# shortcuts
-CoordinateTransform = general_transform.CoordinateTransform
-
-
-class PCA(CoordinateTransform):
- """
- Wrapper class to stallone PCA
- """
-
- def __init__(self, jpca):
- CoordinateTransform.__init__(self, jpca)
- self._evec = None
- self.ndim = None
-
- def mean(self):
- """
- Returns the mean vector estimated from the data
- """
- return stallone.stallone_array_to_ndarray(self.jtransform().getMeanVector())
-
- def covariance_matrix(self):
- """
- Returns the covariance matrix estimated from the data
- """
- return stallone.stallone_array_to_ndarray(self.jtransform().getCovarianceMatrix())
-
- def eigenvalues(self):
- """
- Returns the eigenvalues of the covariance matrix
- """
- return stallone.stallone_array_to_ndarray(self.jtransform().getEigenvalues())
-
- def eigenvector(self, i):
- """
- Returns the i'th largest eigenvector of the covariance matrix
- """
- return stallone.stallone_array_to_ndarray(self.jtransform().getEigenvector(i))
-
- def eigenvectors(self):
- """
- Returns the eigenvector matrix of the covariance matrix
- with eigenvectors as column vectors
- """
- if (self._evec is None):
- self._evec = stallone.stallone_array_to_ndarray(self.jtransform().getEigenvectorMatrix())
- return self._evec
-
- def set_dimension(self, d):
- """
- Sets the dimension for the projection. If not set, no dimension
- reduction will be made, i.e. the output data is transformed but
- has the same dimension as the input data.
- """
- self.ndim = d
- self.jtransform().setDimension(d)
-
- def transform(self, x):
- """
- Performs a linear transformation (and possibly dimension reduction
- if set_dimension has been used) of the input vector x to the
- coordinate system defined by the covariance matrix eigenvectors.
- """
- return util.project(x.flatten(), self.eigenvectors(), self.ndim)
-
-    def has_efficient_transform(self):
- """
- Returns True
- """
- return True
-
-
-class PCA_AMUSE(PCA):
-
- def __init__(self, amuse):
- self.amuse = amuse
- self.ndim = None
-
- def mean(self):
- return self.amuse.mean
-
- def covariance_matrix(self):
- return self.amuse.cov
-
- def eigenvalues(self):
- return self.amuse.pca_values
-
- def eigenvector(self, i):
- return self.amuse.pca_weights[i]
-
- def eigenvectors(self):
- return self.amuse.pca_weights
-
- def set_dimension(self, d):
- self.ndim = d
-
- def transform(self, x):
- return util.project(x.flatten(), self.eigenvectors(), self.ndim)
-
- def jtransform(self):
- if self.ndim is not None:
- A = self.eigenvectors()[:, 0: self.ndim]
- else:
- A = self.eigenvectors()
- A = stallone.ndarray_to_stallone_array(A.T)
- return stallone.API.coorNew.linear_operator(A)
-
-
-class TICA(PCA):
- """
- Wrapper class to stallone TICA
- """
-
- def __init__(self, jtica):
- PCA.__init__(self, jtica)
-
- def covariance_matrix_lagged(self):
- """
- Returns the lagged covariance matrix estimated from the data
- """
-        return stallone.stallone_array_to_ndarray(self.jtransform().getCovarianceMatrixLagged())
diff --git a/pyemma/coordinates/transform/masstransform/api.py b/pyemma/coordinates/transform/masstransform/api.py
deleted file mode 100644
index f29f53637..000000000
--- a/pyemma/coordinates/transform/masstransform/api.py
+++ /dev/null
@@ -1,41 +0,0 @@
-'''
-Created on Jan 4, 2014
-
-@author: noe
-'''
-
-from pyemma.coordinates.transform.masstransform import *
-
-###################################################################################################
-# FILE TRANSFORM OBJECTS
-###################################################################################################
-
-
-def create_custom_transform(name, dir_input, dir_output, custom_transform, transform_only_new=False, output_extension=None):
- return filetransform.FileTransform(name, custom_transform, dir_input, dir_output, filepattern="*",
- transform_only_new=transform_only_new,
- output_extension=output_extension)
-
-def create_crd2dist_transform(dir_input, dir_output, set1, set2=None, output_extension=None):
- crd2dist = transform_crd2dist.Transform_crd2dist(set1, set2)
- ft = filetransform.FileTransform("Distance computation", crd2dist, dir_input, dir_output, filepattern="*", transform_only_new=True, output_extension=output_extension)
- return ft
-
-def create_tica_transform(dir_input, dir_tica, dir_output, lag, ndim, output_extension=None):
- tica = transform_tica.Transform_TICA(lag, ndim, dir_tica=dir_tica, output_extension=output_extension)
- ft = filetransform.FileTransform("TICA", tica, dir_input, dir_output, filepattern="*", transform_only_new=False, output_extension=output_extension)
- return ft
-
-def create_clustering_regspace(dir_input, dir_work, dir_output, dmin, cluster_skipframes = 1):
- cluster = transform_clusterassign.Transform_ClusterAssign(dir_work+"/clustercenters.dat",
- skipframes = cluster_skipframes,
- filepattern="*",
- parameters={'algorithm': 'regularspatial', 'dmin': dmin, 'metric': 'euclidean'})
- ft = filetransform.FileTransform("Regular-space clustering", cluster, dir_input, dir_output, filepattern="*", transform_only_new=False)
- return ft
-
-def create_clustering_kmeans(dir_input, dir_work, dir_output, ncluster, cluster_skipframes = 1):
- return transform_clusterassign.Transform_ClusterAssign("k-means clustering", dir_input, dir_work+"/clustercenters.dat", dir_output,
- skipframes = cluster_skipframes,
- filepattern="*",
- parameters={'algorithm': 'kmeans', 'clustercenters': ncluster, 'metric': 'euclidean'})
diff --git a/pyemma/coordinates/transform/masstransform/filetransform.py b/pyemma/coordinates/transform/masstransform/filetransform.py
deleted file mode 100644
index 440cd85f2..000000000
--- a/pyemma/coordinates/transform/masstransform/filetransform.py
+++ /dev/null
@@ -1,86 +0,0 @@
-'''
-Created on Nov 16, 2013
-
-@author: noe
-
-Abstract class of a 1:1 transformation from a set of input files to a set of output files.
-This could be a file format transformation, a coordinate transformation, or a cluster discretization.
-'''
-
-import os
-import fnmatch
-
-class FileTransform(object):
- def __init__(self,
- name,
- transformer,
- dir_input,
- dir_output,
- filepattern="*",
- output_extension=None,
- transform_only_new=False,
- custom_transform=None):
- self.name = name
- self.dir_input = dir_input
- self.dir_output = dir_output
- self.filepattern = filepattern
- self.output_extension = output_extension
- self.transform_only_new = transform_only_new
- self.transformer = transformer
-
- # create sub-directories if they don't exist yet
- if (not os.path.isdir(dir_output)):
- os.mkdir(dir_output)
-
- self.processed_files = []
-
- def __process_new_input(self, all_input_files, new_input_files):
- """
-        Processes new input by delegating to the transformer's
-        process_new_input method, if it defines one.
-
-        all_input_files: list of all input files
-        new_input_files: list of input files that are new since the previous call to update()
- """
- if "process_new_input" in dir(self.transformer):
- self.transformer.process_new_input(all_input_files, new_input_files)
-
- def __transform(self, infile, outfile):
- """
- Transforms input file to output file
-
- infile: input file
- outfile: output file
- """
- if "transform" in dir(self.transformer):
- self.transformer.transform(infile, outfile)
-
-
- def update(self):
- # get all matching input files, and see if there are new files
- input_files = fnmatch.filter(os.listdir(self.dir_input), self.filepattern);
- new_files = list(set(input_files) - set(self.processed_files));
- if (not new_files):
- return # nothing to do.
-
- # full file names
- input_files_full = [os.path.join(self.dir_input, f) for f in input_files]
- new_files_full = [os.path.join(self.dir_input, f) for f in new_files]
-
- # call process_new_input
- self.__process_new_input(input_files_full, new_files_full)
- print " Algorithm '"+self.name+"' processes new input files: "+str(new_files)
- # register as processed
- self.processed_files = input_files
-
- # call transform with either all or only new files
- transformlist = input_files
- if (self.transform_only_new):
- transformlist = new_files
- for filename in transformlist:
- infile = os.path.join(self.dir_input,filename)
- outfile = os.path.join(self.dir_output,filename)
- if (not self.output_extension is None):
- outfile = os.path.splitext(outfile)[0] + self.output_extension
- print "\t transforming: "+infile+" -> "+outfile
- self.__transform(infile, outfile);
diff --git a/pyemma/coordinates/transform/masstransform/transform_clusterassign.py b/pyemma/coordinates/transform/masstransform/transform_clusterassign.py
deleted file mode 100644
index a8804eb8e..000000000
--- a/pyemma/coordinates/transform/masstransform/transform_clusterassign.py
+++ /dev/null
@@ -1,72 +0,0 @@
-'''
-Created on Nov 16, 2013
-
-@author: noe
-'''
-import os
-import filetransform
-
-from pyemma.util.pystallone import *
-from pyemma.coordinates.clustering.stalloneClustering import *
-
-class Transform_ClusterAssign(filetransform.FileTransform):
-
- file_clustercenters = "./clustercenters.dat"
- parameters={'algorithm': 'regularspatial',
- 'dmin': 1.0,
- 'metric': 'euclidean'}
- clustering = None
- assignment = None
-
- def __init__(self, file_clustercenters = "./clustercenters.dat",
- skipframes = 1,
- parameters={'algorithm': 'regularspatial', 'dmin': 1.0, 'metric': 'euclidean'},
- filepattern="*", emma_path=""):
- """
-        file_clustercenters: file to write the cluster centers to
-        skipframes: number of frames to skip between frames used for clustering
-        parameters: dictionary with clustering settings
-            (algorithm, dmin or clustercenters, metric)
-        filepattern: pattern of input file names to process
- """
- # call superclass initialization
- self.file_clustercenters = file_clustercenters
- self.skipframes = skipframes;
- self.parameters = parameters
- self.emma_path = emma_path
-
-
- def process_new_input(self, all_input_files, new_input_files):
- """
- Perform clustering and write cluster centers file
- """
- loader = getDataSequenceLoader(all_input_files)
- loader.scan()
- data = loader.getSingleDataLoader()
-
- self.clustering = getClusterAlgorithm(data, loader.size(), **self.parameters)
- self.clustering.perform()
- self.assignment = self.clustering.getClusterAssignment()
-
- print "number of clusters: ", self.clustering.getNumberOfClusters();
- clustercenters = self.clustering.getClusterCenters();
- writer = API.dataNew.createASCIIDataWriter(self.file_clustercenters, clustercenters.dimension(), ' ', '\n')
- writer.addAll(clustercenters)
- writer.close()
-
-
- def transform(self, infile, outfile):
- """
- Assign individual file
- """
- loader = API.dataNew.dataSequenceLoader(infile)
- loader.scan()
- fout = open(outfile, "w")
- # iterate input file and assign cluster
- it = loader.iterator()
- while it.hasNext():
- x = it.next()
- i = self.assignment.assign(x)
- fout.write(str(i)+"\n")
- # close output
- fout.close()
diff --git a/pyemma/coordinates/transform/masstransform/transform_crd2dist.py b/pyemma/coordinates/transform/masstransform/transform_crd2dist.py
deleted file mode 100644
index cea6ed5df..000000000
--- a/pyemma/coordinates/transform/masstransform/transform_crd2dist.py
+++ /dev/null
@@ -1,37 +0,0 @@
-'''
-Created on Nov 16, 2013
-
-@author: noe
-'''
-
-import filetransform
-from pyemma.util.pystallone import *
-
-class Transform_crd2dist(filetransform.FileTransform):
-
- # Atom set 1
- _set1 = JArray(JInt)([0])
- _set2 = None
-
- def __init__(self, set1, set2=None):
- """
-        set1: first set of atom indices for the distance computation
-        set2: optional second set of atom indices; if None, distances are
-              computed within set1
- """
- # define sets for the distance computation
- self._set1 = jarray(set1)
- if not (set2 is None):
- self._set2 = jarray(set2)
-
-
- def transform(self, infile, outfile):
- """
- Transform individual file
- """
- if (self._set2 is None):
- API.coor.convertToDistances(infile, outfile, self._set1)
- else:
- API.coor.convertToDistances(infile, outfile, self._set1, self._set2)
diff --git a/pyemma/coordinates/transform/masstransform/transform_tica.py b/pyemma/coordinates/transform/masstransform/transform_tica.py
deleted file mode 100644
index c633802c5..000000000
--- a/pyemma/coordinates/transform/masstransform/transform_tica.py
+++ /dev/null
@@ -1,67 +0,0 @@
-'''
-Created on Nov 16, 2013
-
-@author: noe
-'''
-
-import os
-import numpy as np
-
-import filetransform
-from pyemma.coordinates.transform.tica import Amuse
-
-class Transform_TICA(filetransform.FileTransform):
-
- # Amuse object
- amuse = None
- dir_tica = "./"
- lag = 1
- ndim = 1
-
- def __init__(self, lag, ndim, dir_tica=None, output_extension=None):
- """
- Initializes TICA
-
-        lag: int (1)
-            TICA lagtime
-        ndim: int (1)
-            number of TICA dimensions to use
-        dir_tica: String or None
-            If not None, TICA results will be stored in the given directory
-        output_extension: String or None
-            optional extension for transformed output files
-
- """
- self.dir_tica = dir_tica
- self.lag = lag;
- self.ndim = ndim;
-
- # create sub-directories if they don't exist yet
- if (not os.path.isdir(dir_tica)):
- os.mkdir(dir_tica)
-
-
- def process_new_input(self, all_input_files, new_input_files):
- """
- Do TICA and write the TICA matrices into the working directory
- """
- self.amuse = Amuse.compute(all_input_files, self.lag)
-
- if (self.dir_tica != None):
- np.savetxt(self.dir_tica+"/C0.dat", self.amuse.corr)
- np.savetxt(self.dir_tica+"/C"+str(self.lag)+".dat", self.amuse.tcorr)
- np.savetxt(self.dir_tica+"/eval_pca.dat", self.amuse.pca_values)
- np.savetxt(self.dir_tica+"/eval_tica.dat", self.amuse.tica_values)
- np.savetxt(self.dir_tica+"/P_pca.dat", self.amuse.pca_weights)
- np.savetxt(self.dir_tica+"/P_tica.dat", self.amuse.tica_weights)
- np.savetxt(self.dir_tica+"/data_mean.dat", self.amuse.mean)
- np.savetxt(self.dir_tica+"/data_var.dat", self.amuse.var)
-
-
- def transform(self, infile, outfile):
- """
- Transform individual file
- """
- self.amuse.project(infile, outfile, self.amuse.tica_weights, self.ndim)
diff --git a/pyemma/coordinates/transform/pca.py b/pyemma/coordinates/transform/pca.py
new file mode 100644
index 000000000..07b103a85
--- /dev/null
+++ b/pyemma/coordinates/transform/pca.py
@@ -0,0 +1,150 @@
+__author__ = 'noe'
+
+import numpy as np
+from .transformer import Transformer
+from pyemma.util.log import getLogger
+from pyemma.util.annotators import doc_inherit
+
+log = getLogger('PCA')
+__all__ = ['PCA']
+
+
+class PCA(Transformer):
+
+ r"""Principal component analysis.
+
+ Given a sequence of multivariate data :math:`X_t`,
+    computes the mean-free covariance matrix:
+
+ .. math:: C = (X - \mu)^T (X - \mu)
+
+ and solves the eigenvalue problem
+
+ .. math:: C r_i = \sigma_i r_i,
+
+ where :math:`r_i` are the principal components and :math:`\sigma_i` are
+ their respective variances.
+
+ When used as a dimension reduction method, the input data is projected onto
+ the dominant principal components.
+
+ Parameters
+ ----------
+ output_dimension : int
+ number of principal components to project onto
+
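+    Example
+    -------
+    A minimal usage sketch; ``reader`` and ``X`` are illustrative
+    placeholders for a chunked data source and an input array:
+
+    >>> pca = PCA(output_dimension=2)
+    >>> pca.data_producer = reader
+    >>> pca.parametrize()
+    >>> Y = pca.map(X)
+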
+ """
+
+ def __init__(self, output_dimension):
+ super(PCA, self).__init__()
+ self.output_dimension = output_dimension
+
+ @doc_inherit
+ def describe(self):
+ return "[PCA, output dimension = %i]" % self.output_dimension
+
+ def dimension(self):
+        """
+        Returns the number of output dimensions.
+        """
+ return self.output_dimension
+
+ @doc_inherit
+ def get_constant_memory(self):
+ """Returns the constant memory requirements, in bytes."""
+ # memory for mu, C, v, R
+ dim = self.data_producer.dimension()
+
+ cov_elements = dim ** 2
+ mu_elements = dim
+
+ v_elements = dim
+ R_elements = cov_elements
+
+ return 8 * (cov_elements + mu_elements + v_elements + R_elements)
+
+ @doc_inherit
+ def get_memory_per_frame(self):
+ # memory for temporaries
+ dim = self.data_producer.dimension()
+
+ x_meanfree_elements = self.chunksize * dim
+
+ dot_prod_elements = dim
+
+ return 8 * (x_meanfree_elements + dot_prod_elements)
+
+ @doc_inherit
+ def param_init(self):
+ log.info("Running PCA")
+ self.N = 0
+ # create mean array and covariance matrix
+ dim = self.data_producer.dimension()
+ self.mu = np.zeros(dim)
+ self.C = np.zeros((dim, dim))
+
+ def param_add_data(self, X, itraj, t, first_chunk, last_chunk_in_traj,
+ last_chunk, ipass, Y=None):
+ """
+        Chunk-based parametrization of PCA. Iterates through all data twice. In the first pass, the
+        data means are estimated; in the second pass, the covariance matrix is estimated.
+        Finally, the eigenvalue problem is solved to determine the principal components.
+
+ :param X:
+ coordinates. axis 0: time, axes 1-..: coordinates
+ :param itraj:
+ index of the current trajectory
+ :param t:
+ time index of first frame within trajectory
+ :param first_chunk:
+ boolean. True if this is the first chunk globally.
+ :param last_chunk_in_traj:
+ boolean. True if this is the last chunk within the trajectory.
+ :param last_chunk:
+ boolean. True if this is the last chunk globally.
+ :param ipass:
+ number of pass through data
+ :param Y:
+ time-lagged data (if available)
+        :return: True when the parametrization is finished, False otherwise
+ """
+ log.debug("itraj = " + str(itraj) + ". t = " + str(t) + ". last_chunk_in_traj = " + str(last_chunk_in_traj)
+ + " last_chunk = " + str(last_chunk) + " ipass = " + str(ipass))
+
+ # pass 1: means
+ if ipass == 0:
+ self.mu += np.sum(X, axis=0)
+ self.N += np.shape(X)[0]
+ if last_chunk:
+ self.mu /= self.N
+
+ # pass 2: covariances
+ if ipass == 1:
+ Xm = X - self.mu
+ self.C += np.dot(Xm.T, Xm)
+ if last_chunk:
+ self.C /= self.N
+ return True # finished!
+
+ # by default, continue
+ return False
+
+ @doc_inherit
+ def param_finish(self):
+ (v, R) = np.linalg.eigh(self.C)
+ # sort
+ I = np.argsort(v)[::-1]
+ self.v = v[I]
+        self.R = R[:, I]
+
+ def map(self, X):
+ """
+ Projects the data onto the dominant principal components.
+
+ :param X: the input data
+ :return: the projected data
+ """
+ Y = np.dot(X, self.R[:, 0:self.output_dimension])
+ return Y
diff --git a/pyemma/coordinates/transform/tests/test_linear_transform.py b/pyemma/coordinates/transform/tests/test_linear_transform.py
deleted file mode 100644
index 67d07f065..000000000
--- a/pyemma/coordinates/transform/tests/test_linear_transform.py
+++ /dev/null
@@ -1,38 +0,0 @@
-'''
-Created on Jun 23, 2014
-
-@author: marscher
-'''
-import unittest
-import numpy as np
-from pyemma.util.pystallone import API, ndarray_to_stallone_array,\
- stallone_array_to_ndarray
-
-class TestTransformation(unittest.TestCase):
-
- def testLinearOperatorTransformation(self):
- n = 669
- # note transpose
- A = np.arange(2*n).reshape((n, 2)).astype(np.float).T
- b = np.arange(n).astype(np.float)
-
- expected = np.dot(A, b)
-
- A_st=ndarray_to_stallone_array(A)
- assert A_st.rows() == A.shape[0]
- assert A_st.columns() == A.shape[1]
-
- b=b.reshape((n/3, 3))
- b_st = ndarray_to_stallone_array(b)
- assert b_st.rows() == b.shape[0]
-
- # currently there is no python api wrapping this, so call it directly.
- transformation = API.coorNew.linear_operator(A_st)
- result = transformation.transform(b_st)
-
- # compare result
- result_nd = stallone_array_to_ndarray(result)
- assert np.allclose(expected, result_nd)
-
-if __name__ == "__main__":
- unittest.main()
\ No newline at end of file
diff --git a/pyemma/coordinates/transform/tica.py b/pyemma/coordinates/transform/tica.py
new file mode 100644
index 000000000..1f124b9a1
--- /dev/null
+++ b/pyemma/coordinates/transform/tica.py
@@ -0,0 +1,206 @@
+'''
+Created on 19.01.2015
+
+@author: marscher
+'''
+from .transformer import Transformer
+from pyemma.util.linalg import eig_corr
+from pyemma.util.log import getLogger
+from pyemma.util.annotators import doc_inherit
+
+import numpy as np
+
+log = getLogger('TICA')
+__all__ = ['TICA']
+
+
+class TICA(Transformer):
+
+ r"""
+ Time-lagged independent component analysis (TICA)
+
+ Given a sequence of multivariate data :math:`X_t`, computes the mean-free
+ covariance and time-lagged covariance matrix:
+
+ .. math::
+
+ C_0 &= (X_t - \mu)^T (X_t - \mu) \\
+        C_{\tau} &= (X_t - \mu)^T (X_{t+\tau} - \mu)
+
+    and solves the eigenvalue problem
+
+ .. math:: C_{\tau} r_i = C_0 \lambda_i r_i
+
+ where :math:`r_i` are the independent components and :math:`\lambda_i` are
+ their respective normalized time-autocorrelations. The eigenvalues are
+ related to the relaxation timescale by
+
+ .. math:: t_i = -\tau / \ln |\lambda_i|
+
+ When used as a dimension reduction method, the input data is projected
+ onto the dominant independent components.
+
+ Parameters
+ ----------
+ lag : int
+ lag time
+ output_dimension : int
+ how many significant TICS to use to reduce dimension of input data
+ epsilon : float
+ eigenvalue norm cutoff. Eigenvalues of C0 with norms <= epsilon will be
+ cut off. The remaining number of Eigenvalues define the size
+ of the output.
+
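+    Example
+    -------
+    A usage sketch; ``reader`` and ``X`` are illustrative placeholders for a
+    chunked data source and an input array:
+
+    >>> tica = TICA(lag=10, output_dimension=2)
+    >>> tica.data_producer = reader
+    >>> tica.parametrize()
+    >>> Y = tica.map(X)
+
+    Implied timescales then follow from the eigenvalues and the lag time:
+
+    >>> ts = -tica.lag / np.log(np.abs(tica.eigenvalues))
+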
+ """
+
+ def __init__(self, lag, output_dimension, epsilon=1e-6):
+ super(TICA, self).__init__()
+
+        # store lag time to set it appropriately in second pass of parametrize
+ self.__lag = lag
+ self.output_dimension = output_dimension
+ self.epsilon = epsilon
+
+ # covariances
+ self.cov = None
+ self.cov_tau = None
+ # mean
+ self.mu = None
+ self.N = 0
+ self.eigenvalues = None
+ self.eigenvectors = None
+
+ @doc_inherit
+ def describe(self):
+ return "[TICA, lag = %i; output dimension = %i]" \
+ % (self.lag, self.output_dimension)
+
+ def dimension(self):
+ """ output dimension"""
+ return self.output_dimension
+
+ @doc_inherit
+ def get_memory_per_frame(self):
+ # temporaries
+ dim = self.data_producer.dimension()
+
+ mean_free_vectors = 2 * dim * self.chunksize
+ dot_product = 2 * dim * self.chunksize
+
+ return 8 * (mean_free_vectors + dot_product)
+
+ @doc_inherit
+ def get_constant_memory(self):
+ dim = self.data_producer.dimension()
+
+ # memory for covariance matrices (lagged, non-lagged)
+ cov_elements = 2 * dim ** 2
+ mu_elements = dim
+
+ # TODO: shall memory req of diagonalize method go here?
+
+ return 8 * (cov_elements + mu_elements)
+
+ @doc_inherit
+ def param_init(self):
+ dim = self.data_producer.dimension()
+ assert dim > 0, "zero dimension from data producer"
+ assert self.output_dimension <= dim, \
+ ("requested more output dimensions (%i) than dimension"
+ " of input data (%i)" % (self.output_dimension, dim))
+
+ self.N = 0
+ # create mean array and covariance matrices
+ self.mu = np.zeros(dim)
+
+ self.cov = np.zeros((dim, dim))
+ self.cov_tau = np.zeros_like(self.cov)
+
+ log.info("Running TICA lag=%i; shape cov=(%i, %i)" %
+ (self.lag, dim, dim))
+
+ def param_add_data(self, X, itraj, t, first_chunk, last_chunk_in_traj,
+ last_chunk, ipass, Y=None):
+ """
+ Chunk-based parameterization of TICA. Iterates through all data twice. In the first pass, the
+ data means are estimated, in the second pass the covariance and time-lagged covariance
+ matrices are estimated. Finally, the generalized eigenvalue problem is solved to determine
+ the independent components.
+
+ :param X:
+ coordinates. axis 0: time, axes 1-..: coordinates
+ :param itraj:
+ index of the current trajectory
+ :param t:
+ time index of first frame within trajectory
+ :param first_chunk:
+ boolean. True if this is the first chunk globally.
+ :param last_chunk_in_traj:
+ boolean. True if this is the last chunk within the trajectory.
+ :param last_chunk:
+ boolean. True if this is the last chunk globally.
+ :param ipass:
+ number of pass through data
+ :param Y:
+ time-lagged data (if available)
+        :return: True when the parametrization is finished, False otherwise
+ """
+ if ipass == 0:
+ # TODO: maybe use stable sum here, since small chunksizes
+ # accumulate more errors
+ self.mu += np.sum(X, axis=0)
+ self.N += np.shape(X)[0]
+
+ if last_chunk:
+ log.debug("mean before norming:\n%s" % self.mu)
+ log.debug("norming mean by %i" % self.N)
+ self.mu /= self.N
+ log.info("calculated mean:\n%s" % self.mu)
+
+ # now we request real lagged data, since we are finished
+ # with first pass
+ self.lag = self.__lag
+
+ if ipass == 1:
+ X_meanfree = X - self.mu
+ Y_meanfree = Y - self.mu
+ self.cov += np.dot(X_meanfree.T, X_meanfree)
+ self.cov_tau += np.dot(X_meanfree.T, Y_meanfree)
+
+ if last_chunk:
+ return True # finished!
+
+ return False # not finished yet.
+
+ @doc_inherit
+ def param_finish(self):
+ # norm
+ self.cov /= self.N - 1
+ self.cov_tau /= self.N - self.lag - 1
+
+ # symmetrize covariance matrices
+ self.cov = self.cov + self.cov.T
+ self.cov /= 2.0
+
+ self.cov_tau = self.cov_tau + self.cov_tau.T
+ self.cov_tau /= 2.0
+
+ # diagonalize with low rank approximation
+ self.eigenvalues, self.eigenvectors = \
+ eig_corr(self.cov, self.cov_tau, self.epsilon)
+
+ def map(self, X):
+ """Projects the data onto the dominant independent components.
+
+ Parameters
+ ----------
+ X : ndarray(n, m)
+ the input data
+
+ Returns
+ -------
+ Y : ndarray(n,)
+ the projected data
+ """
+ X_meanfree = X - self.mu
+ Y = np.dot(X_meanfree, self.eigenvectors[:, 0:self.output_dimension])
+ return Y
diff --git a/pyemma/coordinates/transform/tica_amuse.py b/pyemma/coordinates/transform/tica_amuse.py
deleted file mode 100644
index bff82a5fc..000000000
--- a/pyemma/coordinates/transform/tica_amuse.py
+++ /dev/null
@@ -1,232 +0,0 @@
-# -*- coding: utf-8 -*-
-r"""
-=====================================================================================
-TICA - time-lagged independent component analysis (:mod:`pyemma.coordinates.tica`)
-=====================================================================================
-
-.. currentmodule:: pyemma.coordinates.tica
-
-TICA
-====
-
-.. autosummary::
- :toctree: generated/
-
- correlation
- Amuse
-
-References
-----------
-Identification of slow molecular order parameters for Markov model construction
-Pérez-Hernández, Guillermo and Paul, Fabian and Giorgino, Toni and De Fabritiis,
-Gianni and Noé, Frank, The Journal of Chemical Physics, 139, 015102 (2013),
-DOI:http://dx.doi.org/10.1063/1.4811489
-
-
-
-.. TODO: describe method. See http://msmbuilder.org/theory/tICA.html
-
-.. date: Created on 19.11.2013
-
-.. moduleauthor:: Fabian Paul , marscher
-"""
-
-import os
-import sys
-import numpy
-
-__docformat__ = "restructuredtext en"
-__all__ = ['correlation', 'Amuse']
-
-from pyemma.util.log import getLogger
-log = getLogger()
-
-
-def correlation(cov, var):
-    r"""Calculate the correlation matrix from a covariance matrix.
-
-    Parameters
-    ----------
-    cov : ndarray(n,n)
-        covariance matrix
-    var : ndarray(n)
-        variances (the diagonal of the covariance matrix)
-
-    Returns
-    -------
-    corr : ndarray(n,n)
-        correlation matrix
-
- """
- n = cov.shape[0]
- corr = numpy.empty([n, n])
- for i in xrange(n):
- for j in xrange(n):
- corr[i, j] = cov[i, j] / numpy.sqrt(var[i] * var[j])
- return corr
-
-def log_loop(iterable):
- '''Loop over iterable and display counter on stdout.'''
-
- for i, x in enumerate(iterable):
- sys.stdout.write('%05d\r' % i)
- sys.stdout.flush()
- yield x
-
-def rename(fname, directory, inset=None):
- '''Change directory and extension of file name.
- New extension is inset+original extension.'''
-
- res = directory + os.sep + os.path.basename(fname)
- if inset:
- name, orig_ext = os.path.splitext(res)
- res = name + inset + orig_ext
- return res
-
-class Amuse(object):
- """
- TODO: document class
- """
-
- @classmethod
- def fromfiles(cls, mean, pca_weights, tica_weights, time_column=False):
- '''Initialize from files.'''
-
- amuse = cls(time_column)
- amuse.mean = numpy.genfromtxt(mean)
- error = Exception('Number of dimensions in mean and matrix does not agree.')
- if pca_weights:
- amuse.pca_weights = numpy.genfromtxt(pca_weights)
- if not amuse.pca_weights.shape[0] == amuse.pca_weights.shape[1] == amuse.mean.shape[0]:
- raise error
- if tica_weights:
- amuse.tica_weights = numpy.genfromtxt(tica_weights)
- if not amuse.tica_weights.shape[0] == amuse.tica_weights.shape[1] == amuse.mean.shape[0]:
- raise error
- amuse.n = amuse.mean.shape[0]
- return amuse
-
- @classmethod
- def compute(cls, files, lag, normalize=False, time_column=False, mean=None):
- """
- compute it from given files
- """
- amuse = cls(time_column)
- if not files:
- raise Exception('No input trajectories were given.')
-
- # case for a single file
- if not type(files) == list:
- files = [files]
-
- ''' import correlation covariance C extension module '''
- from pyemma.coordinates.transform import cocovar
-
-
- # calculate mean
- if mean is None:
- log.info('computing mean')
- mean_stats = {}
- for f in log_loop(files):
- cocovar.run(f, mean_stats, True, False, False, False, 0)
- amuse.mean = mean_stats['mean'] / mean_stats['samples']
- else:
- # use mean specified by the user
- if amuse.time_column:
- amuse.mean = numpy.hstack((numpy.zeros(1), mean))
- else:
- amuse.mean = mean
-
- # calculate rest of statistics
- log.info('computing covariances')
- stats = {'mean': amuse.mean}
- for f in log_loop(files):
- cocovar.run(f, stats, False, False, True, True, lag)
-
- amuse.n = stats['cov'].shape[0]
-
- cov = stats['cov'] / stats['samples']
- if stats['tcov_samples'] != 0:
- tcov = stats['tcov'] / stats['tcov_samples']
- else:
- tcov = stats['tcov_samples']
- amuse.var = cov.diagonal()
-
- if normalize:
- corr = correlation(cov, amuse.var)
- tcorr = correlation(tcov, amuse.var)
- else:
- corr = cov
- tcorr = tcov
-
- # symmetrization
- tcorr = 0.5 * (tcorr + numpy.transpose(tcorr))
-
- # remove time column
- if amuse.time_column:
- corr = corr[1:, 1:]
- tcorr = tcorr[1:, 1:]
- amuse.var = amuse.var[1:]
- amuse.mean = amuse.mean[1:]
- amuse.n = amuse.n - 1
-
- amuse.pca_values, amuse.pca_weights = numpy.linalg.eigh(corr)
- # normalize weights by dividing by the standard deviation of the pcs
- for i,l in enumerate(amuse.pca_values):
- if l == 0.0:
-            print >> sys.stderr, "Warning: variance of PC #%d is zero. Will "\
- "ignore its contribution to the ICs." % i
- # remove contribution of this PC by setting the weights to zero
- amuse.pca_weights[:,i] *= 0.0
- else:
- amuse.pca_weights[:,i] /= numpy.sqrt(l)
-
-
- pc_tcorr = numpy.dot(numpy.dot(numpy.transpose(amuse.pca_weights), tcorr), amuse.pca_weights)
- amuse.tica_values, amuse.intermediate_weights = numpy.linalg.eigh(pc_tcorr)
- amuse.tica_weights = numpy.dot(amuse.pca_weights, amuse.intermediate_weights)
-
-    # sort eigenvalues and eigenvectors
- sort_perm = numpy.argsort(numpy.abs(amuse.pca_values))[::-1]
- amuse.pca_values = amuse.pca_values[sort_perm]
- amuse.pca_weights = amuse.pca_weights[:, sort_perm]
-
- sort_perm = numpy.argsort(numpy.abs(amuse.tica_values))[::-1]
- amuse.tica_values = amuse.tica_values[sort_perm]
- amuse.tica_weights = amuse.tica_weights[:, sort_perm]
-
- # if the transformation involves scaling of the original coordinates
- # using standard deviation, include this transformation in the matrices
- if normalize:
- std = numpy.sqrt(amuse.var)
- amuse.tica_weights = numpy.transpose(numpy.transpose(amuse.tica_weights) / std)
- amuse.pca_weights = numpy.transpose(numpy.transpose(amuse.pca_weights) / std)
-
- amuse.corr = corr
- amuse.tcorr = tcorr
-
- return amuse
-
- def __init__(self, time_column):
- """
- TODO: document this
- """
- self.time_column = time_column
-
- def pca(self, fin, fout, keep_pc):
- '''Perform PCA on data'''
- self.project(fin, fout, self.pca_weights, keep_pc)
-
- def tica(self, fin, fout, keep_ic):
- '''Perform TICA on data'''
- self.project(fin, fout, self.tica_weights, keep_ic)
-
- def project(self, fin, fout, weights, keep_n=None):
- """
- project it
- """
- if fin == fout:
- raise Exception('Input file name is equal to output file name.')
- if not keep_n:
- keep_n = self.n # keep all
-
- ''' import correlation covariance C extension module '''
- from pyemma.coordinates.transform import cocovar
- cocovar.project(fin, fout, self.mean, weights, keep_n, self.time_column)
diff --git a/pyemma/coordinates/transform/transformer.py b/pyemma/coordinates/transform/transformer.py
new file mode 100644
index 000000000..9a15c9022
--- /dev/null
+++ b/pyemma/coordinates/transform/transformer.py
@@ -0,0 +1,348 @@
+__author__ = 'noe'
+
+from pyemma.util.log import getLogger
+
+import numpy as np
+
+
+log = getLogger('Transformer')
+__all__ = ['Transformer']
+
+
+class Transformer(object):
+
+    """
+    Base class for chunked transformations of trajectory data.
+
+ Parameters
+ ----------
+ chunksize : int (optional)
+ the chunksize used to batch process underlying data
+ lag : int (optional)
+ if you want to process time lagged data, set this to a value > 0.
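+
+    Example
+    -------
+    Subclasses are chained into pipelines and can be iterated over chunk by
+    chunk; here ``t`` is an illustrative placeholder for a parametrized
+    instance:
+
+    >>> for itraj, chunk in t:
+    ...     print itraj, chunk.shape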
+ """
+
+ def __init__(self, chunksize=100, lag=0):
+ self.chunksize = chunksize
+ self._lag = lag
+ self._in_memory = False
+ self._dataproducer = None
+
+ @property
+ def data_producer(self):
+        """the source this transformer gets its data from"""
+ return self._dataproducer
+
+ @data_producer.setter
+ def data_producer(self, dp):
+ self._dataproducer = dp
+
+ @property
+ def chunksize(self):
+ """chunksize defines how much data is being processed at once."""
+ return self._chunksize
+
+ @chunksize.setter
+ def chunksize(self, size):
+        assert size >= 0, "chunksize has to be non-negative"
+ self._chunksize = int(size)
+
+ @property
+ def in_memory(self):
+ """are results stored in memory?"""
+ return self._in_memory
+
+ def operate_in_memory(self):
+ """
+ If called, the output will be stored in memory
+ """
+ self._in_memory = True
+ # output data
+ self.Y = [np.zeros((self.trajectory_length(itraj), self.dimension()))
+ for itraj in xrange(self.number_of_trajectories())]
+
+ @property
+ def lag(self):
+        """lag time at which a second, time-lagged data stream is processed.
+        """
+ return self._lag
+
+ @lag.setter
+ def lag(self, lag):
+        assert lag >= 0, "lag time has to be non-negative."
+ self._lag = int(lag)
+
+ def number_of_trajectories(self):
+ """
+ Returns the number of trajectories.
+
+ Returns
+ -------
+ int : number of trajectories
+ """
+ return self.data_producer.number_of_trajectories()
+
+ def trajectory_length(self, itraj):
+ """
+ Returns the length of trajectory with given index.
+
+ Parameters
+ ----------
+ itraj : int
+ trajectory index
+
+ Returns
+ -------
+ int : length of trajectory
+ """
+ return self.data_producer.trajectory_length(itraj)
+
+ def trajectory_lengths(self):
+ """
+ Returns the length of each trajectory.
+
+ Returns
+ -------
+        list of int : length of each trajectory
+ """
+ return self.data_producer.trajectory_lengths()
+
+ def n_frames_total(self):
+ """
+ Returns total number of frames.
+
+ Returns
+ -------
+ int : n_frames_total
+ """
+ return self.data_producer.n_frames_total()
+
+ def get_memory_per_frame(self):
+ """
+ Returns the memory requirements per frame, in bytes
+ """
+ return 4 * self.dimension()
+
+ def get_constant_memory(self):
+ """Returns the constant memory requirements, in bytes."""
+ return 0
+
+ def describe(self):
+ """ get a representation of this Transformer"""
+ return self.__str__()
+
+ def parametrize(self):
+ r""" parametrize this Transformer
+ """
+ # check if ready
+ if self.data_producer is None:
+ raise RuntimeError('Called parametrize of %s while data producer is not'
+ ' yet set. Ensure "data_producer" attribute is set!'
+ % self.describe())
+ # init
+ self.param_init()
+ # feed data, until finished
+ add_data_finished = False
+ ipass = 0
+
+ # parametrize
+ while not add_data_finished:
+ first_chunk = True
+ self.data_producer.reset()
+ # iterate over trajectories
+ last_chunk = False
+ itraj = 0
+ lag = self._lag
+ while not last_chunk:
+ last_chunk_in_traj = False
+ t = 0
+ while not last_chunk_in_traj:
+ # iterate over times within trajectory
+ if lag == 0:
+ X = self.data_producer.next_chunk()
+ Y = None
+ else:
+ if self.trajectory_length(itraj) <= lag:
+                        log.error(
+                            "trajectory nr %i too short, skipping it" % itraj)
+ break
+ X, Y = self.data_producer.next_chunk(lag=lag)
+ L = np.shape(X)[0]
+ # last chunk in traj?
+ last_chunk_in_traj = (
+ t + lag + L >= self.trajectory_length(itraj))
+ # last chunk?
+ last_chunk = (
+ last_chunk_in_traj and itraj >= self.number_of_trajectories() - 1)
+ # first chunk
+ add_data_finished = self.param_add_data(
+ X, itraj, t, first_chunk, last_chunk_in_traj, last_chunk, ipass, Y=Y)
+ first_chunk = False
+ # increment time
+ t += L
+ # increment trajectory
+ itraj += 1
+ ipass += 1
+ # finish parametrization
+ self.param_finish()
+ # memory mode? Then map all results
+ if self.in_memory:
+ self.map_to_memory()
+
+ def param_init(self):
+ """
+ Initializes the parametrization.
+ """
+ # create mean array and covariance matrix
+ pass
+
+ def param_finish(self):
+ """
+ Finalizes the parametrization.
+ """
+ pass
+
+ def map_to_memory(self):
+ """maps results to memory. Will be stored in attribute :attr:`Y`."""
+ # if operating in main memory, do all the mapping now
+ self.data_producer.reset()
+ # iterate over trajectories
+ last_chunk = False
+ itraj = 0
+ while not last_chunk:
+ last_chunk_in_traj = False
+ t = 0
+ while not last_chunk_in_traj:
+ X = self.data_producer.next_chunk()
+ L = np.shape(X)[0]
+ # last chunk in traj?
+ last_chunk_in_traj = (t + L >= self.trajectory_length(itraj))
+ # last chunk?
+ last_chunk = (
+ last_chunk_in_traj and itraj >= self.number_of_trajectories() - 1)
+ # write
+ self.Y[itraj][t:t + L] = self.map(X)
+ # increment time
+ t += L
+ # increment trajectory
+ itraj += 1
+
+ def reset(self):
+ """reset data position"""
+ if self.in_memory:
+ # operate in memory, implement iterator here
+ self.itraj = 0
+ self.t = 0
+ else:
+ # operate in pipeline
+ self.data_producer.reset()
+
+ def next_chunk(self, lag=0):
+ """
+ transforms next available chunk from either in memory data or internal
+ data_producer
+
+ Parameters
+ ----------
+ lag : int
+ time delay of second data source.
+
+ Returns
+ -------
+ X, (Y if lag > 0) : array_like
+ mapped (transformed) data
+ """
+ if self.in_memory:
+ if self.itraj >= self.number_of_trajectories():
+ return None
+ # operate in memory, implement iterator here
+ traj_len = self.trajectory_length(self.itraj)
+ if lag == 0:
+ Y = self.Y[self.itraj][
+ self.t:min(self.t + self.chunksize, traj_len)]
+ # increment counters
+ self.t += self.chunksize
+ if self.t >= traj_len:
+ self.itraj += 1
+ self.t = 0
+ return Y
+ else:
+ Y0 = self.Y[self.itraj][
+ self.t:min(self.t + self.chunksize, traj_len)]
+ Ytau = self.Y[self.itraj][
+ self.t + lag:min(self.t + self.chunksize + lag, traj_len)]
+ # increment counters
+ self.t += self.chunksize
+                if self.t >= traj_len:
+                    self.itraj += 1
+                    self.t = 0
+ return (Y0, Ytau)
+ else:
+ # operate in pipeline
+ if lag == 0:
+ X = self.data_producer.next_chunk()
+ return self.map(X)
+ else:
+ (X0, Xtau) = self.data_producer.next_chunk(lag=lag)
+ return (self.map(X0), self.map(Xtau))
+
+ def __iter__(self):
+ self.reset()
+ self.last_chunk = False
+ self.itraj = 0
+ self.t = 0
+ return self
+
+ def next(self):
+ """ enable iteration over transformed data.
+
+ Returns
+ -------
+        (itraj, X) : (int, ndarray(n, m))
+ itraj corresponds to input sequence number (eg. trajectory index)
+ and X is the transformed data, n = chunksize or n < chunksize at end
+ of input.
+
+ """
+ # iterate over trajectories
+ if self.itraj >= self.number_of_trajectories():
+ raise StopIteration
+
+ X = self.data_producer.next_chunk()
+ L = np.shape(X)[0]
+ self.t += L
+ last_itraj = self.itraj
+ if self.t >= self.trajectory_length(self.itraj):
+ self.itraj += 1
+ self.t = 0
+ return (last_itraj, self.map(X))
+
+ @staticmethod
+ def distance(x, y):
+        """ calculates the euclidean distance between x and y.
+
+ Parameters
+ ----------
+ x : ndarray(n)
+ y : ndarray(n)
+
+ Returns
+ -------
+ d : float
+ euclidean distance
+ """
+ return np.linalg.norm(x - y, 2)
+
+ @staticmethod
+ def distances(x, Y):
+        """ calculates the euclidean distances between x and a set of points Y.
+
+ Parameters
+ ----------
+ x : ndarray(n)
+ Y : ndarray(m, n)
+
+ Returns
+ -------
+ distances : ndarray(m)
+            euclidean distances from each point in Y to x
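+
+        For example (numpy is imported as ``np`` in this module):
+
+        >>> Transformer.distances(np.zeros(2), np.array([[3., 4.], [6., 8.]]))
+        array([  5.,  10.])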
+ """
+ return np.linalg.norm(Y - x, 2, axis=1)
diff --git a/pyemma/coordinates/transform/util.py b/pyemma/coordinates/transform/util.py
deleted file mode 100644
index 4342ccc59..000000000
--- a/pyemma/coordinates/transform/util.py
+++ /dev/null
@@ -1,37 +0,0 @@
-'''
-Created on Jan 5, 2014
-
-@author: noe
-'''
-
-import numpy as np
-
-def project(x, T, ndim = None):
- """
- Projects vector x onto the coordinate system T
-
- Performs a linear projection of vector x to a coordinate system
- given by the column vectors in T. This projection is used in linear
- dimension reduction methods such as PCA and TICA, where T are the
- respective eigenvector matrices.
-
- Parameters
- ----------
- x : ndarray (n)
- coordinate vector to be projected
- T : ndarray (n x n)
- coordinate system with vectors in its columns
- ndim = None : int
- Dimension of the output vector. Only the first ndim vectors of T
- will be used for projection. When set to None (default), no dimension
- reduction will be made, and the output vector has size n.
-
- Returns
- -------
- Projected coordinate vector
-
- """
- if (ndim is None):
- return np.dot(x.T, T)
- else:
- return np.dot(x.T, T[:, 0: ndim])
\ No newline at end of file
diff --git a/pyemma/coordinates/util/__init__.py b/pyemma/coordinates/util/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/pyemma/coordinates/util/chaining.py b/pyemma/coordinates/util/chaining.py
new file mode 100644
index 000000000..1d87e82e1
--- /dev/null
+++ b/pyemma/coordinates/util/chaining.py
@@ -0,0 +1,26 @@
+'''
+Created on 17.02.2015
+
+@author: marscher
+'''
+
+
+def build_chain(transformers, chunksize=None):
+    """
+    Utility to wire the given transformers into a pipeline: each element
+    becomes the data producer of its successor, and optionally a common
+    chunksize is set on all of them.
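+
+    A usage sketch; ``reader``, ``tica`` and ``clustering`` are illustrative
+    placeholders for a data source and two transformers:
+
+    >>> chain = build_chain([reader, tica, clustering], chunksize=10000)
+    >>> run_chain(chain)
+    """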
+
+ for i in xrange(1, len(transformers)):
+ transformers[i].data_producer = transformers[i - 1]
+
+ if chunksize is not None:
+ for t in transformers:
+ t.chunksize = chunksize
+
+ return transformers
+
+
+def run_chain(chain):
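+    """Parametrizes each element of the chain in order, so that every
+    transformer consumes the output of its already parametrized predecessor."""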
+ for c in chain:
+ c.parametrize()
diff --git a/pyemma/coordinates/util/stat.py b/pyemma/coordinates/util/stat.py
new file mode 100644
index 000000000..17c59556d
--- /dev/null
+++ b/pyemma/coordinates/util/stat.py
@@ -0,0 +1,100 @@
+import numpy as np
+
+__author__ = 'Fabian Paul'
+__all__ = ['subsample', 'hist']
+
+
+def subsample(transform, dimensions, stride=1):
+ '''Returns in-memory trajectories of the transformed data, optionally
+ reduced in the number of dimensions and/or time resolution.
+
+ Parameters
+ ----------
+    transform : pyemma.coordinates.transform.Transformer object
+ transform that provides the input data
+ dimensions : tuple of indexes
+ indices of dimensions you like to keep
+ stride : int
+ only take every n'th frame
+
+ Returns
+ -------
+ list of (traj_length[i]/stride,len(dimensions)) ndarrays
+
+ Notes
+ -----
+    This function may be RAM intensive if the stride is too small or
+ too many dimensions are selected.
+
+ Example
+ -------
+    Plotting the trajectories (in an IPython notebook, enable inline
+    plotting with ``%matplotlib inline`` first):
+
+    >>> import matplotlib.pyplot as plt
+    >>> trajs = subsample(transform, dimensions=(0,), stride=100)
+    >>> for traj in trajs:
+    ...     plt.figure()
+    ...     plt.plot(traj[:, 0])
+ '''
+ trajs = [np.zeros((0, len(dimensions)))
+ for _ in xrange(transform.number_of_trajectories())]
+ for i, chunk in transform:
+ trajs[i] = np.concatenate((trajs[i], chunk[::stride, dimensions]))
+ return trajs
+
+
+def hist(transform, dimensions, nbins):
+ '''Computes the N-dimensional histogram of the transformed data.
+
+ Parameters
+ ----------
+    transform : pyemma.coordinates.transform.Transformer object
+ transform that provides the input data
+ dimensions : tuple of indices
+ indices of the dimensions you want to examine
+ nbins : tuple of ints
+ number of bins along each dimension
+
+ Returns
+ -------
+    counts : (nbins[0]-1, nbins[1]-1, ...) ndarray of ints
+        counts compatible with pyplot.pcolormesh and pyplot.bar
+    edges : list of (nbins[i],) ndarrays
+ bin edges compatible with pyplot.pcolormesh and pyplot.bar,
+ see below.
+
+ Example
+ -------
+    (in an IPython notebook, enable inline plotting with ``%matplotlib inline`` first)
+
+    >>> import matplotlib.pyplot as plt
+
+ >>> counts,edges=hist(transform,dimensions=(0,1),nbins=(20,30))
+ >>> plt.pcolormesh(edges[0],edges[1],counts.T)
+
+ >>> counts,edges=hist(transform,dimensions=(1,),nbins=(50,))
+ >>> plt.bar(edges[0][:-1],counts,width=edges[0][1:]-edges[0][:-1])
+ '''
+ maximum = np.ones(len(dimensions)) * (-np.inf)
+ minimum = np.ones(len(dimensions)) * np.inf
+ # compute min and max
+ for _, chunk in transform:
+ maximum = np.max(
+ np.vstack((
+ maximum,
+ np.max(chunk[:, dimensions], axis=0))),
+ axis=0)
+ minimum = np.min(
+ np.vstack((
+ minimum,
+ np.min(chunk[:, dimensions], axis=0))),
+ axis=0)
+ # define bins
+ bins = [np.linspace(m, M, num=n)
+ for m, M, n in zip(minimum, maximum, nbins)]
+ hist = np.zeros(np.array(nbins) - 1)
+ # compute actual histogram
+ for _, chunk in transform:
+ part, _ = np.histogramdd(chunk[:, dimensions], bins=bins)
+ hist += part
+ return hist, bins
diff --git a/pyemma/msm/analysis/api.py b/pyemma/msm/analysis/api.py
index 45f026516..56500797b 100644
--- a/pyemma/msm/analysis/api.py
+++ b/pyemma/msm/analysis/api.py
@@ -1,8 +1,8 @@
r"""
-======================
-Emma2 MSM Analysis API
-======================
+=======================
+PyEMMA MSM Analysis API
+=======================
"""
diff --git a/pyemma/msm/estimation/api.py b/pyemma/msm/estimation/api.py
index 63c8e4ce7..257c00764 100644
--- a/pyemma/msm/estimation/api.py
+++ b/pyemma/msm/estimation/api.py
@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-
r"""
-========================
-Emma2 MSM Estimation API
-========================
+=========================
+PyEMMA MSM Estimation API
+=========================
"""
diff --git a/pyemma/msm/io/api.py b/pyemma/msm/io/api.py
index 011d5909d..651cdd406 100644
--- a/pyemma/msm/io/api.py
+++ b/pyemma/msm/io/api.py
@@ -1,8 +1,8 @@
r"""
-================
-Emma2 MSM io API
-================
+=================
+PyEMMA MSM io API
+=================
"""
diff --git a/pyemma/msm/ui/viewer.py b/pyemma/msm/ui/viewer.py
index 3e9e40e34..0b0931f65 100644
--- a/pyemma/msm/ui/viewer.py
+++ b/pyemma/msm/ui/viewer.py
@@ -1,11 +1,11 @@
import mdtraj as md
import IPython
-from mdtraj.html import TrajectoryView, enable_notebook
-import IPython
+from mdtraj.html import TrajectoryView, enable_notebook
+from IPython.html.widgets.widget_int import IntSliderWidget
-def view_traj(traj, topology_file=None, stride=1):
+def view_traj(traj, topology_file=None, stride=1, **kwargs):
r"""Opens a trajectory viewer (from mdtraj).
Parameters
@@ -19,14 +19,21 @@ def view_traj(traj, topology_file=None, stride=1):
stride : int
If traj is a file name, this is the number of frames
to skip between two successive trajectory reads.
-
+ **kwargs : optional arguments
+ are passed to `mdtraj.html.TrajectoryView`
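+
+    Example
+    -------
+    A sketch; the file names are illustrative:
+
+    >>> view_traj('traj.xtc', topology_file='top.pdb', stride=10)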
"""
if isinstance(traj, str):
traj = md.load(traj, top=topology_file, stride=stride)
- widget = md.html.TrajectoryView(traj, colorBy='atom')
+ # ensure we're able to use TrajectoryView
+ enable_notebook()
+
+    if 'colorBy' not in kwargs:
+ kwargs['colorBy'] = 'atom'
+
+ widget = TrajectoryView(traj, **kwargs)
IPython.display.display(widget)
- slider = IPython.html.widgets.IntSliderWidget(max=traj.n_frames - 1)
+ slider = IntSliderWidget(max=traj.n_frames - 1)
def on_value_change(name, val):
widget.frame = val
diff --git a/pyemma/pyemma.cfg b/pyemma/pyemma.cfg
index b8753a9f6..77be95950 100644
--- a/pyemma/pyemma.cfg
+++ b/pyemma/pyemma.cfg
@@ -1,3 +1,10 @@
+################################################################################
+# PyEMMA configuration file
+#
+# notes:
+# - inline comments are not allowed, since they would be appended to the value!
+################################################################################
+
[Logging]
enabled = True
toconsole = True
@@ -8,6 +15,8 @@ format = %%(asctime)s %%(name)-12s %%(levelname)-8s %%(message)s
[Java]
+# start the Java virtual machine on import?
+startup = False
# Optional classpath extension, empty by default.
# To define multiple classpathes, one should separate them with os specific path sep:
# ':' for Linux
diff --git a/pyemma/util/annotators.py b/pyemma/util/annotators.py
new file mode 100644
index 000000000..94b49bb4a
--- /dev/null
+++ b/pyemma/util/annotators.py
@@ -0,0 +1,71 @@
+"""
+doc_inherit decorator
+
+Usage:
+
+class Foo(object):
+ def foo(self):
+ "Frobber"
+ pass
+
+class Bar(Foo):
+ @doc_inherit
+ def foo(self):
+ pass
+
+Now, Bar.foo.__doc__ == Bar().foo.__doc__ == Foo.foo.__doc__ == "Frobber"
+"""
+
+from functools import wraps
+
+__all__ = ['doc_inherit']
+
+
+class DocInherit(object):
+
+ """
+ Docstring inheriting method descriptor
+
+ The class itself is also used as a decorator
+ """
+
+ def __init__(self, mthd):
+ self.mthd = mthd
+ self.name = mthd.__name__
+
+ def __get__(self, obj, cls):
+ if obj:
+ return self.get_with_inst(obj, cls)
+ else:
+ return self.get_no_inst(cls)
+
+ def get_with_inst(self, obj, cls):
+
+ overridden = getattr(super(cls, obj), self.name, None)
+
+ @wraps(self.mthd, assigned=('__name__', '__module__'))
+ def f(*args, **kwargs):
+ return self.mthd(obj, *args, **kwargs)
+
+ return self.use_parent_doc(f, overridden)
+
+ def get_no_inst(self, cls):
+
+ for parent in cls.__mro__[1:]:
+ overridden = getattr(parent, self.name, None)
+ if overridden:
+ break
+
+ @wraps(self.mthd, assigned=('__name__', '__module__'))
+ def f(*args, **kwargs):
+ return self.mthd(*args, **kwargs)
+
+ return self.use_parent_doc(f, overridden)
+
+ def use_parent_doc(self, func, source):
+ if source is None:
+ raise NameError("Can't find '%s' in parents" % self.name)
+ func.__doc__ = source.__doc__
+ return func
+
+doc_inherit = DocInherit
diff --git a/pyemma/util/config.py b/pyemma/util/config.py
index 4d7a47dda..039168060 100644
--- a/pyemma/util/config.py
+++ b/pyemma/util/config.py
@@ -10,7 +10,7 @@
1. $CWD/pyemma.cfg
2. /etc/pyemma.cfg
3. ~/pyemma.cfg
-4. $PYTHONPATH/Emma2/pyemma.cfg (always taken as default configuration file)
+4. $PYTHONPATH/pyemma/pyemma.cfg (always taken as default configuration file)
The default values are stored in later file to ensure these values are always
defined. This is preferred over hardcoding them somewhere in the Python code.
@@ -25,13 +25,21 @@
.. literalinclude:: ../../pyemma/pyemma.cfg
:language: ini
-To access the config at runtime eg. the logging section
+To access the config at runtime, e.g. the logging section:
.. code-block:: python
from pyemma.util.config import config
print config.Logging.level
+Notes
+-----
+All values are stored as strings, so to check e.g. whether a value is True,
+compare against the string:
+
+.. code-block:: python
+
+    if config['section'].my_bool == 'True':
+        pass
+
.. codeauthor:: Martin Scherer
@@ -112,7 +120,7 @@ def readConfiguration():
try:
import psutil
# available virtual memory in mb
- max_avail = psutil.virtual_memory().available / 1024**2
+ max_avail = psutil.virtual_memory().available / 1024 ** 2
maxheap = max(maxheap, max_avail)
except ImportError:
diff --git a/pyemma/util/linalg.py b/pyemma/util/linalg.py
index a4d180871..f51ab96fa 100644
--- a/pyemma/util/linalg.py
+++ b/pyemma/util/linalg.py
@@ -3,20 +3,21 @@
import numpy as np
import scipy.linalg
-def _sort_by_norm(evals,evecs):
+
+def _sort_by_norm(evals, evecs):
"""
Sorts the eigenvalues and eigenvectors by descending norm of the eigenvalues
Parameters
----------
- :param evals: ndarray(n)
+ evals: ndarray(n)
eigenvalues
- :param evecs: ndarray(n,n)
+ evecs: ndarray(n,n)
eigenvectors in a column matrix
- Returns:
- --------
- :return:
+ Returns
+ -------
+    (evals, evecs) : ndarray(n), ndarray(n,n)
the sorted eigenvalues and eigenvectors
"""
@@ -26,27 +27,28 @@ def _sort_by_norm(evals,evecs):
I = np.argsort(evnorms)[::-1]
# permute
evals2 = evals[I]
- evecs2 = evecs[:,I]
+ evecs2 = evecs[:, I]
# done
- return (evals2,evecs2)
+ return (evals2, evecs2)
-def eig_corr(C0, Ct, epsilon = 1e-6):
+def eig_corr(C0, Ct, epsilon=1e-6):
"""
Solve the generalized eigenvalues problem with correlation matrices C0 and Ct
- Parameters:
- -----------
+ Parameters
+ ----------
C0 : ndarray (n,n)
time-instantaneous correlation matrix. Must be symmetric positive definite
Ct : ndarray (n,n)
time-lagged correlation matrix. Must be symmetric
epsilon : float
- eigenvalue norm cutoff. Eigenvalues of C0 with norms <= epsilon will be cut off.
- The remaining number of Eigenvalues define the size of the output.
+ eigenvalue norm cutoff. Eigenvalues of C0 with norms <= epsilon will be
+        cut off. The remaining number of eigenvalues defines the size of
+        the output.
- Returns:
- --------
+ Returns
+ -------
l : ndarray (m)
The first m generalized eigenvalues, sorted by descending norm
R : ndarray (n,m)
@@ -60,7 +62,7 @@ def eig_corr(C0, Ct, epsilon = 1e-6):
# compute the Eigenvalues of C0 using Schur factorization
(S, V) = scipy.linalg.schur(C0)
s = np.diag(S)
- (s, V) = _sort_by_norm(s,V) # sort them
+ (s, V) = _sort_by_norm(s, V) # sort them
S = np.diag(s)
# determine the cutoff. We know that C0 is an spd matrix,
@@ -73,20 +75,20 @@ def eig_corr(C0, Ct, epsilon = 1e-6):
evnorms = np.abs(s)
n = np.shape(evnorms)[0]
m = n - np.searchsorted(evnorms[::-1], epsilon)
- Vm = V[:,0:m]
+ Vm = V[:, 0:m]
sm = s[0:m]
# transform Ct to orthogonal basis given by the eigenvectors of C0
Sinvhalf = 1.0 / np.sqrt(sm)
T = np.dot(np.diag(Sinvhalf), Vm.T)
- Ct_trans = np.dot(np.dot(T, Ct),T.T)
+ Ct_trans = np.dot(np.dot(T, Ct), T.T)
# solve the symmetric eigenvalue problem in the new basis
- (l,R_trans) = scipy.linalg.eigh(Ct_trans)
- (l,R_trans) = sort_by_norm(l,R_trans)
+ (l, R_trans) = scipy.linalg.eigh(Ct_trans)
+ (l, R_trans) = _sort_by_norm(l, R_trans)
# transform the eigenvectors back to the old basis
R = np.dot(T.T, R_trans)
# return result
- return (l, R)
\ No newline at end of file
+ return (l, R)
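eig_corr whitens Ct in the eigenbasis of C0 and then solves an ordinary symmetric eigenproblem, so the returned pair satisfies Ct R = C0 R diag(l) column by column. A small self-checking sketch, assuming C0 is well conditioned so that no eigenvalues fall below epsilon:

import numpy as np
from pyemma.util.linalg import eig_corr

n = 5
X = np.random.rand(n, n)
C0 = np.dot(X, X.T) + n * np.eye(n)  # symmetric positive definite
A = np.random.rand(n, n)
Ct = 0.5 * (A + A.T)                 # symmetric

l, R = eig_corr(C0, Ct)
# verify the generalized eigenvalue relation
assert np.allclose(np.dot(Ct, R), np.dot(np.dot(C0, R), np.diag(l)))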
diff --git a/pyemma/util/log.py b/pyemma/util/log.py
index a006f7262..8ff58b9af 100644
--- a/pyemma/util/log.py
+++ b/pyemma/util/log.py
@@ -3,16 +3,24 @@
@author: marscher
'''
-__all__ = ['getLogger', 'enabled']
+__all__ = ['getLogger', 'enabled', 'CRITICAL', 'DEBUG', 'FATAL', 'INFO', 'NOTSET',
+ 'WARN', 'WARNING']
import logging
+reload(logging)
+
+from logging import CRITICAL, FATAL, ERROR, WARNING, WARN, INFO, DEBUG, NOTSET
enabled = False
-""" set up a dummy logger if logging is disabled"""
+
class dummyLogger:
+
+ """ set up a dummy logger if logging is disabled"""
+
    def dummy(self, *args, **kwargs):
pass
+
def __getattr__(self, name):
return self.dummy
@@ -27,40 +35,53 @@ def setupLogging():
from pyemma.util.config import conf_values
args = conf_values['Logging']
- if args.enabled:
+ enabled = args.enabled == 'True'
+ toconsole = args.toconsole == 'True'
+ tofile = args.tofile == 'True'
+
+ if enabled:
try:
logging.basicConfig(level=args.level,
format=args.format,
datefmt='%d-%m-%y %H:%M:%S')
except IOError as ie:
import warnings
- warnings.warn('logging could not be initialized, because of %s' % ie)
+ warnings.warn(
+ 'logging could not be initialized, because of %s' % ie)
return
- """ in case we want to log to both file and stream, add a separate handler"""
+ # in case we want to log to both file and stream, add a separate handler
formatter = logging.Formatter(args.format)
+ root_logger = logging.getLogger('')
+ root_handlers = root_logger.handlers
- if args.tofile:
+ if toconsole:
+ ch = root_handlers[0]
+ ch.setLevel(args.level)
+ ch.setFormatter(formatter)
+    else:  # remove the first handler (which should be a StreamHandler)
+ assert len(root_handlers) == 1
+ streamhandler = root_handlers.pop()
+ assert isinstance(streamhandler, logging.StreamHandler)
+ if tofile:
# set delay to True, to prevent creation of empty log files
fh = logging.FileHandler(args.file, mode='a', delay=True)
fh.setFormatter(formatter)
fh.setLevel(args.level)
- logging.getLogger('').addHandler(fh)
- if args.toconsole and args.tofile:
- # since we have used basicConfig (without file args),
- # a StreamHandler is already used. Set our arguments.
- ch = logging.getLogger('').handlers[0]
- ch.setLevel(args.level)
- ch.setFormatter(formatter)
+ root_logger.addHandler(fh)
+
+    # if the user enabled logging but disabled both file and console output,
+    # disable logging completely.
+ if not tofile and not toconsole:
+ enabled = False
+ dummyInstance = dummyLogger()
else:
dummyInstance = dummyLogger()
- enabled = args.enabled
-
def getLogger(name=None):
if not enabled:
return dummyInstance
- """ if name is not given, return a logger with name of the calling module."""
+    # if no name is given, return a logger named after the calling module.
if not name:
import traceback
t = traceback.extract_stack(limit=2)
@@ -71,9 +92,8 @@ def getLogger(name=None):
name = path[pos:]
- #logging.getLogger().debug('getLogger set name to %s; path was %s' % (name, path))
return logging.getLogger(name)
# init logging
-setupLogging()
\ No newline at end of file
+setupLogging()
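With this rework getLogger degrades to a silent dummy whenever the [Logging] section has enabled != 'True', so call sites never need to guard their log statements. A usage sketch (the logger name is hypothetical):

from pyemma.util.log import getLogger

log = getLogger('pyemma.example')        # explicit name
log.info('transition matrix estimated')  # silently ignored if logging is disabled

anonymous = getLogger()  # falls back to the calling module's file name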
diff --git a/pyemma/util/pystallone.py b/pyemma/util/pystallone.py
index 5c0d90ae7..b489ea2cf 100644
--- a/pyemma/util/pystallone.py
+++ b/pyemma/util/pystallone.py
@@ -29,6 +29,7 @@
from __future__ import absolute_import
from pystallone import startJVM
from .log import getLogger as _getLogger
+from pyemma.util.config import conf_values as _conf_values
_log = _getLogger()
@@ -37,8 +38,7 @@ def _get_jvm_args():
reads in the configuration values for the java virtual machine and
returns a list containing them all.
"""
- from pyemma.util.config import conf_values
- java_conf = conf_values['Java']
+ java_conf = _conf_values['Java']
initHeap = '-Xms%s' % java_conf.initheap
maxHeap = '-Xmx%s' % java_conf.maxheap
# optargs may contain many options separated by whitespace, so we need a list
@@ -46,12 +46,14 @@ def _get_jvm_args():
optional_cp = "-Djava.class.path=%s" % java_conf.classpath
return [initHeap, maxHeap, optional_cp] + optionalArgs
-try:
- args = _get_jvm_args()
- startJVM(None, args)
-except:
- _log.exception("jvm startup failed")
- raise
+if _conf_values['Java'].startup == 'True':
+ #_log.debug("try to startup java")
+ try:
+ args = _get_jvm_args()
+ startJVM(None, args)
+ except:
+ _log.exception("jvm startup failed")
+ raise
# now import the rest of the symbols; the JVM is only running if startup was enabled.
from pystallone import *
\ No newline at end of file
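Because the gate reads conf_values at module import time, the startup flag has to be set before pyemma.util.pystallone (or anything importing it) is loaded for the first time. A hedged sketch of the required ordering, writing the INI file directly:

import os

# 1. ensure a config enabling the JVM exists *before* the first import
with open(os.path.expanduser('~/pyemma.cfg'), 'w') as fh:
    fh.write('[Java]\nstartup = True\n')

# 2. importing the module now calls startJVM(); with startup != 'True'
#    the import would succeed but leave the JVM stopped
import pyemma.util.pystallone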
diff --git a/setup.py b/setup.py
index 822da5166..220cc6313 100755
--- a/setup.py
+++ b/setup.py
@@ -72,7 +72,6 @@ def extensions():
2. src dist install (have pre-converted c files and pyx files)
a) cython present -> fine
b) no cython -> use .c files
-
"""
USE_CYTHON = False
try:
@@ -85,6 +84,8 @@ def extensions():
from setup_util import detect_openmp
openmp_enabled, needs_gomp = detect_openmp()
+ exts = []
+
mle_trev_given_pi_dense_module = \
Extension('pyemma.msm.estimation.dense.mle_trev_given_pi',
sources=['pyemma/msm/estimation/dense/mle_trev_given_pi.pyx',
@@ -102,17 +103,21 @@ def extensions():
sources=['pyemma/msm/estimation/sparse/mle_trev.pyx',
'pyemma/msm/estimation/sparse/_mle_trev.c'])
- mle_trev_module = [mle_trev_given_pi_dense_module,
- mle_trev_given_pi_sparse_module,
- mle_trev_sparse_module]
-
+ exts += [mle_trev_given_pi_dense_module,
+ mle_trev_given_pi_sparse_module,
+ mle_trev_sparse_module]
+
+# kahan_sum = Extension('pyemma.coordinates.coordinate_transformation.exts.stable_sum',
+# sources=['pyemma/coordinates/coordinate_transformation/exts/stable_sum.pyx'],
+# extra_compile_args = ['-g'])
+#
+# update_mean = Extension('pyemma.coordinates.coordinate_transformation.exts.fmath_wrapper',
+# sources=['pyemma/coordinates/coordinate_transformation/exts/fmath_wrapper.pyx'],)
+#
+# #exts += [kahan_sum]
+# exts += [update_mean]
if USE_CYTHON: # if we have cython available now, cythonize module
- mle_trev_module = cythonize(mle_trev_module)
-
- #
- cocovar = Extension('pyemma.coordinates.transform.cocovar',
- sources=['pyemma/coordinates/transform/cocovar.c'])
- exts = mle_trev_module + [cocovar]
+ exts = cythonize(exts)
if openmp_enabled:
warnings.warn('enabled openmp')
@@ -210,7 +215,10 @@ def script_entry_points():
# runtime dependencies
install_requires=['numpy>=1.6.0',
'scipy>=0.11',
- 'pystallone>=1.0.0b3'],
+ 'pystallone>=1.0.0b3',
+ 'mdtraj',
+ 'scikit-learn',
+ 'psutil'],
zip_safe=False,
)
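The extensions() rework collects every Extension in a single exts list and cythonizes the lot only when Cython is importable; source distributions then fall back to the pre-generated .c files. The bare pattern looks like this (the module name is hypothetical; the real extensions above list both the .pyx and .c sources explicitly):

from setuptools import Extension

try:
    from Cython.Build import cythonize
    USE_CYTHON = True
except ImportError:
    USE_CYTHON = False

# an sdist would ship pkg/fastmod.c next to the .pyx for the no-Cython case
suffix = '.pyx' if USE_CYTHON else '.c'
exts = [Extension('pkg.fastmod', sources=['pkg/fastmod' + suffix])]

if USE_CYTHON:
    exts = cythonize(exts)  # translates .pyx -> .c and rewrites the source lists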