refactor: use factories for common operations

prokolyvakis · Oct 3, 2023 · 90827f5 · 90827f5
1 parent 2a7b14a
commit 90827f5
Show file tree

Hide file tree

Showing 5 changed files with 103 additions and 133 deletions.
diff --git a/experiments/common.py b/experiments/common.py
@@ -2,6 +2,15 @@
 import numpy as np
 import plotly.graph_objects as go
 
+from mudpod.clustering import DipMeans
+from mudpod.projections import IdentityProjector
+from mudpod.projections import JohnsonLindenstrauss
+from mudpod.observer import PercentileObserver
+from mudpod.observer import RandomObserver
+from mudpod.projections import View
+from mudpod.unimodality import UnimodalityTest
+from mudpod.unimodality import MonteCarloUnimodalityTest
+
 
 def plot_clustered_data(data: np.ndarray, labels: np.ndarray) -> None:
     """Plots clustered data.
@@ -61,3 +70,78 @@ def group_data_points(data: np.ndarray, clusters: np.ndarray) -> list[np.ndarray
     m = m[m[:, -1].argsort()]
     m = np.split(m[:, :-1], np.unique(m[:, -1], return_index=True)[1][1:])
     return m
+
+
+def get_view(arguments: dict) -> View:
+    """Build the View described by the parsed config parameters.
+
+    Args:
+        arguments: a dict holding the '<pj>', '--dist' and '--obs' entries.
+    Returns:
+        A View made of the selected projector, observer and distance.
+    Raises:
+        ValueError: if the projection or the observer type is not supported.
+    """
+    projection_type = str(arguments['<pj>'])
+    if projection_type not in ('jl', 'i'):
+        raise ValueError(
+            f'The projection type: {projection_type} is not supported!'
+        )
+    is_jl = projection_type == 'jl'
+    projector = JohnsonLindenstrauss() if is_jl else IdentityProjector()
+
+    distance = str(arguments['--dist'])
+    observer_type = str(arguments['--obs'])
+    if observer_type == 'percentile':
+        # NOTE(review): 0.99 is hard-coded; presumably the percentile - confirm.
+        observer = PercentileObserver(0.99, distance)
+    elif observer_type == 'random':
+        observer = RandomObserver()
+    else:
+        raise ValueError(f'The observer type: {observer_type} is not supported!')
+
+    return View(projector, observer, distance)
+
+
+def get_monte_carlo_test(arguments: dict, workers_num: int = 1) -> MonteCarloUnimodalityTest:
+    """Assemble a Monte Carlo unimodality test from the config parameters.
+
+    Args:
+        arguments: a dict holding the view options plus '<pv>' and '<sims>'.
+        workers_num: the value forwarded as the test's `workers_num`.
+    Returns:
+        A MonteCarloUnimodalityTest wrapping a UnimodalityTest on the view.
+    """
+    view = get_view(arguments)
+    p_value = float(arguments['<pv>'])
+    base_test = UnimodalityTest(view, p_value)
+
+    simulations = int(arguments['<sims>'])
+    return MonteCarloUnimodalityTest(
+        base_test,
+        sim_num=simulations,
+        workers_num=workers_num,
+    )
+
+
+def get_dip_means(arguments: dict, seed: int, workers_num: int = 1) -> DipMeans:
+    """Assemble a DipMeans clustering instance from the config parameters.
+
+    Args:
+        arguments: a dict holding the view options plus '<pv>' and '<sims>'.
+        seed: the value forwarded as the instance's `random_state`.
+        workers_num: the value forwarded as the instance's `workers_num`.
+    Returns:
+        A DipMeans instance configured with the view built by `get_view`.
+    """
+    view = get_view(arguments)
+    p_value = float(arguments['<pv>'])
+    simulations = int(arguments['<sims>'])
+
+    return DipMeans(
+        view=view,
+        pval=p_value,
+        sim_num=simulations,
+        workers_num=workers_num,
+        random_state=seed,
+    )
diff --git a/experiments/real/pre_clustering.py b/experiments/real/pre_clustering.py
@@ -27,14 +27,9 @@
 from sklearn.metrics import silhouette_score
 from umap import UMAP
 
+from experiments.common import get_dip_means
 from experiments.common import plot_clustered_data
 from mudpod.misc import set_seed
-from mudpod.clustering import DipMeans
-from mudpod.projections import IdentityProjector
-from mudpod.projections import JohnsonLindenstrauss
-from mudpod.observer import PercentileObserver
-from mudpod.observer import RandomObserver
-from mudpod.projections import View
 
 
 logger.remove()
@@ -72,40 +67,17 @@ def get_data(
 if __name__ == "__main__":
     arguments = docopt(__doc__)
 
+    SEED = int(arguments['--seed'])
+    set_seed(SEED)
+
     n_samples = arguments['--samples'] or None
     if n_samples is not None:
         n_samples = int(n_samples)
     x, y = get_data(Path(arguments['<p>']), samples=n_samples)
 
-    SEED = int(arguments['--seed'])
-    set_seed(SEED)
-
-    pt = str(arguments['<pj>'])
-    if pt == 'jl':
-        p = JohnsonLindenstrauss()
-    elif pt == 'i':
-        p = IdentityProjector()
-    else:
-       raise ValueError(f'The projection type: {pt} is not supported!')
-
-
-    dt = str(arguments['--dist'])
-    ot = str(arguments['--obs'])
-    if ot == 'percentile':
-        o = PercentileObserver(0.99, dt)
-    elif ot == 'random':
-        o = RandomObserver()
-    else:
-       raise ValueError(f'The observer type: {ot} is not supported!')
-
-    v = View(p, o, dt)
-
-    dm = DipMeans(
-        view=v,
-        pval=float(arguments['<pv>']),
-        sim_num=int(arguments['<sims>']),
-        workers_num=1,
-        random_state=SEED
+    dm = get_dip_means(
+        arguments=arguments,
+        seed=SEED
     )
 
     clusters = dm.fit(x).labels_

diff --git a/experiments/synthetic/clustering.py b/experiments/synthetic/clustering.py
@@ -19,14 +19,9 @@
 from loguru import logger
 from sklearn.metrics import normalized_mutual_info_score
 
+from experiments.common import get_dip_means
 from experiments.common import plot_clustered_data
 from experiments.synthetic.misc import load
-from mudpod.clustering import DipMeans
-from mudpod.projections import IdentityProjector
-from mudpod.projections import JohnsonLindenstrauss
-from mudpod.observer import PercentileObserver
-from mudpod.observer import RandomObserver
-from mudpod.projections import View
 from mudpod.misc import set_seed
 
 
@@ -39,37 +34,15 @@
 if __name__ == "__main__":
     arguments = docopt(__doc__)
 
-    x, y = load(str(arguments['<d>']))
-
     SEED = int(arguments['--seed'])
     set_seed(SEED)
 
-    pt = str(arguments['<pj>'])
-    if pt == 'jl':
-        p = JohnsonLindenstrauss()
-    elif pt == 'i':
-        p = IdentityProjector()
-    else:
-       raise ValueError(f'The projection type: {pt} is not supported!')
-
-
-    dt = str(arguments['--dist'])
-    ot = str(arguments['--obs'])
-    if ot == 'percentile':
-        o = PercentileObserver(0.99, dt)
-    elif ot == 'random':
-        o = RandomObserver()
-    else:
-       raise ValueError(f'The observer type: {ot} is not supported!')
-
-    v = View(p, o, dt)
+    x, y = load(str(arguments['<d>']))
 
-    dm = DipMeans(
-        view=v,
-        pval=float(arguments['<pv>']),
-        sim_num=int(arguments['<sims>']),
-        workers_num=1,
-        random_state=SEED
+
+    dm = get_dip_means(
+        arguments=arguments,
+        seed=SEED
     )
 
     clusters = dm.fit(x).labels_

diff --git a/experiments/synthetic/two_gaussians_mix.py b/experiments/synthetic/two_gaussians_mix.py
@@ -19,16 +19,10 @@
 from docopt import docopt
 from loguru import logger
 
+from experiments.common import get_monte_carlo_test
 from experiments.common import plot_clustered_data
 from experiments.synthetic.misc import TwoDimGaussianSumGenerator
 from mudpod.misc import set_seed
-from mudpod.projections import IdentityProjector
-from mudpod.projections import JohnsonLindenstrauss
-from mudpod.observer import PercentileObserver
-from mudpod.observer import RandomObserver
-from mudpod.projections import View
-from mudpod.unimodality import UnimodalityTest
-from mudpod.unimodality import MonteCarloUnimodalityTest
 
 
 logger.remove()
@@ -43,31 +37,6 @@
     SEED = int(arguments['--seed'])
     set_seed(SEED)
 
-    pt = str(arguments['<pj>'])
-    if pt == 'jl':
-        p = JohnsonLindenstrauss()
-    elif pt == 'i':
-        p = IdentityProjector()
-    else:
-       raise ValueError(f'The projection type: {pt} is not supported!')
-
-    dt = str(arguments['--dist'])
-    ot = str(arguments['--obs'])
-    if ot == 'percentile':
-        o = PercentileObserver(0.99, dt)
-    elif ot == 'random':
-        o = RandomObserver()
-    else:
-       raise ValueError(f'The observer type: {ot} is not supported!')
-
-    v = View(p, o, dt)
-    t = UnimodalityTest(v, float(arguments['<pv>']))
-    mct = MonteCarloUnimodalityTest(
-        t,
-        sim_num=int(arguments['<sims>']),
-        workers_num=1
-    )
-
     n_samples = int(arguments['--samples'])
     std = float(arguments['--noise'])
     g = TwoDimGaussianSumGenerator(
@@ -76,6 +45,8 @@
       random_state=SEED
     )
 
+    mct = get_monte_carlo_test(arguments=arguments, workers_num=1)
+
     tr = 'unimodal' if mct.test(g.x) else 'bimodal'
     msg = dict(arguments)
     msg['groundtruth'] = g.t

diff --git a/experiments/synthetic/unimodality.py b/experiments/synthetic/unimodality.py
@@ -27,15 +27,9 @@
 from sklearn.datasets import load_digits
 from sklearn.datasets import load_iris
 
+from experiments.common import get_monte_carlo_test
 from experiments.common import plot_clustered_data
 from mudpod.misc import set_seed
-from mudpod.projections import IdentityProjector
-from mudpod.projections import JohnsonLindenstrauss
-from mudpod.observer import PercentileObserver
-from mudpod.observer import RandomObserver
-from mudpod.projections import View
-from mudpod.unimodality import UnimodalityTest
-from mudpod.unimodality import MonteCarloUnimodalityTest
 
 logger.remove()
 # add a new handler with level set to INFO
@@ -63,37 +57,13 @@ def get_dataset(name: str) -> Callable:
     SEED = int(arguments['--seed'])
     set_seed(SEED)
 
-    pt = str(arguments['<pj>'])
-    if pt == 'jl':
-        p = JohnsonLindenstrauss()
-    elif pt == 'i':
-        p = IdentityProjector()
-    else:
-       raise ValueError(f'The projection type: {pt} is not supported!')
-
-
-    dt = str(arguments['--dist'])
-    ot = str(arguments['--obs'])
-    if ot == 'percentile':
-        o = PercentileObserver(0.99, dt)
-    elif ot == 'random':
-        o = RandomObserver()
-    else:
-       raise ValueError(f'The observer type: {ot} is not supported!')
-
-    v = View(p, o, dt)
-    t = UnimodalityTest(v, float(arguments['<pv>']))
-    mct = MonteCarloUnimodalityTest(
-        t,
-        sim_num=int(arguments['<sims>']),
-        workers_num=1
-    )
-
     data_func = get_dataset(str(arguments['<d>']))
     n_samples = int(arguments['--samples'])
     noise = float(arguments['--noise'])
     x, y = data_func(n_samples=n_samples, noise=noise, random_state=SEED)
 
+    mct = get_monte_carlo_test(arguments=arguments, workers_num=1)
+
     msg = dict(arguments)
     msg['result'] = 'unimodal' if mct.test(x) else 'multimodal'
     msg.pop('--help')