core: change name to mudpod

prokolyvakis · Oct 3, 2023 · 2a7b14a · 2a7b14a
1 parent 3a7a737
commit 2a7b14a
Show file tree

Hide file tree

Showing 18 changed files with 1,338 additions and 1,690 deletions.
diff --git a/experiments/real/hgf_clustering.py b/experiments/real/hgf_clustering.py
@@ -2,6 +2,7 @@
    pre-trained hugging face models.
 """
 import sys
+import warnings
 
 from loguru import logger
 import numpy as np
@@ -14,13 +15,13 @@
 from experiments.common import plot_clustered_data
 from experiments.real.utils import DataHandler
 from experiments.real.utils import SplitMode
-from hdunim.clustering import DipMeans
-from hdunim.projections import IdentityProjector
-from hdunim.projections import JohnsonLindenstrauss
-from hdunim.observer import PercentileObserver
-from hdunim.observer import RandomObserver
-from hdunim.projections import View
-from hdunim.misc import set_seed
+from mudpod.clustering import DipMeans
+from mudpod.projections import IdentityProjector
+from mudpod.projections import JohnsonLindenstrauss
+from mudpod.observer import PercentileObserver
+from mudpod.observer import RandomObserver
+from mudpod.projections import View
+from mudpod.misc import set_seed
 
 SEED = 120
 
@@ -30,6 +31,7 @@
 logger.remove()
 # add a new handler with level set to INFO
 logger.add(sys.stderr, level="INFO")
+warnings.filterwarnings("ignore")
 
 
 if __name__ == "__main__":
@@ -50,7 +52,6 @@
     x, y = data_handler.get(SplitMode.TEST)
 
     v = View(JohnsonLindenstrauss, PercentileObserver(0.99))
-    # v = View(IdentityProjector, RandomObserver(), 'mahalanobis')
 
     dm = DipMeans(view=v, pval=0.05, sim_num=100, workers_num=10, random_state=SEED)
 

diff --git a/experiments/real/pre_clustering.py b/experiments/real/pre_clustering.py
@@ -2,17 +2,21 @@
    pre-trained embeddings stored in Numpy saved arrays.
 
 Usage:
-  pre_clustering.py <p> <pj> <pv> <sims> [--samples=<s> --seed=<sd>]
+  pre_clustering.py <p> <pj> <pv> <sims> [--samples=<s> --seed=<sd> --dist=<ds> --obs=<o> --plot=<f>]
   pre_clustering.py -h | --help
 
 Options:
   -h --help         Show this screen.
   --samples=<s>     Optional number of samples [default: ].
   --seed=<sd>       The seed [default: 42].
+  --dist=<ds>       The type of distance [default: mahalanobis].
+  --obs=<o>         The type of the observer [default: percentile].
+  --plot=<f>        Whether to produce a plot or not [default: False].
 """
 from pathlib import Path
 import sys
 from typing import Optional
+import warnings
 
 from docopt import docopt
 from loguru import logger
@@ -24,17 +28,19 @@
 from umap import UMAP
 
 from experiments.common import plot_clustered_data
-from hdunim.misc import set_seed
-from hdunim.clustering import DipMeans
-from hdunim.projections import IdentityProjector
-from hdunim.projections import JohnsonLindenstrauss
-from hdunim.observer import PercentileObserver
-from hdunim.projections import View
+from mudpod.misc import set_seed
+from mudpod.clustering import DipMeans
+from mudpod.projections import IdentityProjector
+from mudpod.projections import JohnsonLindenstrauss
+from mudpod.observer import PercentileObserver
+from mudpod.observer import RandomObserver
+from mudpod.projections import View
 
 
 logger.remove()
 # add a new handler with level set to INFO
 logger.add(sys.stderr, level="INFO")
+warnings.filterwarnings("ignore")
 
 
 def get_data(
@@ -49,8 +55,9 @@ def get_data(
             'Either the embeddings or the labels do not confront to the naming'
             ' convention, i.e., the embeddings to be stored in a file named:'
             ' `embeddings.npy` and the labels in a file named: `labels.npy`!'
+            ' Original error: {}', str(e)
         )
-        raise FileNotFoundError(e)
+        raise
 
     if samples is None:
         return x, y
@@ -65,23 +72,39 @@ def get_data(
 if __name__ == "__main__":
     arguments = docopt(__doc__)
 
-    SEED = int(arguments['--seed'])
-    set_seed(SEED)
-
     n_samples = arguments['--samples'] or None
     if n_samples is not None:
         n_samples = int(n_samples)
     x, y = get_data(Path(arguments['<p>']), samples=n_samples)
 
+    SEED = int(arguments['--seed'])
+    set_seed(SEED)
+
     pt = str(arguments['<pj>'])
-    p = JohnsonLindenstrauss if pt == 'jl' else IdentityProjector
-    v = View(p, PercentileObserver(0.99))
+    if pt == 'jl':
+        p = JohnsonLindenstrauss()
+    elif pt == 'i':
+        p = IdentityProjector()
+    else:
+        raise ValueError(f'The projection type: {pt} is not supported!')
 
+
+    dt = str(arguments['--dist'])
+    ot = str(arguments['--obs'])
+    if ot == 'percentile':
+        o = PercentileObserver(0.99, dt)
+    elif ot == 'random':
+        o = RandomObserver()
+    else:
+        raise ValueError(f'The observer type: {ot} is not supported!')
+
+    v = View(p, o, dt)
+
     dm = DipMeans(
         view=v,
         pval=float(arguments['<pv>']),
         sim_num=int(arguments['<sims>']),
-        workers_num=10,
+        workers_num=1,
         random_state=SEED
     )
 
@@ -90,13 +113,18 @@ def get_data(
 
     msg = dict(arguments)
     msg['result'] = f'The NMI score is {nmi}'
+    msg.pop('--help')
+
+    if eval(msg['--plot']):
+        reducer = UMAP(random_state=SEED)
+        reducer.fit(x)
+        embeddings = reducer.transform(x)
+
+        plot_clustered_data(embeddings, y)
+
+    msg.pop('--plot')
+
     logger.info(
         'The inputs and the output of the experiment is: '
         f'{msg}'
     )
-
-    reducer = UMAP(random_state=SEED)
-    reducer.fit(x)
-    embeddings = reducer.transform(x)
-
-    plot_clustered_data(embeddings, y)
diff --git a/experiments/synthetic/clustering.py b/experiments/synthetic/clustering.py
@@ -1,40 +1,76 @@
-"""Clustering experiments with synthetic datasets."""
+"""Clustering experiments with synthetic datasets.
+
+Usage:
+  clustering.py <d> <pj> <pv> <sims> [--samples=<s> --noise=<n> --seed=<sd> --dist=<ds> --obs=<o>]
+  clustering.py -h | --help
+
+Options:
+  -h --help         Show this screen.
+  --samples=<s>     The number of samples [default: 200].
+  --noise=<n>       The standard deviation inside the clusters [default: 0].
+  --seed=<sd>       The seed [default: 42].
+  --dist=<ds>       The type of distance [default: mahalanobis].
+  --obs=<o>         The type of the observer [default: percentile].
+"""
 import sys
+import warnings
 
+from docopt import docopt
 from loguru import logger
 from sklearn.metrics import normalized_mutual_info_score
 
 from experiments.common import plot_clustered_data
 from experiments.synthetic.misc import load
-from hdunim.clustering import DipMeans
-from hdunim.projections import IdentityProjector
-from hdunim.projections import JohnsonLindenstrauss
-from hdunim.observer import PercentileObserver
-from hdunim.observer import RandomObserver
-from hdunim.projections import View
-from hdunim.misc import set_seed
+from mudpod.clustering import DipMeans
+from mudpod.projections import IdentityProjector
+from mudpod.projections import JohnsonLindenstrauss
+from mudpod.observer import PercentileObserver
+from mudpod.observer import RandomObserver
+from mudpod.projections import View
+from mudpod.misc import set_seed
 
-SEED = 128
-
-set_seed(SEED)
 
 logger.remove()
 # add a new handler with level set to INFO
 logger.add(sys.stderr, level="INFO")
+warnings.filterwarnings("ignore")
 
 
 if __name__ == "__main__":
-    fname = 'xclara.arff'
-    x, y = load(fname)
-    # mask = np.isin(y, [5, 8])
-    # x = x[mask]
-    # y = y[mask]
+    arguments = docopt(__doc__)
+
+    x, y = load(str(arguments['<d>']))
+
+    SEED = int(arguments['--seed'])
+    set_seed(SEED)
+
+    pt = str(arguments['<pj>'])
+    if pt == 'jl':
+        p = JohnsonLindenstrauss()
+    elif pt == 'i':
+        p = IdentityProjector()
+    else:
+        raise ValueError(f'The projection type: {pt} is not supported!')
+
+
+    dt = str(arguments['--dist'])
+    ot = str(arguments['--obs'])
+    if ot == 'percentile':
+        o = PercentileObserver(0.99, dt)
+    elif ot == 'random':
+        o = RandomObserver()
+    else:
+        raise ValueError(f'The observer type: {ot} is not supported!')
 
-    v = View(JohnsonLindenstrauss, PercentileObserver(0.99))
-    # v = View(JohnsonLindenstrauss, RandomObserver())
-    # v = View(IdentityProjector, RandomObserver())
+    v = View(p, o, dt)
 
-    dm = DipMeans(view=v, pval=0.001, sim_num=100, workers_num=10, random_state=SEED)
+    dm = DipMeans(
+        view=v,
+        pval=float(arguments['<pv>']),
+        sim_num=int(arguments['<sims>']),
+        workers_num=1,
+        random_state=SEED
+    )
 
     clusters = dm.fit(x).labels_
 

diff --git a/experiments/synthetic/two_gaussians_mix.py b/experiments/synthetic/two_gaussians_mix.py
@@ -21,14 +21,14 @@
 
 from experiments.common import plot_clustered_data
 from experiments.synthetic.misc import TwoDimGaussianSumGenerator
-from hdunim.misc import set_seed
-from hdunim.projections import IdentityProjector
-from hdunim.projections import JohnsonLindenstrauss
-from hdunim.observer import PercentileObserver
-from hdunim.observer import RandomObserver
-from hdunim.projections import View
-from hdunim.unimodality import UnimodalityTest
-from hdunim.unimodality import MonteCarloUnimodalityTest
+from mudpod.misc import set_seed
+from mudpod.projections import IdentityProjector
+from mudpod.projections import JohnsonLindenstrauss
+from mudpod.observer import PercentileObserver
+from mudpod.observer import RandomObserver
+from mudpod.projections import View
+from mudpod.unimodality import UnimodalityTest
+from mudpod.unimodality import MonteCarloUnimodalityTest
 
 
 logger.remove()