Skip to content

Commit

Permalink
core: change name to mudpod
Browse files Browse the repository at this point in the history
  • Loading branch information
prokolyvakis committed Oct 3, 2023
1 parent 3a7a737 commit 2a7b14a
Show file tree
Hide file tree
Showing 18 changed files with 1,338 additions and 1,690 deletions.
17 changes: 9 additions & 8 deletions experiments/real/hgf_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
pre-trained hugging face models.
"""
import sys
import warnings

from loguru import logger
import numpy as np
Expand All @@ -14,13 +15,13 @@
from experiments.common import plot_clustered_data
from experiments.real.utils import DataHandler
from experiments.real.utils import SplitMode
from hdunim.clustering import DipMeans
from hdunim.projections import IdentityProjector
from hdunim.projections import JohnsonLindenstrauss
from hdunim.observer import PercentileObserver
from hdunim.observer import RandomObserver
from hdunim.projections import View
from hdunim.misc import set_seed
from mudpod.clustering import DipMeans
from mudpod.projections import IdentityProjector
from mudpod.projections import JohnsonLindenstrauss
from mudpod.observer import PercentileObserver
from mudpod.observer import RandomObserver
from mudpod.projections import View
from mudpod.misc import set_seed

SEED = 120

Expand All @@ -30,6 +31,7 @@
logger.remove()
# add a new handler with level set to INFO
logger.add(sys.stderr, level="INFO")
warnings.filterwarnings("ignore")


if __name__ == "__main__":
Expand All @@ -50,7 +52,6 @@
x, y = data_handler.get(SplitMode.TEST)

v = View(JohnsonLindenstrauss, PercentileObserver(0.99))
# v = View(IdentityProjector, RandomObserver(), 'mahalanobis')

dm = DipMeans(view=v, pval=0.05, sim_num=100, workers_num=10, random_state=SEED)

Expand Down
68 changes: 48 additions & 20 deletions experiments/real/pre_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,21 @@
pre-trained embeddings stored in Numpy saved arrays.
Usage:
pre_clustering.py <p> <pj> <pv> <sims> [--samples=<s> --seed=<sd>]
pre_clustering.py <p> <pj> <pv> <sims> [--samples=<s> --seed=<sd> --dist=<ds> --obs=<o> --plot=<f>]
pre_clustering.py -h | --help
Options:
-h --help Show this screen.
--samples=<s> Optional number of samples [default: ].
--seed=<sd> The seed [default: 42].
--dist=<ds> The type of distance [default: mahalanobis].
--obs=<o> The type of the observer [default: percentile].
--plot=<f> Whether to produce a plot or not [default: False].
"""
from pathlib import Path
import sys
from typing import Optional
import warnings

from docopt import docopt
from loguru import logger
Expand All @@ -24,17 +28,19 @@
from umap import UMAP

from experiments.common import plot_clustered_data
from hdunim.misc import set_seed
from hdunim.clustering import DipMeans
from hdunim.projections import IdentityProjector
from hdunim.projections import JohnsonLindenstrauss
from hdunim.observer import PercentileObserver
from hdunim.projections import View
from mudpod.misc import set_seed
from mudpod.clustering import DipMeans
from mudpod.projections import IdentityProjector
from mudpod.projections import JohnsonLindenstrauss
from mudpod.observer import PercentileObserver
from mudpod.observer import RandomObserver
from mudpod.projections import View


logger.remove()
# add a new handler with level set to INFO
logger.add(sys.stderr, level="INFO")
warnings.filterwarnings("ignore")


def get_data(
Expand All @@ -49,8 +55,9 @@ def get_data(
'Either the embeddings or the labels do not confront to the naming'
' convention, i.e., the embeddings to be stored in a file named:'
' `embeddings.npy` and the labels in a file named: `labels.npy`!'
' Original error: %s', str(e)
)
raise FileNotFoundError(e)
raise

if samples is None:
return x, y
Expand All @@ -65,23 +72,39 @@ def get_data(
if __name__ == "__main__":
arguments = docopt(__doc__)

SEED = int(arguments['--seed'])
set_seed(SEED)

n_samples = arguments['--samples'] or None
if n_samples is not None:
n_samples = int(n_samples)
x, y = get_data(Path(arguments['<p>']), samples=n_samples)

SEED = int(arguments['--seed'])
set_seed(SEED)

pt = str(arguments['<pj>'])
p = JohnsonLindenstrauss if pt == 'jl' else IdentityProjector
v = View(p, PercentileObserver(0.99))
if pt == 'jl':
p = JohnsonLindenstrauss()
elif pt == 'i':
p = IdentityProjector()
else:
raise ValueError(f'The projection type: {pt} is not supported!')


dt = str(arguments['--dist'])
ot = str(arguments['--obs'])
if ot == 'percentile':
o = PercentileObserver(0.99, dt)
elif ot == 'random':
o = RandomObserver()
else:
raise ValueError(f'The observer type: {ot} is not supported!')

v = View(p, o, dt)

dm = DipMeans(
view=v,
pval=float(arguments['<pv>']),
sim_num=int(arguments['<sims>']),
workers_num=10,
workers_num=1,
random_state=SEED
)

Expand All @@ -90,13 +113,18 @@ def get_data(

msg = dict(arguments)
msg['result'] = f'The NMI score is {nmi}'
msg.pop('--help')

if eval(msg['--plot']):
reducer = UMAP(random_state=SEED)
reducer.fit(x)
embeddings = reducer.transform(x)

plot_clustered_data(embeddings, y)

msg.pop('--plot')

logger.info(
'The inputs and the output of the experiment is: '
f'{msg}'
)

reducer = UMAP(random_state=SEED)
reducer.fit(x)
embeddings = reducer.transform(x)

plot_clustered_data(embeddings, y)
76 changes: 56 additions & 20 deletions experiments/synthetic/clustering.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,76 @@
"""Clustering experiments with synthetic datasets."""
"""Clustering experiments with synthetic datasets.
Usage:
clustering.py <d> <pj> <pv> <sims> [--samples=<s> --noise=<n> --seed=<sd> --dist=<ds> --obs=<o>]
clustering.py -h | --help
Options:
-h --help Show this screen.
--samples=<s> The number of samples [default: 200].
--noise=<n> The standard deviation inside the clusters [default: 0].
--seed=<sd> The seed [default: 42].
--dist=<ds> The type of distance [default: mahalanobis].
--obs=<o> The type of the observer [default: percentile].
"""
import sys
import warnings

from docopt import docopt
from loguru import logger
from sklearn.metrics import normalized_mutual_info_score

from experiments.common import plot_clustered_data
from experiments.synthetic.misc import load
from hdunim.clustering import DipMeans
from hdunim.projections import IdentityProjector
from hdunim.projections import JohnsonLindenstrauss
from hdunim.observer import PercentileObserver
from hdunim.observer import RandomObserver
from hdunim.projections import View
from hdunim.misc import set_seed
from mudpod.clustering import DipMeans
from mudpod.projections import IdentityProjector
from mudpod.projections import JohnsonLindenstrauss
from mudpod.observer import PercentileObserver
from mudpod.observer import RandomObserver
from mudpod.projections import View
from mudpod.misc import set_seed

SEED = 128

set_seed(SEED)

logger.remove()
# add a new handler with level set to INFO
logger.add(sys.stderr, level="INFO")
warnings.filterwarnings("ignore")


if __name__ == "__main__":
fname = 'xclara.arff'
x, y = load(fname)
# mask = np.isin(y, [5, 8])
# x = x[mask]
# y = y[mask]
arguments = docopt(__doc__)

x, y = load(str(arguments['<d>']))

SEED = int(arguments['--seed'])
set_seed(SEED)

pt = str(arguments['<pj>'])
if pt == 'jl':
p = JohnsonLindenstrauss()
elif pt == 'i':
p = IdentityProjector()
else:
raise ValueError(f'The projection type: {pt} is not supported!')


dt = str(arguments['--dist'])
ot = str(arguments['--obs'])
if ot == 'percentile':
o = PercentileObserver(0.99, dt)
elif ot == 'random':
o = RandomObserver()
else:
raise ValueError(f'The observer type: {ot} is not supported!')

v = View(JohnsonLindenstrauss, PercentileObserver(0.99))
# v = View(JohnsonLindenstrauss, RandomObserver())
# v = View(IdentityProjector, RandomObserver())
v = View(p, o, dt)

dm = DipMeans(view=v, pval=0.001, sim_num=100, workers_num=10, random_state=SEED)
dm = DipMeans(
view=v,
pval=float(arguments['<pv>']),
sim_num=int(arguments['<sims>']),
workers_num=1,
random_state=SEED
)

clusters = dm.fit(x).labels_

Expand Down
16 changes: 8 additions & 8 deletions experiments/synthetic/two_gaussians_mix.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@

from experiments.common import plot_clustered_data
from experiments.synthetic.misc import TwoDimGaussianSumGenerator
from hdunim.misc import set_seed
from hdunim.projections import IdentityProjector
from hdunim.projections import JohnsonLindenstrauss
from hdunim.observer import PercentileObserver
from hdunim.observer import RandomObserver
from hdunim.projections import View
from hdunim.unimodality import UnimodalityTest
from hdunim.unimodality import MonteCarloUnimodalityTest
from mudpod.misc import set_seed
from mudpod.projections import IdentityProjector
from mudpod.projections import JohnsonLindenstrauss
from mudpod.observer import PercentileObserver
from mudpod.observer import RandomObserver
from mudpod.projections import View
from mudpod.unimodality import UnimodalityTest
from mudpod.unimodality import MonteCarloUnimodalityTest


logger.remove()
Expand Down
Loading

0 comments on commit 2a7b14a

Please sign in to comment.