From c73e9fbae1f8d63e57da9c202459420769a40310 Mon Sep 17 00:00:00 2001
From: Dayvid Victor
Date: Fri, 4 Jul 2014 11:57:36 -0300
Subject: [PATCH] new examples added

---
 examples/plot_generation_example.py           | 134 +++++++++++++++
 examples/plot_imbalanced_pg_comparision.py    |  29 +---
 ..._examples.py => plot_selection_example.py} |   0
 examples/tmp.py                               | 156 ++++++++++++++++++
 examples/utils.py                             |  25 +++
 5 files changed, 320 insertions(+), 24 deletions(-)
 create mode 100644 examples/plot_generation_example.py
 rename examples/{plot_examples.py => plot_selection_example.py} (100%)
 create mode 100644 examples/tmp.py
 create mode 100644 examples/utils.py

diff --git a/examples/plot_generation_example.py b/examples/plot_generation_example.py
new file mode 100644
index 0000000..8f5055b
--- /dev/null
+++ b/examples/plot_generation_example.py
@@ -0,0 +1,134 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+=============================================
+Prototype Selection and Generation Comparison
+=============================================
+A comparison of several prototype selection and generation algorithms from
+this project on synthetic datasets.
+The point of this example is to illustrate the nature of decision boundaries
+after applying instance reduction techniques.
+This should be taken with a grain of salt, as the intuition conveyed by
+these examples does not necessarily carry over to real datasets.
+
+In particular, in high-dimensional spaces data can more easily be separated
+linearly, and the simplicity of classifiers such as naive Bayes and linear
+SVMs might lead to better generalization.
+
+The plots show the training points in solid colors; each classifier's panel
+also shows the prototypes it retained or generated.
+
+The lower right of each panel shows:
+- R: the reduction ratio, i.e. the fraction of the training set
+  discarded by the instance reduction method.
+
+License: BSD 3 clause
+"""
+
+print(__doc__)
+
+
+import numpy as np
+import pylab as pl
+from matplotlib.colors import ListedColormap
+from sklearn.cross_validation import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.datasets import make_moons, make_circles, make_classification
+from sklearn.neighbors import KNeighborsClassifier
+from protopy.selection.enn import ENN
+from protopy.selection.cnn import CNN
+from protopy.selection.renn import RENN
+from protopy.selection.allknn import AllKNN
+from protopy.selection.tomek_links import TomekLinks
+from protopy.generation.sgp import SGP, SGP2, ASGP
+
+import utils
+
+h = .02  # step size in the mesh
+
+names = ["KNN", "SGP", "SGP2", "ASGP"]
+
+r_min, r_mis = 0.15, 0.15
+
+classifiers = [
+    KNeighborsClassifier(1),
+    SGP(r_min=r_min, r_mis=r_mis),
+    SGP2(r_min=r_min, r_mis=r_mis),
+    ASGP(r_min=r_min, r_mis=r_mis)]
+
+X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
+                           random_state=1, n_clusters_per_class=1)
+
+rng = np.random.RandomState(2)
+X += 2 * rng.uniform(size=X.shape)
+linearly_separable = (X, y)
+
+datasets = [make_moons(noise=0.3, random_state=0),
+            make_circles(noise=0.2, factor=0.5, random_state=1),
+            linearly_separable
+            ]
+
+figure = pl.figure(figsize=(27, 9))
+i = 1
+# iterate over datasets
+for ds in datasets:
+    # preprocess dataset and induce class imbalance
+    X, y = ds
+    X = StandardScaler().fit_transform(X)
+    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
+
+    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
+                         np.arange(y_min, y_max, h))
+
+    X, y = utils.generate_imbalance(X, y, positive_label=1, ir=1.5)
+    # just plot the dataset first
+    cm = pl.cm.RdBu
+    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
+    ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
+    # Plot the training points
+    ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm_bright)
+    ax.set_xlim(xx.min(), xx.max())
+    ax.set_ylim(yy.min(), yy.max())
+    ax.set_xticks(())
+    ax.set_yticks(())
+    i += 1
+
+    # iterate over classifiers
+    for name, clf in zip(names, classifiers):
+        ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
+        clf.fit(np.array(X), np.array(y))
+
+        red = clf.reduction_ if hasattr(clf, 'reduction_') else 0.0
+        if hasattr(clf, 'reduction_'):
+            X_prot, y_prot = clf.X_, clf.y_
+        else:
+            X_prot, y_prot = X, y
+
+
+        # Plot the decision boundary. For that, we will assign a color to each
+        # point in the mesh [x_min, x_max]x[y_min, y_max].
+        if hasattr(clf, "decision_function"):
+            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
+        else:
+            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
+
+        # Put the result into a color plot
+        Z = Z.reshape(xx.shape)
+        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
+
+        # Plot also the prototypes
+        ax.scatter(X_prot[:, 0], X_prot[:, 1], c=y_prot, cmap=cm_bright)
+
+        ax.set_xlim(xx.min(), xx.max())
+        ax.set_ylim(yy.min(), yy.max())
+        ax.set_xticks(())
+        ax.set_yticks(())
+        ax.set_title(name)
+        ax.text(xx.max() - .3, yy.min() + .3, 'R:' + ('%.2f' % red).lstrip('0'),
+                size=15, horizontalalignment='right')
+        i += 1
+
+figure.subplots_adjust(left=.02, right=.98)
+pl.show()
diff --git a/examples/plot_imbalanced_pg_comparision.py b/examples/plot_imbalanced_pg_comparision.py
index bbc2efb..64ff540 100644
--- a/examples/plot_imbalanced_pg_comparision.py
+++ b/examples/plot_imbalanced_pg_comparision.py
@@ -37,6 +37,8 @@
 from protopy.selection.tomek_links import TomekLinks
 from protopy.generation.sgp import SGP, SGP2, ASGP
 
+import utils
+
 h = .02  # step size in the mesh
 
 names = ["KNN", "SGP", "SGP2", "ASGP"]
@@ -46,7 +48,7 @@
     KNeighborsClassifier(3),
     SGP(r_min=0.2, r_mis=0.05),
     SGP2(r_min=0.2, r_mis=0.05),
-    ASGP(r_min=0.2, r_mis=0.05)]
+    ASGP(r_min=0.2, r_mis=0.05, pos_class=1)]
 
 X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                            random_state=1, n_clusters_per_class=1)
@@ -60,34 +62,13 @@
             linearly_separable
             ]
 
-def random_subset(iterator, k):
-    result = iterator[:k]
-    i = k
-    tmp_it = iterator[k:]
-    for item in tmp_it:
-        i = i + 1
-        s = int(np.random.random() * i)
-        if s < k:
-            result[s] = item
-    return result
-
-def generate_imbalance(X, y, positive_label=1, ir=2):
-    mask = y == positive_label
-    seq = np.arange(y.shape[0])[mask]
-    k = float(sum(mask))/ir
-    idx = np.asarray(random_subset(seq, int(k)))
-    mask = ~mask
-    mask[idx] = True
-    return X[mask], y[mask]
-
-
 figure = pl.figure(figsize=(27, 9))
 i = 1
 # iterate over datasets
 for ds in datasets:
     # preprocess dataset, split into training and test part
     X, y = ds
-    X, y = generate_imbalance(X, y)
+    X, y = utils.generate_imbalance(X, y)
     X = StandardScaler().fit_transform(X)
 
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
@@ -114,7 +95,7 @@ def generate_imbalance(X, y, positive_label=1, ir=2):
     # iterate over classifiers
     for name, clf in zip(names, classifiers):
         ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
-        clf.fit(X_train, y_train)
+        clf.fit(np.array(X_train), np.array(y_train))
 
         y_pred = clf.predict(X_test)
         fp_rate, tp_rate, thresholds = roc_curve(
diff --git a/examples/plot_examples.py b/examples/plot_selection_example.py
similarity index 100%
rename from examples/plot_examples.py
rename to examples/plot_selection_example.py
diff --git a/examples/tmp.py b/examples/tmp.py
new file mode 100644
index 0000000..b1691e1
--- /dev/null
+++ b/examples/tmp.py
@@ -0,0 +1,156 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+=============================================
+Prototype Selection and Generation Comparison
+=============================================
+A comparison of several prototype selection and generation algorithms from
+this project on synthetic datasets.
+The point of this example is to illustrate the nature of decision boundaries
+after applying instance reduction techniques.
+This should be taken with a grain of salt, as the intuition conveyed by
+these examples does not necessarily carry over to real datasets.
+
+In particular, in high-dimensional spaces data can more easily be separated
+linearly, and the simplicity of classifiers such as naive Bayes and linear
+SVMs might lead to better generalization.
+
+The plots show the training points in solid colors; each classifier's panel
+also shows the prototypes it retained or generated.
+
+The lower right of each panel shows:
+- RED: the reduction ratio, i.e. the fraction of the training set
+  discarded by the instance reduction method.
+
+License: BSD 3 clause
+"""
+
+print(__doc__)
+
+
+import numpy as np
+import pylab as pl
+from matplotlib.colors import ListedColormap
+from sklearn.cross_validation import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.datasets import make_moons, make_circles, make_classification
+from sklearn.neighbors import KNeighborsClassifier
+from protopy.selection.enn import ENN
+from protopy.selection.cnn import CNN
+from protopy.selection.renn import RENN
+from protopy.selection.allknn import AllKNN
+from protopy.selection.tomek_links import TomekLinks
+from protopy.generation.sgp import SGP, SGP2, ASGP
+
+h = .02  # step size in the mesh
+
+figure = pl.figure(figsize=(27, 9))
+
+names = ["KNN", "SGP", "SGP2", "ASGP"]
+classifiers = [
+    KNeighborsClassifier(1),
+    SGP(r_min=0.05, r_mis=0.05),
+    SGP2(r_min=0.05, r_mis=0.05),
+    ASGP(r_min=0.05, r_mis=0.05)]
+
+
+def get_datasets():
+    mu1 = [4, 5]
+    si1 = [[0.75, 0.25], [0.25, 0.75]]
+
+    mu2 = [5, 5]
+    si2 = [[0.25, 0.75], [0.75, 0.25]]
+
+    samples = 100
+
+    X1 = np.random.multivariate_normal(
+        np.asarray(mu1), np.asarray(si1), samples)
+    X2 = np.random.multivariate_normal(
+        np.asarray(mu2), np.asarray(si2), samples)
+    X = np.vstack((X1, X2))
+    y = np.asarray([0] * samples + [1] * samples)
+
+    z = list(zip(X, y))
+    np.random.shuffle(z)
+    X, y = zip(*z)
+    X, y = np.asarray(X), np.asarray(y)
+
+    normal_dists = make_classification(n_features=2, n_redundant=0, n_informative=2,
+                                       random_state=1, n_clusters_per_class=1)  # note: currently unused
+
+    rng = np.random.RandomState(2)
+    X += 2 * rng.uniform(size=X.shape)
+    linearly_separable = (X, y)
+
+    datasets = [make_moons(noise=0.3, random_state=0)]
+    #           make_circles(noise=0.2, factor=0.5, random_state=1),
+    #           linearly_separable,
+    #           ]
+    return datasets
+
+
+
+datasets = get_datasets()
+
+i = 1
+for ds in datasets:
+    X, y = ds
+    X = StandardScaler().fit_transform(X)
+
+    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
+    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
+                         np.arange(y_min, y_max, h))
+
+    cm = pl.cm.RdBu
+    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
+    ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
+    # Plot the training points
+    ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm_bright)
+    ax.set_xlim(xx.min(), xx.max())
+    ax.set_ylim(yy.min(), yy.max())
+    ax.set_xticks(())
+    ax.set_yticks(())
+    i = i + 1
+
+
+    for name, clf in zip(names, classifiers):
+        ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
+        clf.fit(X, y)
+        red = clf.reduction_ if hasattr(clf, 'reduction_') else 0.0
+
+        X_prot, y_prot = X, y
+        if hasattr(clf, 'reduction_'):
+            X_prot, y_prot = clf.X_, clf.y_
+
+        # Plot the decision boundary. For that, we will assign a color to each
+        # point in the mesh [x_min, x_max]x[y_min, y_max].
+        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
+
+        # Put the result into a color plot
+        Z = Z.reshape(xx.shape)
+        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
+
+        # Plot the prototypes
+        ax.scatter(X_prot[:, 0], X_prot[:, 1], c=y_prot, cmap=cm_bright)
+        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
+        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
+        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
+                             np.arange(y_min, y_max, h))
+
+        ax.set_xlim(xx.min(), xx.max())
+        ax.set_ylim(yy.min(), yy.max())
+        ax.set_xticks(())
+        ax.set_yticks(())
+        ax.set_title(name)
+        ax.text(xx.max() - .3, yy.min() + .3, 'RED:' + ('%.2f' % red).lstrip('0'),
+                size=15, horizontalalignment='right')
+        i += 1
+
+figure.subplots_adjust(left=.02, right=.98)
+pl.show()
+
+
+
+
diff --git a/examples/utils.py b/examples/utils.py
new file mode 100644
index 0000000..bb01c73
--- /dev/null
+++ b/examples/utils.py
@@ -0,0 +1,25 @@
+import numpy as np
+
+
+def random_subset(iterator, k):
+    result = iterator[:k]  # start with the first k items as the reservoir
+    i = k
+    tmp_it = iterator[k:]
+    for item in tmp_it:  # reservoir sampling over the remaining items
+        i = i + 1
+        s = int(np.random.random() * i)
+        if s < k:  # replace a reservoir slot with probability k/i
+            result[s] = item
+    return result
+
+def generate_imbalance(X, y, positive_label=1, ir=2):
+    mask = y == positive_label  # boolean mask of the positive class
+    seq = np.arange(y.shape[0])[mask]  # indices of the positive samples
+    k = float(sum(mask)) / ir  # number of positives to keep
+    idx = np.asarray(random_subset(seq, int(k)))
+    mask = ~mask  # keep every negative sample...
+    mask[idx] = True  # ...plus the k sampled positives
+    return X[mask], y[mask]
+
+
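
Reviewer note (not part of the patch): a quick sanity check of the new
examples/utils.py helpers. This is a minimal sketch, assuming it is run from
the examples/ directory so that `utils` is importable; the file name
check_imbalance.py is hypothetical. By the definition above,
generate_imbalance keeps every negative sample and reservoir-samples
int(n_pos / ir) positives, so the expected counts follow directly.

    # check_imbalance.py -- hypothetical snippet, not included in the patch
    import numpy as np

    import utils  # examples/utils.py, added by this patch

    rng = np.random.RandomState(0)
    X = rng.randn(200, 2)                # 200 points, 2 features
    y = np.array([0] * 100 + [1] * 100)  # balanced labels: 100 per class

    # Ask for a 2:1 negative-to-positive imbalance ratio.
    X_imb, y_imb = utils.generate_imbalance(X, y, positive_label=1, ir=2)

    n_neg = int(np.sum(y_imb == 0))      # all 100 negatives are kept
    n_pos = int(np.sum(y_imb == 1))      # int(100 / 2) = 50 positives remain
    print(n_neg, n_pos)                  # -> 100 50
    print(float(n_neg) / n_pos)          # -> 2.0, the requested ratio

The subset of retained positives is random (random_subset draws from the
global numpy RNG), but the counts are deterministic, which is what the
examples rely on.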