From d9b08fbe28e2311d3b6ab0c79578ebd52b9ed19c Mon Sep 17 00:00:00 2001
From: Dayvid Victor
Date: Mon, 23 Jun 2014 18:58:47 -0300
Subject: [PATCH] enn example

---
 examples/enn_example.py      |  39 ++++++++++
 examples/plot_comparision.py | 137 +++++++++++++++++++++++++++++++++++
 examples/protopy             |   1 +
 protopy/selection/enn.py     |   7 +-
 4 files changed, 183 insertions(+), 1 deletion(-)
 create mode 100644 examples/enn_example.py
 create mode 100644 examples/plot_comparision.py
 create mode 120000 examples/protopy

diff --git a/examples/enn_example.py b/examples/enn_example.py
new file mode 100644
index 0000000..f45fdaf
--- /dev/null
+++ b/examples/enn_example.py
@@ -0,0 +1,39 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+from protopy.selection.enn import ENN
+
+mu1 = [4, 5]
+si1 = [[0.75, 0.25], [0.25, 0.75]]
+
+mu2 = [6, 5]
+si2 = [[0.75, -0.25], [-0.25, 0.75]]  # covariance must be positive semi-definite
+
+samples = 200
+
+X1 = np.random.multivariate_normal(
+    np.asarray(mu1), np.asarray(si1), samples)
+X2 = np.random.multivariate_normal(
+    np.asarray(mu2), np.asarray(si2), samples)
+
+X = np.vstack((X1, X2))
+
+y = np.asarray([0] * samples + [1] * samples)
+
+
+plt.plot(X[y==0].T[0], X[y==0].T[1], 'bs', X[y==1].T[0], X[y==1].T[1], 'ro')
+plt.axis([0, 10, 0, 10])
+plt.title('Original Dataset')
+plt.show()
+plt.clf()
+
+editednn = ENN()
+X_, y_ = editednn.reduce_data(X, y)
+
+plt.plot(X_[y_==0].T[0], X_[y_==0].T[1], 'bs', X_[y_==1].T[0], X_[y_==1].T[1], 'ro')
+plt.axis([0, 10, 0, 10])
+plt.title('ENN')
+plt.show()
+plt.clf()
+
+
diff --git a/examples/plot_comparision.py b/examples/plot_comparision.py
new file mode 100644
index 0000000..94e3d09
--- /dev/null
+++ b/examples/plot_comparision.py
@@ -0,0 +1,137 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+======================
+Classifiers Comparison
+======================
+A comparison of several classifiers in scikit-learn on synthetic datasets.
+The point of this example is to illustrate the nature of decision boundaries
+of different classifiers.
+This should be taken with a grain of salt, as the intuition conveyed by
+these examples does not necessarily carry over to real datasets.
+
+In particular, in high dimensional spaces data can more easily be separated
+linearly, and the simplicity of classifiers such as naive Bayes and linear SVMs
+might lead to better generalization.
+
+The plots show training points in solid colors and testing points
+semi-transparent. The lower right shows the classification accuracy on the test
+set.
+""" +print(__doc__) + + +# Code source: Gael Varoqueux +# Andreas Mueller +# Modified for Documentation merge by Jaques Grobler +# License: BSD 3 clause + +import numpy as np +import pylab as pl +from matplotlib.colors import ListedColormap +from sklearn.cross_validation import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.datasets import make_moons, make_circles, make_classification +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.lda import LDA +from sklearn.qda import QDA +from protopy.selection.enn import ENN + +h = .02 # step size in the mesh + +#names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree", +# "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"] +names = ["KNN", "ENN"] + + +classifiers = [ + KNeighborsClassifier(3), + ENN()] +''' + SVC(kernel="linear", C=0.025), + SVC(gamma=2, C=1), + DecisionTreeClassifier(max_depth=5), + RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), + AdaBoostClassifier(), + GaussianNB(), + LDA(), + QDA()] +''' +X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, + random_state=1, n_clusters_per_class=1) +rng = np.random.RandomState(2) +X += 2 * rng.uniform(size=X.shape) +linearly_separable = (X, y) + +datasets = [make_moons(noise=0.3, random_state=0), + make_circles(noise=0.2, factor=0.5, random_state=1), + linearly_separable + ] + +figure = pl.figure(figsize=(27, 9)) +i = 1 +# iterate over datasets +for ds in datasets: + # preprocess dataset, split into training and test part + X, y = ds + X = StandardScaler().fit_transform(X) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) + + x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 + y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) + + # just plot the dataset first + cm = pl.cm.RdBu + cm_bright = ListedColormap(['#FF0000', '#0000FF']) + ax = pl.subplot(len(datasets), len(classifiers) + 1, i) + # Plot the training points + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright) + # and testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6) + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + i += 1 + + # iterate over classifiers + for name, clf in zip(names, classifiers): + ax = pl.subplot(len(datasets), len(classifiers) + 1, i) + clf.fit(X_train, y_train) + score = clf.score(X_test, y_test) + + # Plot the decision boundary. For that, we will assign a color to each + # point in the mesh [x_min, m_max]x[y_min, y_max]. 
+        if hasattr(clf, "decision_function"):
+            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
+        else:
+            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
+
+        # Put the result into a color plot
+        Z = Z.reshape(xx.shape)
+        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
+
+        # Plot also the training points
+        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
+        # and testing points
+        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
+                   alpha=0.6)
+
+        ax.set_xlim(xx.min(), xx.max())
+        ax.set_ylim(yy.min(), yy.max())
+        ax.set_xticks(())
+        ax.set_yticks(())
+        ax.set_title(name)
+        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
+                size=15, horizontalalignment='right')
+        i += 1
+
+figure.subplots_adjust(left=.02, right=.98)
+pl.show()
diff --git a/examples/protopy b/examples/protopy
new file mode 120000
index 0000000..2a100ac
--- /dev/null
+++ b/examples/protopy
@@ -0,0 +1 @@
+../protopy
\ No newline at end of file
diff --git a/protopy/selection/enn.py b/protopy/selection/enn.py
index 825f44a..2e369b2 100644
--- a/protopy/selection/enn.py
+++ b/protopy/selection/enn.py
@@ -41,7 +41,7 @@ class ENN(InstanceReductionMixin):
 
     Examples
     --------
-    >>> from protopy.selection import ENN
+    >>> from protopy.selection.enn import ENN
     >>> import numpy as np
     >>> X = np.array([[-1, 0], [-0.8, 1], [-0.8, -1], [-0.5, 0] , [0.5, 0], [1, 0], [0.8, 1], [0.8, -1]])
     >>> y = np.array([1, 1, 1, 2, 1, 2, 2, 2])
@@ -90,3 +90,8 @@ def reduce_data(self, X, y):
         self.y_ = np.asarray(y[mask])
         self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
         return self.X_, self.y_
+
+
+    def predict_proba(self, X):
+        return self.classifier.predict_proba(X)
+
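Note (usage sketch, not part of the patch): plot_comparision.py shades each ENN panel through the predict_proba fallback, since ENN exposes no decision_function, and the predict_proba added above simply delegates to the selector's internal nearest neighbors classifier. A minimal sketch of the call order the script relies on; the toy X/y are copied from the ENN docstring example, and the query points are made up for illustration:

    import numpy as np
    from protopy.selection.enn import ENN

    # Toy two-class data from the ENN docstring example.
    X = np.array([[-1, 0], [-0.8, 1], [-0.8, -1], [-0.5, 0],
                  [0.5, 0], [1, 0], [0.8, 1], [0.8, -1]])
    y = np.array([1, 1, 1, 2, 1, 2, 2, 2])

    enn = ENN()
    enn.fit(X, y)              # plot_comparision.py calls fit() first
    print(enn.score(X, y))     # ...then score(), which runs predict() internally
    queries = np.array([[0.0, 0.0], [0.9, 0.5]])  # hypothetical query points
    print(enn.predict_proba(queries))  # one row per query, one column per class

Because the new method reuses self.classifier directly, it is worth double-checking that fit()/predict() leave that internal classifier fitted on the edited set, since predict_proba depends on it; the comparison script only calls it after fit() and score().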