merge conflict solved

dvro · Jul 4, 2014 · 2671a06 · 2671a06
2 parents 7f1bd10 + c73e9fb
commit 2671a06
Show file tree

Hide file tree

Showing 5 changed files with 320 additions and 24 deletions.
diff --git a/examples/plot_generation_example.py b/examples/plot_generation_example.py
@@ -0,0 +1,134 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+==============================================
+Prototype Selection and Generation Comparison
+==============================================
+A comparison of several prototype selection and generation algorithms in
+the project on synthetic datasets.
+The point of this example is to illustrate the nature of decision boundaries
+after applying instance reduction techniques.
+This should be taken with a grain of salt, as the intuition conveyed by
+these examples does not necessarily carry over to real datasets.
+
+In particular in high dimensional spaces data can more easily be separated
+linearly and the simplicity of classifiers such as naive Bayes and linear SVMs
+might lead to better generalization.
+
+The plots show training points in solid colors and testing points
+semi-transparent. 
+
+The lower right shows:
+- S: score on the training set.
+- R: reduction ratio.
+
+License: BSD 3 clause
+"""
+
+print(__doc__)
+
+
+import numpy as np
+import pylab as pl
+from matplotlib.colors import ListedColormap
+from sklearn.cross_validation import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.datasets import make_moons, make_circles, make_classification
+from sklearn.neighbors import KNeighborsClassifier
+from protopy.selection.enn import ENN
+from protopy.selection.cnn import CNN
+from protopy.selection.renn import RENN
+from protopy.selection.allknn import AllKNN
+from protopy.selection.tomek_links import TomekLinks
+from protopy.generation.sgp import SGP, SGP2, ASGP
+
+import utils
+
+h = .02  # step size in the mesh
+
+names = ["KNN", "SGP", "SGP2", "ASGP"]
+
+r_min, r_mis = 0.15, 0.15
+
+classifiers = [
+    KNeighborsClassifier(1),
+    SGP(r_min=r_min, r_mis=r_mis),
+    SGP2(r_min=r_min, r_mis=r_mis),
+    ASGP(r_min=r_min, r_mis=r_mis)]
+
+X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
+                           random_state=1, n_clusters_per_class=1)
+
+rng = np.random.RandomState(2)
+X += 2 * rng.uniform(size=X.shape)
+linearly_separable = (X, y)
+
+datasets = [make_moons(noise=0.3, random_state=0),
+            make_circles(noise=0.2, factor=0.5, random_state=1),
+            linearly_separable
+            ]
+
+figure = pl.figure(figsize=(27, 9))
+i = 1
+# iterate over datasets
+for ds in datasets:
+    # preprocess dataset, split into training and test part
+    X, y = ds
+    X = StandardScaler().fit_transform(X)
+    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
+
+    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
+                         np.arange(y_min, y_max, h))
+
+    X, y = utils.generate_imbalance(X, y, positive_label=1, ir=1.5)
+    # just plot the dataset first
+    cm = pl.cm.RdBu
+    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
+    ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
+    # Plot the training points
+    ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm_bright)
+    ax.set_xlim(xx.min(), xx.max())
+    ax.set_ylim(yy.min(), yy.max())
+    ax.set_xticks(())
+    ax.set_yticks(())
+    i += 1
+
+    # iterate over classifiers
+    for name, clf in zip(names, classifiers):
+        ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
+        clf.fit(np.array(X), np.array(y))
+
+        red = clf.reduction_ if hasattr(clf, 'reduction_') else 0.0
+        if hasattr(clf, 'reduction_'):
+            X_prot, y_prot = clf.X_, clf.y_ 
+        else:
+            X_prot, y_prot = X, y
+
+
+        # Plot the decision boundary. For that, we will assign a color to each
+        # point in the mesh [x_min, m_max]x[y_min, y_max].
+        if hasattr(clf, "decision_function"):
+            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
+        else:
+            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
+
+        # Put the result into a color plot
+        Z = Z.reshape(xx.shape)
+        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
+
+        # Plot also the prototypes
+        ax.scatter(X_prot[:, 0], X_prot[:, 1], c=y_prot, cmap=cm_bright)
+
+        ax.set_xlim(xx.min(), xx.max())
+        ax.set_ylim(yy.min(), yy.max())
+        ax.set_xticks(())
+        ax.set_yticks(())
+        ax.set_title(name)
+        ax.text(xx.max() - .3, yy.min() + .3, 'R:' + ('%.2f' % red).lstrip('0'),
+                size=15, horizontalalignment='right')
+        i += 1
+
+figure.subplots_adjust(left=.02, right=.98)
+pl.show()
diff --git a/examples/plot_imbalanced_pg_comparision.py b/examples/plot_imbalanced_pg_comparision.py
@@ -37,6 +37,8 @@
 from protopy.selection.tomek_links import TomekLinks
 from protopy.generation.sgp import SGP, SGP2, ASGP
 
+import utils as utils
+
 h = .02  # step size in the mesh
 
 names = ["KNN", "SGP", "SGP2", "ASGP"]
@@ -46,7 +48,7 @@
     KNeighborsClassifier(3),
     SGP(r_min=0.2, r_mis=0.05),
     SGP2(r_min=0.2, r_mis=0.05),
-    ASGP(r_min=0.2, r_mis=0.05)]
+    ASGP(r_min=0.2, r_mis=0.05, pos_class=1)]
 
 X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                            random_state=1, n_clusters_per_class=1)
@@ -60,34 +62,13 @@
             linearly_separable
             ]
 
-def random_subset(iterator, k):
-    result = iterator[:k]
-    i = k
-    tmp_it = iterator[k:]
-    for item in tmp_it:
-        i = i + 1
-        s = int(np.random.random() * i)
-        if s < k:
-            result[s] = item
-    return result
-
-def generate_imbalance(X, y, positive_label=1, ir=2):
-    mask = y == positive_label
-    seq = np.arange(y.shape[0])[mask]
-    k = float(sum(mask))/ir
-    idx = np.asarray(random_subset(seq, int(k)))
-    mask = ~mask
-    mask[idx] = True
-    return X[mask], y[mask]
-
-
 figure = pl.figure(figsize=(27, 9))
 i = 1
 # iterate over datasets
 for ds in datasets:
     # preprocess dataset, split into training and test part
     X, y = ds
-    X, y = generate_imbalance(X, y)
+    X, y = utils.generate_imbalance(X, y)
 
     X = StandardScaler().fit_transform(X)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
@@ -114,7 +95,7 @@ def generate_imbalance(X, y, positive_label=1, ir=2):
     # iterate over classifiers
     for name, clf in zip(names, classifiers):
         ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
-        clf.fit(X_train, y_train)
+        clf.fit(np.array(X_train), np.array(y_train))
 
         y_pred = clf.predict(X_test)
         fp_rate, tp_rate, thresholds = roc_curve(

diff --git a/examples/plot_selection_examples.py → examples/plot_selection_example.py b/examples/plot_selection_examples.py → examples/plot_selection_example.py
diff --git a/examples/tmp.py b/examples/tmp.py
@@ -0,0 +1,156 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+==============================================
+Prototype Selection and Generation Comparison
+==============================================
+A comparison of several prototype selection and generation algorithms in
+the project on synthetic datasets.
+The point of this example is to illustrate the nature of decision boundaries
+after applying instance reduction techniques.
+This should be taken with a grain of salt, as the intuition conveyed by
+these examples does not necessarily carry over to real datasets.
+
+In particular in high dimensional spaces data can more easily be separated
+linearly and the simplicity of classifiers such as naive Bayes and linear SVMs
+might lead to better generalization.
+
+The plots show training points in solid colors and testing points
+semi-transparent. 
+
+The lower right shows:
+- S: score on the training set.
+- R: reduction ratio.
+
+License: BSD 3 clause
+"""
+
+print(__doc__)
+
+
+import numpy as np
+import pylab as pl
+from matplotlib.colors import ListedColormap
+from sklearn.cross_validation import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.datasets import make_moons, make_circles, make_classification
+from sklearn.neighbors import KNeighborsClassifier
+from protopy.selection.enn import ENN
+from protopy.selection.cnn import CNN
+from protopy.selection.renn import RENN
+from protopy.selection.allknn import AllKNN
+from protopy.selection.tomek_links import TomekLinks
+from protopy.generation.sgp import SGP, SGP2, ASGP
+
+h = .02  # step size in the mesh
+
+figure = pl.figure(figsize=(27,9))
+
+names = ["KNN", "SGP", "SGP2", "ASGP"]
+classifiers = [
+    KNeighborsClassifier(1),
+    SGP(r_min=0.05, r_mis=0.05),
+    SGP2(r_min=0.05, r_mis=0.05),
+    ASGP(r_min=0.05, r_mis=0.05)]
+
+
+def get_datasets():
+    mu1 = [4, 5]
+    si1 = [[0.75, 0.25], [0.25, 0.75]]
+
+    mu2 = [5, 5]
+    si2 = [[0.25, 0.75], [0.75, 0.25]]
+
+    samples = 100
+
+    X1 = np.random.multivariate_normal(
+        np.asarray(mu1), np.asarray(si1), samples)
+    X2 = np.random.multivariate_normal(
+        np.asarray(mu2), np.asarray(si2), samples)
+    X = np.vstack((X1, X2))
+    y = np.asarray([0] * samples + [1] * samples)
+
+    z = zip(X, y)
+    np.random.shuffle(z)
+    X, y = zip(*z)
+    X, y = np.asarray(X), np.asarray(y)
+
+    normal_dists = make_classification(n_features=2, n_redundant=0, n_informative=2,
+                               random_state=1, n_clusters_per_class=1)
+
+    rng = np.random.RandomState(2)
+    X += 2 * rng.uniform(size=X.shape)
+    linearly_separable = (X, y)
+
+    datasets = [make_moons(noise=0.3, random_state=0)]
+    #            make_circles(noise=0.2, factor=0.5, random_state=1),
+    #            linearly_separable,
+    #            ]
+    return datasets
+
+
+
+datasets = get_datasets()
+
+i = 0
+for ds in datasets:
+    X, y = ds
+    X = StandardScaler().fit_transform(X)
+
+    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
+    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
+                         np.arange(y_min, y_max, h))
+
+    cm = pl.cm.RdBu
+    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
+    ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
+    # Plot the training points
+    ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm_bright)
+    ax.set_xlim(xx.min(), xx.max())
+    ax.set_ylim(yy.min(), yy.max())
+    ax.set_xticks(())
+    ax.set_yticks(())
+    i = i + 1
+
+
+    for name, clf in zip(names, classifiers):
+        ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
+        clf.fit(X, y)
+        red = clf.reduction_ if hasattr(clf, 'reduction_') else 0.0
+
+        X_prot, y_prot = X, y
+        if hasattr(clf, 'reduction_'):
+            X_prot, y_prot = clf.X_, clf.y_
+
+        # Plot the decision boundary. For that, we will assign a color to each
+        # point in the mesh [x_min, m_max]x[y_min, y_max].
+        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
+
+        # Put the result into a color plot
+        Z = Z.reshape(xx.shape)
+        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
+
+        # Plot points
+        ax.scatter(X_prot[:, 0], X_prot[:, 1], c=y_prot, cmap=cm_bright)
+        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
+        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
+        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
+                             np.arange(y_min, y_max, h))
+
+        ax.set_xlim(xx.min(), xx.max())
+        ax.set_ylim(yy.min(), yy.max())
+        ax.set_xticks(())
+        ax.set_yticks(())
+        ax.set_title(name)
+        ax.text(xx.max() - .3, yy.min() + .3, 'RED:' + ('%.2f' % red).lstrip('0'),
+                size=15, horizontalalignment='right')
+        i += 1
+
+figure.subplots_adjust(left=.02, right=.98)
+pl.show()
+
+
+
+
diff --git a/examples/utils.py b/examples/utils.py
@@ -0,0 +1,25 @@
+import numpy as np
+
+
+def random_subset(iterator, k):
+    result = iterator[:k]
+    i = k
+    tmp_it = iterator[k:]
+    for item in tmp_it:
+        i = i + 1
+        s = int(np.random.random() * i)
+        if s < k:
+            result[s] = item
+    return result
+
+def generate_imbalance(X, y, positive_label=1, ir=2):
+    mask = y == positive_label
+    seq = np.arange(y.shape[0])[mask]
+    k = float(sum(mask))/ir
+    idx = np.asarray(random_subset(seq, int(k)))
+    mask = ~mask
+    mask[idx] = True
+    return X[mask], y[mask]
+
+
+