v-measure #19

Open
wants to merge 17 commits into base: master
Changes from 3 commits
285 changes: 285 additions & 0 deletions tour_model_eval/plot_document_clustering.ipynb
@@ -0,0 +1,285 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"# Clustering text documents using k-means\n",
"\n",
"This is an example showing how the scikit-learn can be used to cluster\n",
"documents by topics using a bag-of-words approach. This example uses\n",
"a scipy.sparse matrix to store the features instead of standard numpy arrays.\n",
"\n",
"Two feature extraction methods can be used in this example:\n",
"\n",
" - TfidfVectorizer uses a in-memory vocabulary (a python dict) to map the most\n",
" frequent words to features indices and hence compute a word occurrence\n",
" frequency (sparse) matrix. The word frequencies are then reweighted using\n",
" the Inverse Document Frequency (IDF) vector collected feature-wise over\n",
" the corpus.\n",
"\n",
" - HashingVectorizer hashes word occurrences to a fixed dimensional space,\n",
" possibly with collisions. The word count vectors are then normalized to\n",
" each have l2-norm equal to one (projected to the euclidean unit-ball) which\n",
" seems to be important for k-means to work in high dimensional space.\n",
"\n",
" HashingVectorizer does not provide IDF weighting as this is a stateless\n",
" model (the fit method does nothing). When IDF weighting is needed it can\n",
" be added by pipelining its output to a TfidfTransformer instance.\n",
"\n",
"Two algorithms are demoed: ordinary k-means and its more scalable cousin\n",
"minibatch k-means.\n",
"\n",
"Additionally, latent semantic analysis can also be used to reduce\n",
"dimensionality and discover latent patterns in the data.\n",
"\n",
"It can be noted that k-means (and minibatch k-means) are very sensitive to\n",
"feature scaling and that in this case the IDF weighting helps improve the\n",
"quality of the clustering by quite a lot as measured against the \"ground truth\"\n",
"provided by the class label assignments of the 20 newsgroups dataset.\n",
"\n",
"This improvement is not visible in the Silhouette Coefficient which is small\n",
"for both as this measure seem to suffer from the phenomenon called\n",
"\"Concentration of Measure\" or \"Curse of Dimensionality\" for high dimensional\n",
"datasets such as text data. Other measures such as V-measure and Adjusted Rand\n",
"Index are information theoretic based evaluation scores: as they are only based\n",
"on cluster assignments rather than distances, hence not affected by the curse\n",
"of dimensionality.\n",
"\n",
"Note: as k-means is optimizing a non-convex objective function, it will likely\n",
"end up in a local optimum. Several runs with independent random init might be\n",
"necessary to get a good convergence.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Author: Peter Prettenhofer <[email protected]>\n",
"# Lars Buitinck\n",
"# License: BSD 3 clause\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.decomposition import TruncatedSVD\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.feature_extraction.text import HashingVectorizer\n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import Normalizer\n",
"from sklearn import metrics\n",
"\n",
"from sklearn.cluster import KMeans, MiniBatchKMeans\n",
"\n",
"import logging\n",
"from optparse import OptionParser\n",
"import sys\n",
"from time import time\n",
"\n",
"import numpy as np\n",
"\n",
"\n",
"# Display progress logs on stdout\n",
"logging.basicConfig(level=logging.INFO,\n",
" format='%(asctime)s %(levelname)s %(message)s')\n",
"\n",
"# parse commandline arguments\n",
"op = OptionParser()\n",
"op.add_option(\"--lsa\",\n",
" dest=\"n_components\", type=\"int\",\n",
" help=\"Preprocess documents with latent semantic analysis.\")\n",
"op.add_option(\"--no-minibatch\",\n",
" action=\"store_false\", dest=\"minibatch\", default=True,\n",
" help=\"Use ordinary k-means algorithm (in batch mode).\")\n",
"op.add_option(\"--no-idf\",\n",
" action=\"store_false\", dest=\"use_idf\", default=True,\n",
" help=\"Disable Inverse Document Frequency feature weighting.\")\n",
"op.add_option(\"--use-hashing\",\n",
" action=\"store_true\", default=False,\n",
" help=\"Use a hashing feature vectorizer\")\n",
"op.add_option(\"--n-features\", type=int, default=10000,\n",
" help=\"Maximum number of features (dimensions)\"\n",
" \" to extract from text.\")\n",
"op.add_option(\"--verbose\",\n",
" action=\"store_true\", dest=\"verbose\", default=False,\n",
" help=\"Print progress reports inside k-means algorithm.\")\n",
"\n",
"print(__doc__)\n",
"op.print_help()\n",
"\n",
"\n",
"def is_interactive():\n",
" return not hasattr(sys.modules['__main__'], '__file__')\n",
"\n",
"\n",
"# work-around for Jupyter notebook and IPython console\n",
"argv = [] if is_interactive() else sys.argv[1:]\n",
"(opts, args) = op.parse_args(argv)\n",
"if len(args) > 0:\n",
" op.error(\"this script takes no arguments.\")\n",
" sys.exit(1)\n",
"\n",
"\n",
"# #############################################################################\n",
"# Load some categories from the training set\n",
"categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
"]\n",
"# Uncomment the following to do the analysis on all the categories\n",
"# categories = None\n",
"\n",
"print(\"Loading 20 newsgroups dataset for categories:\")\n",
"print(categories)\n",
"\n",
"dataset = fetch_20newsgroups(subset='all', categories=categories,\n",
" shuffle=True, random_state=42)\n",
"\n",
"print(\"%d documents\" % len(dataset.data))\n",
"print(\"%d categories\" % len(dataset.target_names))\n",
"print()\n",
"\n",
"labels = dataset.target\n",
"true_k = np.unique(labels).shape[0]\n",
"\n",
"print(\"Extracting features from the training dataset \"\n",
" \"using a sparse vectorizer\")\n",
"t0 = time()\n",
"if opts.use_hashing:\n",
" if opts.use_idf:\n",
" # Perform an IDF normalization on the output of HashingVectorizer\n",
" hasher = HashingVectorizer(n_features=opts.n_features,\n",
" stop_words='english', alternate_sign=False,\n",
" norm=None)\n",
" vectorizer = make_pipeline(hasher, TfidfTransformer())\n",
" else:\n",
" vectorizer = HashingVectorizer(n_features=opts.n_features,\n",
" stop_words='english',\n",
" alternate_sign=False, norm='l2')\n",
"else:\n",
" vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,\n",
" min_df=2, stop_words='english',\n",
" use_idf=opts.use_idf)\n",
"X = vectorizer.fit_transform(dataset.data)\n",
"\n",
"print(\"done in %fs\" % (time() - t0))\n",
"print(\"n_samples: %d, n_features: %d\" % X.shape)\n",
"print()\n",
"\n",
"if opts.n_components:\n",
" print(\"Performing dimensionality reduction using LSA\")\n",
" t0 = time()\n",
" # Vectorizer results are normalized, which makes KMeans behave as\n",
" # spherical k-means for better results. Since LSA/SVD results are\n",
" # not normalized, we have to redo the normalization.\n",
" svd = TruncatedSVD(opts.n_components)\n",
" normalizer = Normalizer(copy=False)\n",
" lsa = make_pipeline(svd, normalizer)\n",
"\n",
" X = lsa.fit_transform(X)\n",
"\n",
" print(\"done in %fs\" % (time() - t0))\n",
"\n",
" explained_variance = svd.explained_variance_ratio_.sum()\n",
" print(\"Explained variance of the SVD step: {}%\".format(\n",
" int(explained_variance * 100)))\n",
"\n",
" print()\n",
"\n",
"\n",
"# #############################################################################\n",
"# Do the actual clustering\n",
"\n",
"if opts.minibatch:\n",
" km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,\n",
" init_size=1000, batch_size=1000, verbose=opts.verbose)\n",
"else:\n",
" km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,\n",
" verbose=opts.verbose)\n",
"\n",
"print(\"Clustering sparse data with %s\" % km)\n",
"t0 = time()\n",
"km.fit(X)\n",
"print(\"done in %0.3fs\" % (time() - t0))\n",
"print()\n",
"\n",
"# Test what labels are\n",
"np.set_printoptions(threshold=np.inf)\n",
"print(\"----------Test---------------\")\n",
"print('labels is %s' % labels)\n",
"print('The type of labels is %s' % type(labels))\n",
"print('the shape of labels is n_samples %s' % labels.shape)\n",
"print('km.labels_ is %s' % km.labels_)\n",
"print('The type of km.labels_ is %s' % type(km.labels_))\n",
"print('the shape of km.labels_ is n_samples %s' % km.labels_.shape)\n",
"print('true_k is %s'% true_k)\n",
"print('np.unique(labels) is %s' % np.unique(labels))\n",
"print('np.unique(labels).shape is %s' % np.unique(labels).shape)\n",
"print(\"----------Test---------------\")\n",
"print()\n",
"\n",
"\n",
"\n",
"print(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels, km.labels_))\n",
"print(\"Completeness: %0.3f\" % metrics.completeness_score(labels, km.labels_))\n",
"print(\"V-measure: %0.3f\" % metrics.v_measure_score(labels, km.labels_))\n",
"print(\"Adjusted Rand-Index: %.3f\"\n",
" % metrics.adjusted_rand_score(labels, km.labels_))\n",
"print(\"Silhouette Coefficient: %0.3f\"\n",
" % metrics.silhouette_score(X, km.labels_, sample_size=1000))\n",
"\n",
"print()\n",
"\n",
"\n",
"if not opts.use_hashing:\n",
" print(\"Top terms per cluster:\")\n",
"\n",
" if opts.n_components:\n",
" original_space_centroids = svd.inverse_transform(km.cluster_centers_)\n",
" order_centroids = original_space_centroids.argsort()[:, ::-1]\n",
" else:\n",
" order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n",
"\n",
" terms = vectorizer.get_feature_names()\n",
" for i in range(true_k):\n",
" print(\"Cluster %d:\" % i, end='')\n",
" for ind in order_centroids[i, :10]:\n",
" print(' %s' % terms[ind], end='')\n",
" print()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 1
}