From d1ff495912d423821fe8a42b5e23451e42763390 Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Sat, 20 Feb 2021 12:02:05 -0800 Subject: [PATCH 01/16] example code from sklearn --- .../plot_document_clustering.ipynb | 285 ++++++++++ tour_model_eval/plot_kmeans_digits.ipynb | 531 ++++++++++++++++++ 2 files changed, 816 insertions(+) create mode 100644 tour_model_eval/plot_document_clustering.ipynb create mode 100644 tour_model_eval/plot_kmeans_digits.ipynb diff --git a/tour_model_eval/plot_document_clustering.ipynb b/tour_model_eval/plot_document_clustering.ipynb new file mode 100644 index 0000000..828ce5a --- /dev/null +++ b/tour_model_eval/plot_document_clustering.ipynb @@ -0,0 +1,285 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# Clustering text documents using k-means\n", + "\n", + "This is an example showing how scikit-learn can be used to cluster\n", + "documents by topics using a bag-of-words approach. This example uses\n", + "a scipy.sparse matrix to store the features instead of standard numpy arrays.\n", + "\n", + "Two feature extraction methods can be used in this example:\n", + "\n", + " - TfidfVectorizer uses an in-memory vocabulary (a python dict) to map the most\n", + " frequent words to feature indices and hence compute a word occurrence\n", + " frequency (sparse) matrix. The word frequencies are then reweighted using\n", + " the Inverse Document Frequency (IDF) vector collected feature-wise over\n", + " the corpus.\n", + "\n", + " - HashingVectorizer hashes word occurrences to a fixed dimensional space,\n", + " possibly with collisions. 
The word count vectors are then normalized to\n", + " each have l2-norm equal to one (projected to the euclidean unit-ball) which\n", + " seems to be important for k-means to work in high dimensional space.\n", + "\n", + " HashingVectorizer does not provide IDF weighting as this is a stateless\n", + " model (the fit method does nothing). When IDF weighting is needed it can\n", + " be added by pipelining its output to a TfidfTransformer instance.\n", + "\n", + "Two algorithms are demoed: ordinary k-means and its more scalable cousin\n", + "minibatch k-means.\n", + "\n", + "Additionally, latent semantic analysis can also be used to reduce\n", + "dimensionality and discover latent patterns in the data.\n", + "\n", + "It can be noted that k-means (and minibatch k-means) are very sensitive to\n", + "feature scaling and that in this case the IDF weighting helps improve the\n", + "quality of the clustering by quite a lot as measured against the \"ground truth\"\n", + "provided by the class label assignments of the 20 newsgroups dataset.\n", + "\n", + "This improvement is not visible in the Silhouette Coefficient which is small\n", + "for both as this measure seems to suffer from the phenomenon called\n", + "\"Concentration of Measure\" or \"Curse of Dimensionality\" for high dimensional\n", + "datasets such as text data. Other measures such as V-measure and Adjusted Rand\n", + "Index are information theoretic based evaluation scores: as they are only based\n", + "on cluster assignments rather than distances, they are not affected by the curse\n", + "of dimensionality.\n", + "\n", + "Note: as k-means is optimizing a non-convex objective function, it will likely\n", + "end up in a local optimum. 
Several runs with independent random init might be\n", + "necessary to get a good convergence.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Author: Peter Prettenhofer \n", + "# Lars Buitinck\n", + "# License: BSD 3 clause\n", + "from sklearn.datasets import fetch_20newsgroups\n", + "from sklearn.decomposition import TruncatedSVD\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.feature_extraction.text import HashingVectorizer\n", + "from sklearn.feature_extraction.text import TfidfTransformer\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import Normalizer\n", + "from sklearn import metrics\n", + "\n", + "from sklearn.cluster import KMeans, MiniBatchKMeans\n", + "\n", + "import logging\n", + "from optparse import OptionParser\n", + "import sys\n", + "from time import time\n", + "\n", + "import numpy as np\n", + "\n", + "\n", + "# Display progress logs on stdout\n", + "logging.basicConfig(level=logging.INFO,\n", + " format='%(asctime)s %(levelname)s %(message)s')\n", + "\n", + "# parse commandline arguments\n", + "op = OptionParser()\n", + "op.add_option(\"--lsa\",\n", + " dest=\"n_components\", type=\"int\",\n", + " help=\"Preprocess documents with latent semantic analysis.\")\n", + "op.add_option(\"--no-minibatch\",\n", + " action=\"store_false\", dest=\"minibatch\", default=True,\n", + " help=\"Use ordinary k-means algorithm (in batch mode).\")\n", + "op.add_option(\"--no-idf\",\n", + " action=\"store_false\", dest=\"use_idf\", default=True,\n", + " help=\"Disable Inverse Document Frequency feature weighting.\")\n", + "op.add_option(\"--use-hashing\",\n", + " action=\"store_true\", default=False,\n", + " help=\"Use a hashing feature vectorizer\")\n", + "op.add_option(\"--n-features\", type=int, default=10000,\n", + " help=\"Maximum number of features (dimensions)\"\n", + " \" to extract from text.\")\n", + 
"op.add_option(\"--verbose\",\n", + " action=\"store_true\", dest=\"verbose\", default=False,\n", + " help=\"Print progress reports inside k-means algorithm.\")\n", + "\n", + "print(__doc__)\n", + "op.print_help()\n", + "\n", + "\n", + "def is_interactive():\n", + " return not hasattr(sys.modules['__main__'], '__file__')\n", + "\n", + "\n", + "# work-around for Jupyter notebook and IPython console\n", + "argv = [] if is_interactive() else sys.argv[1:]\n", + "(opts, args) = op.parse_args(argv)\n", + "if len(args) > 0:\n", + " op.error(\"this script takes no arguments.\")\n", + " sys.exit(1)\n", + "\n", + "\n", + "# #############################################################################\n", + "# Load some categories from the training set\n", + "categories = [\n", + " 'alt.atheism',\n", + " 'talk.religion.misc',\n", + " 'comp.graphics',\n", + " 'sci.space',\n", + "]\n", + "# Uncomment the following to do the analysis on all the categories\n", + "# categories = None\n", + "\n", + "print(\"Loading 20 newsgroups dataset for categories:\")\n", + "print(categories)\n", + "\n", + "dataset = fetch_20newsgroups(subset='all', categories=categories,\n", + " shuffle=True, random_state=42)\n", + "\n", + "print(\"%d documents\" % len(dataset.data))\n", + "print(\"%d categories\" % len(dataset.target_names))\n", + "print()\n", + "\n", + "labels = dataset.target\n", + "true_k = np.unique(labels).shape[0]\n", + "\n", + "print(\"Extracting features from the training dataset \"\n", + " \"using a sparse vectorizer\")\n", + "t0 = time()\n", + "if opts.use_hashing:\n", + " if opts.use_idf:\n", + " # Perform an IDF normalization on the output of HashingVectorizer\n", + " hasher = HashingVectorizer(n_features=opts.n_features,\n", + " stop_words='english', alternate_sign=False,\n", + " norm=None)\n", + " vectorizer = make_pipeline(hasher, TfidfTransformer())\n", + " else:\n", + " vectorizer = HashingVectorizer(n_features=opts.n_features,\n", + " stop_words='english',\n", + " 
alternate_sign=False, norm='l2')\n", + "else:\n", + " vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,\n", + " min_df=2, stop_words='english',\n", + " use_idf=opts.use_idf)\n", + "X = vectorizer.fit_transform(dataset.data)\n", + "\n", + "print(\"done in %fs\" % (time() - t0))\n", + "print(\"n_samples: %d, n_features: %d\" % X.shape)\n", + "print()\n", + "\n", + "if opts.n_components:\n", + " print(\"Performing dimensionality reduction using LSA\")\n", + " t0 = time()\n", + " # Vectorizer results are normalized, which makes KMeans behave as\n", + " # spherical k-means for better results. Since LSA/SVD results are\n", + " # not normalized, we have to redo the normalization.\n", + " svd = TruncatedSVD(opts.n_components)\n", + " normalizer = Normalizer(copy=False)\n", + " lsa = make_pipeline(svd, normalizer)\n", + "\n", + " X = lsa.fit_transform(X)\n", + "\n", + " print(\"done in %fs\" % (time() - t0))\n", + "\n", + " explained_variance = svd.explained_variance_ratio_.sum()\n", + " print(\"Explained variance of the SVD step: {}%\".format(\n", + " int(explained_variance * 100)))\n", + "\n", + " print()\n", + "\n", + "\n", + "# #############################################################################\n", + "# Do the actual clustering\n", + "\n", + "if opts.minibatch:\n", + " km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,\n", + " init_size=1000, batch_size=1000, verbose=opts.verbose)\n", + "else:\n", + " km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,\n", + " verbose=opts.verbose)\n", + "\n", + "print(\"Clustering sparse data with %s\" % km)\n", + "t0 = time()\n", + "km.fit(X)\n", + "print(\"done in %0.3fs\" % (time() - t0))\n", + "print()\n", + "\n", + "# Test what labels are\n", + "np.set_printoptions(threshold=np.inf)\n", + "print(\"----------Test---------------\")\n", + "print('labels is %s' % labels)\n", + "print('The type of labels is %s' % type(labels))\n", + "print('the shape of labels is 
n_samples %s' % labels.shape)\n", + "print('km.labels_ is %s' % km.labels_)\n", + "print('The type of km.labels_ is %s' % type(km.labels_))\n", + "print('the shape of km.labels_ is n_samples %s' % km.labels_.shape)\n", + "print('true_k is %s'% true_k)\n", + "print('np.unique(labels) is %s' % np.unique(labels))\n", + "print('np.unique(labels).shape is %s' % np.unique(labels).shape)\n", + "print(\"----------Test---------------\")\n", + "print()\n", + "\n", + "\n", + "\n", + "print(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels, km.labels_))\n", + "print(\"Completeness: %0.3f\" % metrics.completeness_score(labels, km.labels_))\n", + "print(\"V-measure: %0.3f\" % metrics.v_measure_score(labels, km.labels_))\n", + "print(\"Adjusted Rand-Index: %.3f\"\n", + " % metrics.adjusted_rand_score(labels, km.labels_))\n", + "print(\"Silhouette Coefficient: %0.3f\"\n", + " % metrics.silhouette_score(X, km.labels_, sample_size=1000))\n", + "\n", + "print()\n", + "\n", + "\n", + "if not opts.use_hashing:\n", + " print(\"Top terms per cluster:\")\n", + "\n", + " if opts.n_components:\n", + " original_space_centroids = svd.inverse_transform(km.cluster_centers_)\n", + " order_centroids = original_space_centroids.argsort()[:, ::-1]\n", + " else:\n", + " order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n", + "\n", + " terms = vectorizer.get_feature_names()\n", + " for i in range(true_k):\n", + " print(\"Cluster %d:\" % i, end='')\n", + " for ind in order_centroids[i, :10]:\n", + " print(' %s' % terms[ind], end='')\n", + " print()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git 
a/tour_model_eval/plot_kmeans_digits.ipynb b/tour_model_eval/plot_kmeans_digits.ipynb new file mode 100644 index 0000000..62f7cd6 --- /dev/null +++ b/tour_model_eval/plot_kmeans_digits.ipynb @@ -0,0 +1,531 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# A demo of K-Means clustering on the handwritten digits data\n", + "\n", + "In this example we compare the various initialization strategies for K-means in\n", + "terms of runtime and quality of the results.\n", + "\n", + "As the ground truth is known here, we also apply different cluster quality\n", + "metrics to judge the goodness of fit of the cluster labels to the ground truth.\n", + "\n", + "Cluster quality metrics evaluated (see `clustering_evaluation` for\n", + "definitions and discussions of the metrics):\n", + "\n", + "=========== ========================================================\n", + "Shorthand full name\n", + "=========== ========================================================\n", + "homo homogeneity score\n", + "compl completeness score\n", + "v-meas V measure\n", + "ARI adjusted Rand index\n", + "AMI adjusted mutual information\n", + "silhouette silhouette coefficient\n", + "=========== ========================================================\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Automatically created module for IPython interactive environment\n" + ] + } + ], + "source": [ + "print(__doc__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the dataset\n", + "\n", + "We will start by loading the `digits` dataset. This dataset contains\n", + "handwritten digits from 0 to 9. 
In the context of clustering, one would like\n", + "to group images such that the handwritten digits on the image are the same.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# digits: 10; # samples: 1797; # features 64\n", + "------Test------\n", + "data.shape is (1797, 64)\n", + "labels is [0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0\n", + " 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9\n", + " 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4\n", + " 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7\n", + " 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2\n", + " 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 3 1 3 9 1\n", + " 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 5 4 8 8 4 9 0 8 9 8 0 1 2\n", + " 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9\n", + " 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8\n", + " 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2\n", + " 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 3 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0\n", + " 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2\n", + " 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7\n", + " 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 0 1\n", + " 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8\n", + " 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2\n", + " 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7\n", + " 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9\n", + " 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1\n", + " 2 6 3 3 7 3 3 4 6 6 6 4 
9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1\n", + " 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 0\n", + " 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9\n", + " 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5\n", + " 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4\n", + " 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9\n", + " 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 2 7 8 2 0 1 2 6 3\n", + " 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 4 6 3 1 3 9 1 7 6 8 4\n", + " 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 4 9 0 8 9 8 0 1 2 3 4 5 6\n", + " 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7\n", + " 7 3 5 1 0 0 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6\n", + " 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9\n", + " 5 4 8 8 4 9 0 8 9 8 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6\n", + " 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3\n", + " 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4\n", + " 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 0 1 2 3 4 5 6 7\n", + " 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7\n", + " 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7\n", + " 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7\n", + " 9 5 4 8 8 4 9 0 8 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7\n", + " 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4\n", + " 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0\n", + " 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 0 1 2 3 4 5 6 7 8\n", + " 9 0 1 2 3 4 5 6 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1\n", + " 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 
8 0 1 7 6 3 2 1 7\n", + " 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 2 5 7 9 5 4 4 9 0 8\n", + " 9 8 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6\n", + " 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5\n", + " 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7\n", + " 5 4 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8]\n", + "------Test------\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.datasets import load_digits\n", + "\n", + "data, labels = load_digits(return_X_y=True)\n", + "(n_samples, n_features), n_digits = data.shape, np.unique(labels).size\n", + "\n", + "print(\n", + " f\"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}\"\n", + ")\n", + "\n", + "print('------Test------')\n", + "np.set_printoptions(threshold=np.inf)\n", + "print('data.shape is ',data.shape)\n", + "print(\"labels is %s\" % labels)\n", + "print('------Test------')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define our evaluation benchmark\n", + "\n", + "We will first define our evaluation benchmark. During this benchmark, we intend to\n", + "compare different initialization methods for KMeans. 
Our benchmark will:\n", + "\n", + "* create a pipeline which will scale the data using a\n", + " :class:`~sklearn.preprocessing.StandardScaler`;\n", + "* train and time the pipeline fitting;\n", + "* measure the performance of the clustering obtained via different metrics.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from time import time\n", + "from sklearn import metrics\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "\n", + "def bench_k_means(kmeans, name, data, labels):\n", + " \"\"\"Benchmark to evaluate the KMeans initialization methods.\n", + "\n", + " Parameters\n", + " ----------\n", + " kmeans : KMeans instance\n", + " A :class:`~sklearn.cluster.KMeans` instance with the initialization\n", + " already set.\n", + " name : str\n", + " Name given to the strategy. It will be used to show the results in a\n", + " table.\n", + " data : ndarray of shape (n_samples, n_features)\n", + " The data to cluster.\n", + " labels : ndarray of shape (n_samples,)\n", + " The labels used to compute the clustering metrics which requires some\n", + " supervision.\n", + " \"\"\"\n", + " t0 = time()\n", + " estimator = make_pipeline(StandardScaler(), kmeans).fit(data)\n", + " fit_time = time() - t0\n", + " results = [name, fit_time, estimator[-1].inertia_]\n", + "\n", + " # Define the metrics which require only the true labels and estimator\n", + " # labels\n", + " clustering_metrics = [\n", + " metrics.homogeneity_score,\n", + " metrics.completeness_score,\n", + " metrics.v_measure_score,\n", + " metrics.adjusted_rand_score,\n", + " metrics.adjusted_mutual_info_score,\n", + " ]\n", + " results += [m(labels, estimator[-1].labels_) for m in clustering_metrics]\n", + " \n", + " print('------Test------')\n", + " print('estimator[-1].labels_', estimator[-1].labels_)\n", + " print('------Test------')\n", + "\n", + " # The silhouette 
score requires the full dataset\n", + " results += [\n", + " metrics.silhouette_score(data, estimator[-1].labels_,\n", + " metric=\"euclidean\", sample_size=300,)\n", + " ]\n", + "\n", + " # Show the results\n", + " formatter_result = (\"{:9s}\\t{:.3f}s\\t{:.0f}\\t{:.3f}\\t{:.3f}\"\n", + " \"\\t{:.3f}\\t{:.3f}\\t{:.3f}\\t{:.3f}\")\n", + " print(formatter_result.format(*results))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run the benchmark\n", + "\n", + "We will compare three approaches:\n", + "\n", + "* an initialization using `kmeans++`. This method is stochastic and we will\n", + " run the initialization 4 times;\n", + "* a random initialization. This method is stochastic as well and we will run\n", + " the initialization 4 times;\n", + "* an initialization based on a :class:`~sklearn.decomposition.PCA`\n", + " projection. Indeed, we will use the components of the\n", + " :class:`~sklearn.decomposition.PCA` to initialize KMeans. This method is\n", + " deterministic and a single initialization suffices.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "__________________________________________________________________________________\n", + "init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette\n", + "------Test------\n", + "estimator[-1].labels_ [2 8 8 3 9 3 1 0 3 3 2 6 4 3 9 5 1 0 8 3 2 6 8 3 6 5 1 6 8 3 2 3 5 5 1 5 2\n", + " 3 8 3 8 9 6 0 0 3 5 6 2 2 8 6 0 8 6 2 6 8 1 3 3 0 3 5 9 1 1 1 9 8 6 5 2 3\n", + " 3 6 5 6 2 2 6 0 1 3 5 6 0 8 1 3 6 5 3 8 0 8 8 9 3 8 9 2 5 5 1 3 1 6 0 5 9\n", + " 9 0 8 8 8 8 5 0 3 8 9 8 8 9 3 2 8 3 8 2 6 5 5 9 5 1 0 8 3 2 6 8 3 9 5 1 0\n", + " 8 3 2 6 8 3 9 5 1 0 8 3 2 3 5 5 1 5 2 3 8 3 8 9 6 0 0 3 5 6 2 2 5 5 0 8 5\n", + " 2 6 8 1 3 3 0 3 3 9 1 1 1 9 3 6 5 2 3 5 8 8 8 2 2 6 0 1 3 8 6 0 3 6 3 3 6\n", + " 0 1 8 9 0 6 9 2 5 0 1 3 1 6 0 5 9 9 0 5 8 5 8 5 5 6 8 8 9 3 2 8 3 8 2 8 
5\n", + " 3 9 3 1 7 3 3 2 6 5 3 9 5 1 0 8 0 2 8 5 3 9 3 1 7 8 3 2 3 3 3 1 3 2 3 8 3\n", + " 8 9 8 0 0 3 3 8 2 2 5 8 0 3 5 2 6 5 1 5 3 7 3 3 9 1 1 1 9 0 8 5 2 0 3 5 3\n", + " 5 2 2 8 0 1 3 5 6 0 9 1 3 8 3 0 8 0 1 8 9 5 8 9 2 3 3 1 0 1 8 0 5 9 9 0 5\n", + " 3 5 5 3 0 6 5 9 3 3 9 6 2 8 0 3 2 6 4 3 9 3 1 7 6 3 2 6 4 3 9 5 1 7 8 3 2\n", + " 6 5 3 9 8 1 7 3 3 2 3 3 5 1 3 2 3 3 3 3 9 6 7 7 3 6 6 2 2 4 5 7 3 5 2 4 3\n", + " 1 3 3 7 3 3 9 1 1 1 9 3 6 3 2 3 3 4 8 5 2 2 6 7 1 3 5 6 0 6 1 3 6 3 3 6 6\n", + " 1 3 9 3 6 9 2 5 3 1 3 8 6 7 5 6 9 7 5 3 5 8 5 0 3 0 6 8 8 9 3 2 3 3 3 2 4\n", + " 4 3 9 3 1 0 8 3 2 4 8 3 0 3 1 0 8 3 2 4 4 3 9 3 1 0 8 3 2 8 3 3 1 3 2 3 6\n", + " 3 8 9 4 0 0 3 3 4 2 2 4 4 0 1 4 2 4 4 1 3 3 0 3 3 9 1 1 1 9 3 4 3 2 3 3 4\n", + " 8 5 2 2 4 0 1 4 4 4 0 9 1 3 4 3 3 4 0 1 8 9 3 8 9 2 3 3 1 3 1 4 0 3 9 9 0\n", + " 4 8 5 4 0 0 3 3 9 6 8 9 3 2 6 3 6 2 8 5 3 9 0 1 0 3 7 2 8 5 3 9 5 1 0 8 6\n", + " 2 8 5 3 6 5 1 7 3 6 2 6 9 5 1 5 2 6 8 6 3 9 8 7 6 3 5 8 2 2 3 8 0 8 8 2 8\n", + " 3 1 3 3 0 3 3 9 1 1 1 9 6 8 1 2 6 0 8 3 8 2 2 8 0 1 3 5 8 7 9 1 3 8 3 6 8\n", + " 0 1 8 9 3 8 0 2 3 3 1 6 1 8 0 5 9 7 0 8 8 8 5 5 0 7 5 9 3 8 0 6 2 8 6 8 2\n", + " 8 5 3 9 5 1 0 1 3 2 8 5 3 9 5 2 0 6 6 2 8 5 3 9 5 1 0 8 3 2 3 5 0 1 5 2 3\n", + " 3 3 3 9 8 0 0 3 5 8 2 2 8 5 0 6 5 2 4 5 1 3 3 0 3 3 9 1 1 1 9 3 8 8 2 3 5\n", + " 5 8 5 2 2 8 0 1 3 0 8 0 9 1 3 8 3 3 8 0 1 8 9 3 8 9 2 5 3 1 3 1 8 0 5 9 9\n", + " 0 5 6 0 5 5 0 3 5 9 6 3 9 3 2 8 3 8 4 5 3 9 3 1 0 3 3 2 4 5 3 9 3 1 0 3 3\n", + " 2 4 5 3 9 3 1 0 3 3 2 3 3 3 1 3 2 3 3 3 8 9 4 0 0 3 3 4 5 0 3 4 2 4 4 1 3\n", + " 3 0 3 3 6 1 1 1 9 3 4 3 2 3 3 4 3 4 2 2 4 0 1 3 5 4 9 1 3 4 3 3 4 0 1 3 7\n", + " 3 4 9 2 5 3 1 3 1 4 7 3 9 7 0 5 3 4 5 3 0 3 3 7 7 3 2 3 3 3 2 8 5 3 7 5 1\n", + " 0 0 3 2 8 4 3 7 5 1 0 8 3 2 8 4 3 9 3 1 0 5 3 2 3 5 5 1 0 2 3 5 3 8 7 8 0\n", + " 0 3 3 8 2 2 0 8 4 2 8 4 1 3 3 0 3 3 9 1 1 1 0 3 8 6 2 3 5 4 3 4 2 2 8 0 1\n", + " 3 4 8 0 9 1 3 8 0 3 8 0 1 3 9 3 8 9 2 5 3 1 3 1 8 0 3 9 9 0 5 8 4 4 5 0 3\n", + " 9 9 3 3 9 3 2 5 3 5 2 
6 5 3 9 3 1 0 5 3 2 6 4 3 9 3 1 0 8 3 2 8 5 3 9 5 1\n", + " 0 5 3 2 3 3 5 1 5 2 3 5 3 5 9 8 0 0 5 1 8 2 2 5 5 0 5 5 2 8 5 1 3 3 0 3 3\n", + " 9 1 1 1 9 3 8 5 2 3 3 5 5 3 2 2 8 0 1 5 5 8 0 9 1 3 8 3 3 8 0 1 8 9 3 8 9\n", + " 2 5 5 1 3 1 8 0 5 9 9 0 5 5 5 5 5 0 3 5 9 8 8 9 3 2 8 3 8 2 6 4 3 9 3 1 5\n", + " 8 3 2 8 4 3 9 5 1 0 5 3 2 6 3 3 9 3 1 0 8 3 2 3 5 3 1 5 2 3 8 3 8 9 8 0 0\n", + " 3 3 8 2 2 4 4 0 8 4 2 6 4 1 3 3 0 3 3 9 1 1 1 9 3 8 3 2 3 1 5 8 3 2 2 6 0\n", + " 1 3 4 8 0 9 1 3 8 3 3 8 0 1 8 9 3 8 9 2 8 3 1 3 1 8 0 8 9 9 0 4 8 4 4 3 0\n", + " 3 5 9 8 8 9 3 2 8 2 6 5 3 9 3 1 0 3 3 2 6 5 3 9 3 1 0 3 3 2 6 5 3 9 3 1 0\n", + " 3 3 2 3 3 3 1 3 2 3 3 3 3 9 6 0 0 3 3 6 2 2 5 5 7 6 5 2 6 5 1 3 3 0 3 3 9\n", + " 1 1 1 9 3 6 3 2 3 3 5 3 5 2 2 6 0 1 3 5 6 0 9 1 3 6 3 3 6 0 1 3 9 3 6 9 2\n", + " 3 3 1 3 1 6 0 3 9 9 0 5 6 5 5 3 0 3 3 9 6 8 9 3 2 6 3 8 2 8 4 3 9 5 6 5 8\n", + " 6 2 8 4 3 9 5 1 6 2 8 4 3 9 5 8 0 8 7 9 6 5 5 1 8 2 6 8 5 8 9 8 0 5 5 5 8\n", + " 2 2 4 5 5 8 4 2 8 5 1 8 0 5 0 8 9 1 1 1 8 6 8 5 2 3 8 4 8 2 8 0 1 3 5 8 5\n", + " 8 1 3 8 3 0 8 5 1 8 9 3 8 9 2 5 3 8 3 8 8 0 8 9 9 0 4 3 5 5 3 8 5 9 5 2 8\n", + " 0 0 2 8 5 3 9 3 1 0 3 3 2 8 8 8 9 3 1 0 5 3 2 8 5 5 9 9 1 0 8 3 2 3 3 6 1\n", + " 3 2 3 3 3 5 7 8 0 0 0 3 8 2 2 5 5 0 8 5 2 8 5 1 8 0 0 3 0 9 1 1 1 9 3 8 3\n", + " 2 3 3 5 8 5 2 2 8 0 1 0 8 8 0 9 1 3 8 3 3 8 0 1 8 9 8 8 9 2 3 3 1 3 1 8 0\n", + " 3 9 9 0 5 8 5 5 3 0 3 3 9 8 8 9 3 2 8 3 3]\n", + "------Test------\n", + "k-means++\t0.157s\t69485\t0.613\t0.660\t0.636\t0.482\t0.632\t0.134\n", + "------Test------\n", + "estimator[-1].labels_ [8 5 5 7 6 7 4 0 7 7 8 0 1 7 6 9 4 0 5 7 8 0 5 0 6 5 4 0 5 7 8 7 9 9 4 9 8\n", + " 7 5 7 5 6 0 0 0 7 5 0 8 8 5 0 0 5 0 8 0 0 4 7 7 0 7 9 6 4 4 4 6 5 0 9 8 7\n", + " 5 0 5 0 8 8 0 0 4 7 9 0 0 5 4 7 0 9 7 5 0 5 5 6 7 5 6 8 5 9 4 7 4 0 0 5 6\n", + " 6 0 5 5 0 5 9 0 7 5 6 5 5 6 7 8 5 7 5 8 0 9 9 6 9 4 0 5 7 8 0 5 9 6 9 4 5\n", + " 5 7 8 0 5 0 6 9 4 5 5 7 8 7 9 9 4 9 8 7 5 7 5 6 0 0 0 7 9 0 8 8 5 5 0 5 5\n", + " 8 0 5 4 7 7 5 7 7 6 4 
4 4 6 7 0 9 8 7 9 5 5 5 8 8 0 5 4 7 5 0 5 7 0 7 7 0\n", + " 0 4 5 6 0 0 6 8 9 0 4 7 4 0 5 5 6 6 5 9 5 9 5 9 9 6 5 5 6 7 8 5 7 5 8 5 9\n", + " 7 6 7 4 2 7 0 8 0 9 7 6 5 4 0 5 0 8 5 9 7 6 7 4 2 5 7 8 7 7 7 4 7 8 7 5 7\n", + " 5 6 5 0 0 7 7 5 8 8 9 5 0 7 5 8 0 9 4 9 7 2 7 7 6 4 4 4 6 0 5 5 8 0 7 9 7\n", + " 9 8 8 5 0 4 7 5 5 0 6 4 7 5 9 0 5 5 4 5 6 9 5 6 8 7 7 4 0 4 5 5 9 6 6 0 9\n", + " 7 9 9 7 0 0 9 6 7 7 6 0 8 5 0 7 8 0 1 7 6 5 4 2 0 7 8 5 1 7 6 5 4 2 5 7 8\n", + " 0 9 7 6 5 4 2 0 7 8 7 7 5 4 7 8 7 7 7 5 6 0 2 2 7 0 5 8 8 1 9 2 7 9 8 1 7\n", + " 4 7 7 2 7 7 6 4 4 4 6 7 0 0 8 7 5 1 0 9 8 8 5 2 4 7 9 5 0 6 4 7 0 7 7 0 0\n", + " 4 7 6 9 5 6 8 9 7 4 7 5 0 2 5 6 6 2 9 7 9 0 5 0 7 0 6 5 5 6 0 8 7 7 7 8 1\n", + " 1 7 6 7 4 5 0 7 8 1 5 7 6 7 4 0 5 7 8 1 1 7 6 7 4 0 5 7 8 5 7 7 4 7 8 7 0\n", + " 7 5 6 1 0 0 7 7 1 8 8 3 1 0 4 1 8 1 1 4 7 7 0 7 7 6 4 4 4 6 7 1 7 8 7 7 1\n", + " 4 9 8 8 1 0 4 1 1 1 0 6 4 7 1 7 7 1 0 4 5 6 7 5 6 8 7 7 4 7 4 1 0 7 6 6 5\n", + " 1 0 9 1 0 0 7 7 6 0 0 6 7 8 0 7 0 8 5 9 7 6 0 4 0 7 2 8 5 5 7 6 5 4 0 5 0\n", + " 8 5 7 7 6 5 4 2 7 0 8 0 6 5 4 5 8 0 5 0 5 6 0 2 0 7 5 0 8 8 7 5 0 5 5 8 5\n", + " 5 4 7 7 0 7 7 6 4 4 4 6 0 5 5 8 0 0 5 7 5 8 8 0 0 4 7 9 5 2 6 4 7 5 7 0 5\n", + " 0 4 5 6 7 5 0 8 7 7 4 0 4 5 0 5 6 2 0 5 5 5 5 5 0 2 9 6 7 5 0 2 8 5 0 5 8\n", + " 5 9 7 6 9 4 0 4 7 8 5 9 7 6 9 8 5 0 7 8 5 9 7 6 9 4 0 5 7 8 7 5 5 4 5 8 7\n", + " 7 7 7 6 5 5 0 7 5 5 8 8 5 9 5 0 9 8 1 9 4 7 7 0 0 0 6 4 4 4 6 7 5 5 8 7 9\n", + " 9 5 9 8 8 5 0 4 7 5 5 0 6 4 7 5 7 7 5 5 4 5 6 7 5 6 8 5 7 4 7 4 0 5 9 6 6\n", + " 0 9 0 0 5 5 5 7 9 6 0 7 6 7 8 5 7 5 1 9 7 6 7 4 0 7 7 8 1 9 7 6 7 4 5 7 7\n", + " 8 1 9 7 6 7 4 0 7 7 8 7 7 7 4 7 8 7 7 7 5 6 1 0 0 7 7 1 9 0 7 1 8 1 1 4 7\n", + " 7 0 7 7 6 4 4 4 6 7 1 7 8 7 7 1 7 1 8 8 1 0 4 7 9 1 6 4 7 1 7 7 1 0 4 7 2\n", + " 7 1 6 8 9 7 4 7 4 1 2 7 6 2 0 1 7 1 9 7 0 7 7 2 2 7 8 7 7 7 8 5 9 7 2 5 4\n", + " 0 0 7 8 5 5 7 2 5 4 0 5 7 8 5 1 7 6 7 4 0 9 7 8 7 5 5 4 5 8 7 9 7 5 2 5 0\n", + " 0 7 7 5 8 8 0 5 1 8 5 1 4 3 7 5 7 7 6 4 4 4 0 7 5 0 8 
7 9 1 7 1 8 8 5 0 1\n", + " 7 1 5 0 6 4 7 5 0 7 5 0 4 5 6 7 5 6 8 5 7 4 7 4 5 0 7 6 6 5 9 5 1 1 5 0 7\n", + " 5 6 5 7 6 7 8 9 7 9 8 5 9 7 6 7 4 0 7 7 8 0 1 7 6 7 4 0 5 7 8 5 9 7 6 9 4\n", + " 0 5 7 8 7 7 9 4 9 8 7 9 7 5 6 5 0 0 0 4 5 8 8 9 5 0 5 9 8 5 9 4 7 7 0 7 7\n", + " 6 4 4 4 6 7 5 5 8 7 7 9 9 7 8 8 5 0 4 9 9 5 5 6 4 7 5 7 7 5 5 4 5 6 7 5 6\n", + " 8 9 9 4 7 4 3 0 5 6 6 0 9 3 9 9 9 0 7 9 6 5 5 6 7 8 5 7 5 8 0 1 9 6 7 4 9\n", + " 5 7 8 5 1 7 6 9 4 0 5 7 8 0 5 7 6 5 4 0 5 7 8 7 9 5 4 5 8 7 5 7 5 6 5 0 0\n", + " 7 7 5 8 8 1 1 0 5 1 8 0 1 4 7 7 0 7 7 6 4 4 4 6 7 5 7 8 7 4 9 5 7 8 8 0 0\n", + " 4 7 1 5 5 6 4 7 5 7 7 5 0 4 5 6 7 5 6 8 5 7 4 7 4 5 5 5 6 6 0 1 5 1 1 4 0\n", + " 7 7 6 5 5 6 7 8 5 8 0 9 7 6 7 4 0 7 7 8 0 9 7 6 7 4 0 7 7 8 0 9 7 6 7 4 0\n", + " 7 7 8 7 7 7 4 7 8 7 7 7 7 6 0 0 0 7 7 0 8 8 9 9 2 0 9 8 0 9 4 7 7 0 7 7 6\n", + " 4 4 4 6 7 0 7 8 7 7 9 0 9 8 8 0 0 4 7 9 0 0 6 4 7 0 7 7 0 0 4 0 6 7 0 6 8\n", + " 7 7 4 7 4 0 0 7 6 6 0 9 0 9 9 7 0 7 7 6 0 0 6 7 8 0 7 0 8 5 1 7 6 5 4 9 5\n", + " 0 8 5 1 0 6 5 4 0 8 5 1 7 6 5 4 5 5 2 6 0 5 9 4 5 8 0 0 5 5 6 5 0 9 5 9 5\n", + " 4 8 1 9 9 5 1 8 5 9 4 5 5 9 5 5 6 4 4 4 5 0 5 5 8 7 5 1 0 8 5 5 4 7 9 5 9\n", + " 5 4 7 5 5 0 5 9 4 5 6 7 5 6 8 5 7 4 7 5 5 0 5 6 6 5 1 7 5 9 7 5 5 6 5 8 5\n", + " 0 0 8 5 9 7 6 7 4 0 7 7 8 5 5 5 6 7 4 0 9 7 8 5 5 9 6 6 4 0 5 7 8 7 5 0 4\n", + " 5 8 7 7 7 9 2 5 0 0 0 0 5 8 8 9 9 0 5 9 8 5 9 4 5 0 5 7 0 6 4 4 4 6 7 5 7\n", + " 8 7 7 9 5 9 8 8 5 6 4 0 5 5 6 6 4 7 5 7 7 5 0 4 5 6 5 5 6 8 7 7 4 7 4 5 5\n", + " 7 6 6 0 9 5 9 5 7 5 7 7 6 5 5 6 7 8 5 7 7]\n", + "------Test------\n", + "random \t0.074s\t69952\t0.545\t0.616\t0.578\t0.415\t0.574\t0.117\n", + "------Test------\n", + "estimator[-1].labels_ [1 2 2 7 0 8 0 5 2 8 1 9 6 7 0 4 0 5 2 8 1 9 2 7 9 4 0 9 2 8 1 8 4 3 0 4 1\n", + " 8 2 8 2 0 9 5 5 7 4 9 1 1 2 2 5 2 9 1 9 2 0 7 7 5 7 7 0 0 0 0 0 2 9 4 1 8\n", + " 4 2 2 9 1 1 9 5 0 7 3 2 5 4 0 7 9 7 8 2 5 2 2 0 7 2 0 1 4 3 0 8 2 9 5 4 0\n", + " 0 5 2 2 2 2 4 5 8 4 2 2 2 0 7 1 2 8 2 1 9 3 7 9 3 0 5 2 
8 1 9 3 7 0 3 0 5\n", + " 2 8 1 9 2 7 0 4 0 5 2 8 1 8 4 3 0 4 1 8 2 8 2 0 9 5 5 7 3 9 1 1 3 3 5 2 3\n", + " 1 9 2 0 7 7 5 7 7 0 0 0 0 0 8 9 4 1 8 3 2 2 2 1 1 9 5 0 7 2 9 5 7 9 7 8 9\n", + " 5 0 2 0 7 9 0 1 4 5 0 8 0 9 5 4 0 0 5 3 2 3 2 3 3 9 2 2 0 8 1 2 8 2 1 2 3\n", + " 7 0 8 0 5 8 4 1 2 3 7 0 4 0 5 2 5 1 2 3 7 0 4 0 5 2 8 1 8 4 4 0 8 1 8 2 8\n", + " 2 0 2 5 5 7 4 2 1 1 3 2 5 8 3 1 9 3 0 7 7 9 7 7 0 0 0 0 0 5 2 4 1 5 4 3 8\n", + " 3 1 1 2 5 0 7 7 2 5 0 0 7 2 7 5 2 5 0 2 0 7 2 5 1 8 7 0 5 0 2 5 4 0 0 5 3\n", + " 8 3 3 8 5 9 3 0 8 8 0 9 1 2 5 7 1 9 6 7 0 4 0 9 2 8 1 2 6 7 0 4 0 9 2 8 1\n", + " 9 3 8 0 4 0 9 8 8 1 8 4 4 0 8 1 8 8 8 2 0 9 9 5 7 4 2 1 1 6 3 9 8 3 1 6 7\n", + " 0 7 8 9 8 8 0 0 0 0 0 8 9 4 1 8 4 6 2 3 1 1 2 9 0 7 3 2 5 9 0 7 9 7 8 9 4\n", + " 0 8 0 7 2 0 1 4 7 0 8 2 9 9 4 9 0 9 3 8 3 2 4 5 8 4 9 2 2 0 8 1 8 8 8 1 6\n", + " 6 8 0 4 0 5 2 8 1 6 2 8 5 8 0 5 2 8 1 6 6 8 0 4 0 5 2 8 1 2 4 8 0 8 1 8 2\n", + " 8 2 0 6 5 5 8 4 6 1 1 6 6 5 0 6 1 6 6 0 8 8 5 8 8 0 0 0 0 0 8 6 4 1 8 8 6\n", + " 2 3 1 1 6 5 0 6 6 6 5 0 0 8 6 8 8 6 5 0 2 0 8 2 0 1 8 8 0 8 0 6 5 4 0 0 5\n", + " 6 2 3 6 4 5 8 4 0 9 2 5 8 1 2 8 2 1 2 3 7 0 4 0 5 8 9 1 2 7 8 0 4 0 5 2 9\n", + " 1 2 7 7 9 4 0 9 8 9 1 9 0 4 0 4 1 9 2 9 8 9 2 9 9 8 4 2 1 1 7 7 5 2 2 1 2\n", + " 8 0 7 7 5 7 7 9 0 0 0 0 9 2 4 1 9 4 7 8 2 1 1 2 5 0 7 3 2 9 0 0 7 2 7 9 2\n", + " 5 0 2 0 7 2 5 1 4 7 0 9 0 2 5 4 0 9 5 2 2 2 7 4 5 9 4 0 8 2 5 9 1 8 9 2 1\n", + " 2 3 7 0 4 0 5 2 8 1 2 3 7 0 4 1 5 9 8 1 2 3 7 0 4 0 5 2 8 1 8 4 4 0 4 1 8\n", + " 8 8 8 0 2 5 5 7 4 2 1 1 2 3 5 8 3 1 6 3 0 7 7 5 7 7 0 0 0 0 0 8 2 4 1 8 4\n", + " 3 2 3 1 1 2 5 0 7 2 2 5 0 0 7 2 7 8 2 5 0 2 0 7 2 0 1 4 7 0 8 0 2 5 4 0 0\n", + " 5 3 2 5 7 4 5 8 4 0 8 8 0 8 1 2 8 2 6 3 8 0 4 0 5 8 8 1 6 3 7 0 8 0 5 8 8\n", + " 1 6 3 7 9 8 0 5 7 8 1 8 8 8 0 8 1 8 8 8 2 9 6 5 5 7 8 6 3 5 8 6 1 6 6 0 7\n", + " 7 5 8 8 9 0 0 0 9 8 6 8 1 8 8 6 8 6 1 1 6 5 0 8 3 6 0 0 8 6 8 8 6 5 0 8 9\n", + " 7 6 9 1 4 7 0 8 0 6 5 8 0 9 5 6 8 6 3 8 5 8 7 9 9 8 1 8 8 8 1 2 3 7 9 4 0\n", + " 5 5 7 
1 2 2 8 9 4 0 5 2 8 1 2 6 7 9 4 0 5 3 8 1 8 4 4 0 4 1 8 3 8 2 9 2 5\n", + " 5 7 4 2 1 1 5 2 6 1 2 6 0 7 7 5 7 7 9 0 0 0 5 8 2 4 1 4 4 6 7 6 1 1 2 5 6\n", + " 7 6 2 5 9 0 7 2 5 8 2 5 0 2 9 7 2 0 1 4 7 0 8 0 2 5 4 0 9 5 3 2 6 6 4 5 8\n", + " 4 0 8 7 0 8 1 3 8 3 1 2 3 7 0 4 0 5 7 8 1 9 6 7 0 4 0 5 2 8 1 2 3 7 0 3 0\n", + " 5 2 8 1 8 4 4 0 4 1 8 4 8 4 0 2 5 5 4 4 2 1 1 3 7 5 4 3 1 2 3 0 7 7 5 7 7\n", + " 0 0 0 0 0 8 2 4 1 8 4 3 4 7 1 1 2 5 0 7 7 2 5 0 0 7 2 7 8 2 5 0 2 0 7 2 0\n", + " 1 3 3 0 8 0 4 5 4 0 0 5 3 3 3 3 3 5 8 3 0 2 2 0 8 1 2 8 2 1 9 6 7 0 8 0 3\n", + " 2 8 1 2 6 7 0 4 0 5 2 8 1 9 2 7 0 4 6 5 2 8 1 8 4 4 0 4 1 8 2 8 2 0 2 5 5\n", + " 8 4 2 1 1 6 6 5 2 6 1 9 6 0 7 7 5 8 8 0 0 0 0 0 8 2 4 1 8 0 3 2 8 1 1 9 5\n", + " 0 7 6 2 5 0 6 7 2 7 8 2 5 0 2 0 8 2 0 1 4 7 0 8 0 2 5 4 0 0 5 6 2 6 6 8 5\n", + " 8 4 0 2 2 0 8 1 2 1 9 3 7 0 4 0 5 8 8 1 9 3 7 0 8 0 5 8 8 1 9 3 7 0 8 0 5\n", + " 8 8 1 8 8 4 0 8 1 8 8 8 8 0 9 5 5 7 8 9 1 1 3 3 9 9 3 1 9 3 0 7 7 5 7 7 0\n", + " 0 0 0 0 8 9 8 1 7 4 3 8 3 1 1 9 5 0 7 3 9 5 0 0 7 9 7 8 9 5 0 8 0 8 9 0 1\n", + " 8 7 0 8 0 9 5 8 0 0 5 3 8 7 3 4 5 8 4 0 9 2 0 8 1 9 8 2 1 2 6 7 0 4 0 3 2\n", + " 9 1 2 6 7 0 4 0 9 1 2 6 7 0 4 0 5 4 9 0 9 4 3 0 4 1 9 2 4 2 0 2 5 3 7 3 2\n", + " 0 1 6 3 3 2 6 1 2 3 0 2 5 3 5 2 5 0 0 0 2 9 2 4 0 8 2 6 2 1 2 5 0 7 3 2 3\n", + " 2 0 7 2 7 5 2 3 0 2 0 7 2 0 1 4 7 0 8 0 2 5 4 0 5 5 6 8 4 3 7 4 4 0 4 1 2\n", + " 5 5 1 2 3 7 0 4 0 5 7 8 1 2 2 7 0 4 0 5 3 8 1 2 3 3 0 0 0 5 2 8 1 8 4 4 0\n", + " 4 1 8 8 8 3 9 2 5 5 5 4 2 1 1 3 3 5 2 3 1 2 3 0 2 2 5 8 2 9 0 0 0 0 8 2 4\n", + " 1 8 0 3 2 3 1 1 2 5 0 7 2 2 5 0 0 7 2 7 8 2 5 0 2 0 4 2 0 1 8 7 0 8 0 2 5\n", + " 8 0 0 5 3 2 3 3 8 5 8 8 0 2 2 0 8 1 2 8 8]\n", + "------Test------\n", + "PCA-based\t0.030s\t72686\t0.636\t0.658\t0.647\t0.521\t0.643\t0.138\n", + "__________________________________________________________________________________\n" + ] + } + ], + "source": [ + "from sklearn.cluster import KMeans\n", + "from sklearn.decomposition import PCA\n", + "\n", + 
"print(82 * '_')\n", + "print('init\\t\\ttime\\tinertia\\thomo\\tcompl\\tv-meas\\tARI\\tAMI\\tsilhouette')\n", + "\n", + "kmeans = KMeans(init=\"k-means++\", n_clusters=n_digits, n_init=4,\n", + " random_state=0)\n", + "bench_k_means(kmeans=kmeans, name=\"k-means++\", data=data, labels=labels)\n", + "\n", + "kmeans = KMeans(init=\"random\", n_clusters=n_digits, n_init=4, random_state=0)\n", + "bench_k_means(kmeans=kmeans, name=\"random\", data=data, labels=labels)\n", + "\n", + "pca = PCA(n_components=n_digits).fit(data)\n", + "kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)\n", + "bench_k_means(kmeans=kmeans, name=\"PCA-based\", data=data, labels=labels)\n", + "\n", + "print(82 * '_')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize the results on PCA-reduced data\n", + "\n", + ":class:`~sklearn.decomposition.PCA` allows to project the data from the\n", + "original 64-dimensional space into a lower dimensional space. Subsequently,\n", + "we can use :class:`~sklearn.decomposition.PCA` to project into a\n", + "2-dimensional space and plot the data and the clusters in this new space.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "reduced_data = PCA(n_components=2).fit_transform(data)\n", + "kmeans = KMeans(init=\"k-means++\", n_clusters=n_digits, n_init=4)\n", + "kmeans.fit(reduced_data)\n", + "\n", + "# Step size of the mesh. Decrease to increase the quality of the VQ.\n", + "h = .02 # point in the mesh [x_min, x_max]x[y_min, y_max].\n", + "\n", + "# Plot the decision boundary. For that, we will assign a color to each\n", + "x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1\n", + "y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1\n", + "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n", + "\n", + "# Obtain labels for each point in mesh. Use last trained model.\n", + "Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])\n", + "\n", + "# Put the result into a color plot\n", + "Z = Z.reshape(xx.shape)\n", + "plt.figure(1)\n", + "plt.clf()\n", + "plt.imshow(Z, interpolation=\"nearest\",\n", + " extent=(xx.min(), xx.max(), yy.min(), yy.max()),\n", + " cmap=plt.cm.Paired, aspect=\"auto\", origin=\"lower\")\n", + "\n", + "plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)\n", + "# Plot the centroids as a white X\n", + "centroids = kmeans.cluster_centers_\n", + "plt.scatter(centroids[:, 0], centroids[:, 1], marker=\"x\", s=169, linewidths=3,\n", + " color=\"w\", zorder=10)\n", + "plt.title(\"K-means clustering on the digits dataset (PCA-reduced data)\\n\"\n", + " \"Centroids are marked with white cross\")\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(y_min, y_max)\n", + "plt.xticks(())\n", + "plt.yticks(())\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From ffedea52ede637a710d250cd93d4e285bfb165a7 Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Sat, 20 Feb 2021 12:13:37 -0800 Subject: [PATCH 02/16] clean up output --- tour_model_eval/plot_kmeans_digits.ipynb | 265 +---------------------- 1 file changed, 10 insertions(+), 255 deletions(-) diff --git a/tour_model_eval/plot_kmeans_digits.ipynb b/tour_model_eval/plot_kmeans_digits.ipynb index 62f7cd6..e88eb15 100644 --- a/tour_model_eval/plot_kmeans_digits.ipynb +++ b/tour_model_eval/plot_kmeans_digits.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -39,17 +39,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Automatically created module for IPython interactive environment\n" - ] - } - ], + "outputs": [], "source": [ "print(__doc__)" ] @@ -68,69 +60,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "# digits: 10; # samples: 1797; # features 64\n", - "------Test------\n", - "data.shape is (1797, 64)\n", - "labels is [0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0\n", - " 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9\n", - " 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4\n", - " 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7\n", - " 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2\n", - " 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 3 1 3 9 1\n", - " 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 
8 2 2 5 5 4 8 8 4 9 0 8 9 8 0 1 2\n", - " 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9\n", - " 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8\n", - " 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2\n", - " 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 3 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0\n", - " 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2\n", - " 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7\n", - " 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 0 1\n", - " 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8\n", - " 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2\n", - " 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7\n", - " 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9\n", - " 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1\n", - " 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1\n", - " 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 0\n", - " 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9\n", - " 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5\n", - " 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4\n", - " 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9\n", - " 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 2 7 8 2 0 1 2 6 3\n", - " 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 4 6 3 1 3 9 1 7 6 8 4\n", - " 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 4 9 0 8 9 8 0 1 2 3 4 5 6\n", - " 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7\n", - " 7 3 5 1 0 0 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6\n", - " 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 
9\n", - " 5 4 8 8 4 9 0 8 9 8 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6\n", - " 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3\n", - " 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4\n", - " 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 0 1 2 3 4 5 6 7\n", - " 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7\n", - " 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7\n", - " 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7\n", - " 9 5 4 8 8 4 9 0 8 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7\n", - " 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4\n", - " 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0\n", - " 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 0 1 2 3 4 5 6 7 8\n", - " 9 0 1 2 3 4 5 6 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1\n", - " 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 0 1 7 6 3 2 1 7\n", - " 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 2 5 7 9 5 4 4 9 0 8\n", - " 9 8 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6\n", - " 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5\n", - " 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6 8 4 3 1 4 0 5 3 6 9 6 1 7\n", - " 5 4 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8]\n", - "------Test------\n" - ] - } - ], + "outputs": [], "source": [ "import numpy as np\n", "from sklearn.datasets import load_digits\n", @@ -167,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -247,175 +179,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"__________________________________________________________________________________\n", - "init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette\n", - "------Test------\n", - "estimator[-1].labels_ [2 8 8 3 9 3 1 0 3 3 2 6 4 3 9 5 1 0 8 3 2 6 8 3 6 5 1 6 8 3 2 3 5 5 1 5 2\n", - " 3 8 3 8 9 6 0 0 3 5 6 2 2 8 6 0 8 6 2 6 8 1 3 3 0 3 5 9 1 1 1 9 8 6 5 2 3\n", - " 3 6 5 6 2 2 6 0 1 3 5 6 0 8 1 3 6 5 3 8 0 8 8 9 3 8 9 2 5 5 1 3 1 6 0 5 9\n", - " 9 0 8 8 8 8 5 0 3 8 9 8 8 9 3 2 8 3 8 2 6 5 5 9 5 1 0 8 3 2 6 8 3 9 5 1 0\n", - " 8 3 2 6 8 3 9 5 1 0 8 3 2 3 5 5 1 5 2 3 8 3 8 9 6 0 0 3 5 6 2 2 5 5 0 8 5\n", - " 2 6 8 1 3 3 0 3 3 9 1 1 1 9 3 6 5 2 3 5 8 8 8 2 2 6 0 1 3 8 6 0 3 6 3 3 6\n", - " 0 1 8 9 0 6 9 2 5 0 1 3 1 6 0 5 9 9 0 5 8 5 8 5 5 6 8 8 9 3 2 8 3 8 2 8 5\n", - " 3 9 3 1 7 3 3 2 6 5 3 9 5 1 0 8 0 2 8 5 3 9 3 1 7 8 3 2 3 3 3 1 3 2 3 8 3\n", - " 8 9 8 0 0 3 3 8 2 2 5 8 0 3 5 2 6 5 1 5 3 7 3 3 9 1 1 1 9 0 8 5 2 0 3 5 3\n", - " 5 2 2 8 0 1 3 5 6 0 9 1 3 8 3 0 8 0 1 8 9 5 8 9 2 3 3 1 0 1 8 0 5 9 9 0 5\n", - " 3 5 5 3 0 6 5 9 3 3 9 6 2 8 0 3 2 6 4 3 9 3 1 7 6 3 2 6 4 3 9 5 1 7 8 3 2\n", - " 6 5 3 9 8 1 7 3 3 2 3 3 5 1 3 2 3 3 3 3 9 6 7 7 3 6 6 2 2 4 5 7 3 5 2 4 3\n", - " 1 3 3 7 3 3 9 1 1 1 9 3 6 3 2 3 3 4 8 5 2 2 6 7 1 3 5 6 0 6 1 3 6 3 3 6 6\n", - " 1 3 9 3 6 9 2 5 3 1 3 8 6 7 5 6 9 7 5 3 5 8 5 0 3 0 6 8 8 9 3 2 3 3 3 2 4\n", - " 4 3 9 3 1 0 8 3 2 4 8 3 0 3 1 0 8 3 2 4 4 3 9 3 1 0 8 3 2 8 3 3 1 3 2 3 6\n", - " 3 8 9 4 0 0 3 3 4 2 2 4 4 0 1 4 2 4 4 1 3 3 0 3 3 9 1 1 1 9 3 4 3 2 3 3 4\n", - " 8 5 2 2 4 0 1 4 4 4 0 9 1 3 4 3 3 4 0 1 8 9 3 8 9 2 3 3 1 3 1 4 0 3 9 9 0\n", - " 4 8 5 4 0 0 3 3 9 6 8 9 3 2 6 3 6 2 8 5 3 9 0 1 0 3 7 2 8 5 3 9 5 1 0 8 6\n", - " 2 8 5 3 6 5 1 7 3 6 2 6 9 5 1 5 2 6 8 6 3 9 8 7 6 3 5 8 2 2 3 8 0 8 8 2 8\n", - " 3 1 3 3 0 3 3 9 1 1 1 9 6 8 1 2 6 0 8 3 8 2 2 8 0 1 3 5 8 7 9 1 3 8 3 6 8\n", - " 0 1 8 9 3 8 0 2 3 3 1 6 1 8 0 5 9 7 0 8 8 8 5 5 0 7 5 9 3 8 0 6 2 8 6 8 2\n", - " 8 5 3 9 5 1 0 1 3 2 8 5 3 9 5 2 0 6 6 2 8 5 3 9 5 1 0 8 3 2 3 5 0 1 
5 2 3\n", - " 3 3 3 9 8 0 0 3 5 8 2 2 8 5 0 6 5 2 4 5 1 3 3 0 3 3 9 1 1 1 9 3 8 8 2 3 5\n", - " 5 8 5 2 2 8 0 1 3 0 8 0 9 1 3 8 3 3 8 0 1 8 9 3 8 9 2 5 3 1 3 1 8 0 5 9 9\n", - " 0 5 6 0 5 5 0 3 5 9 6 3 9 3 2 8 3 8 4 5 3 9 3 1 0 3 3 2 4 5 3 9 3 1 0 3 3\n", - " 2 4 5 3 9 3 1 0 3 3 2 3 3 3 1 3 2 3 3 3 8 9 4 0 0 3 3 4 5 0 3 4 2 4 4 1 3\n", - " 3 0 3 3 6 1 1 1 9 3 4 3 2 3 3 4 3 4 2 2 4 0 1 3 5 4 9 1 3 4 3 3 4 0 1 3 7\n", - " 3 4 9 2 5 3 1 3 1 4 7 3 9 7 0 5 3 4 5 3 0 3 3 7 7 3 2 3 3 3 2 8 5 3 7 5 1\n", - " 0 0 3 2 8 4 3 7 5 1 0 8 3 2 8 4 3 9 3 1 0 5 3 2 3 5 5 1 0 2 3 5 3 8 7 8 0\n", - " 0 3 3 8 2 2 0 8 4 2 8 4 1 3 3 0 3 3 9 1 1 1 0 3 8 6 2 3 5 4 3 4 2 2 8 0 1\n", - " 3 4 8 0 9 1 3 8 0 3 8 0 1 3 9 3 8 9 2 5 3 1 3 1 8 0 3 9 9 0 5 8 4 4 5 0 3\n", - " 9 9 3 3 9 3 2 5 3 5 2 6 5 3 9 3 1 0 5 3 2 6 4 3 9 3 1 0 8 3 2 8 5 3 9 5 1\n", - " 0 5 3 2 3 3 5 1 5 2 3 5 3 5 9 8 0 0 5 1 8 2 2 5 5 0 5 5 2 8 5 1 3 3 0 3 3\n", - " 9 1 1 1 9 3 8 5 2 3 3 5 5 3 2 2 8 0 1 5 5 8 0 9 1 3 8 3 3 8 0 1 8 9 3 8 9\n", - " 2 5 5 1 3 1 8 0 5 9 9 0 5 5 5 5 5 0 3 5 9 8 8 9 3 2 8 3 8 2 6 4 3 9 3 1 5\n", - " 8 3 2 8 4 3 9 5 1 0 5 3 2 6 3 3 9 3 1 0 8 3 2 3 5 3 1 5 2 3 8 3 8 9 8 0 0\n", - " 3 3 8 2 2 4 4 0 8 4 2 6 4 1 3 3 0 3 3 9 1 1 1 9 3 8 3 2 3 1 5 8 3 2 2 6 0\n", - " 1 3 4 8 0 9 1 3 8 3 3 8 0 1 8 9 3 8 9 2 8 3 1 3 1 8 0 8 9 9 0 4 8 4 4 3 0\n", - " 3 5 9 8 8 9 3 2 8 2 6 5 3 9 3 1 0 3 3 2 6 5 3 9 3 1 0 3 3 2 6 5 3 9 3 1 0\n", - " 3 3 2 3 3 3 1 3 2 3 3 3 3 9 6 0 0 3 3 6 2 2 5 5 7 6 5 2 6 5 1 3 3 0 3 3 9\n", - " 1 1 1 9 3 6 3 2 3 3 5 3 5 2 2 6 0 1 3 5 6 0 9 1 3 6 3 3 6 0 1 3 9 3 6 9 2\n", - " 3 3 1 3 1 6 0 3 9 9 0 5 6 5 5 3 0 3 3 9 6 8 9 3 2 6 3 8 2 8 4 3 9 5 6 5 8\n", - " 6 2 8 4 3 9 5 1 6 2 8 4 3 9 5 8 0 8 7 9 6 5 5 1 8 2 6 8 5 8 9 8 0 5 5 5 8\n", - " 2 2 4 5 5 8 4 2 8 5 1 8 0 5 0 8 9 1 1 1 8 6 8 5 2 3 8 4 8 2 8 0 1 3 5 8 5\n", - " 8 1 3 8 3 0 8 5 1 8 9 3 8 9 2 5 3 8 3 8 8 0 8 9 9 0 4 3 5 5 3 8 5 9 5 2 8\n", - " 0 0 2 8 5 3 9 3 1 0 3 3 2 8 8 8 9 3 1 0 5 3 2 8 5 5 9 9 1 0 8 3 2 3 3 6 1\n", - " 3 2 3 3 3 5 7 8 0 
0 0 3 8 2 2 5 5 0 8 5 2 8 5 1 8 0 0 3 0 9 1 1 1 9 3 8 3\n", - " 2 3 3 5 8 5 2 2 8 0 1 0 8 8 0 9 1 3 8 3 3 8 0 1 8 9 8 8 9 2 3 3 1 3 1 8 0\n", - " 3 9 9 0 5 8 5 5 3 0 3 3 9 8 8 9 3 2 8 3 3]\n", - "------Test------\n", - "k-means++\t0.157s\t69485\t0.613\t0.660\t0.636\t0.482\t0.632\t0.134\n", - "------Test------\n", - "estimator[-1].labels_ [8 5 5 7 6 7 4 0 7 7 8 0 1 7 6 9 4 0 5 7 8 0 5 0 6 5 4 0 5 7 8 7 9 9 4 9 8\n", - " 7 5 7 5 6 0 0 0 7 5 0 8 8 5 0 0 5 0 8 0 0 4 7 7 0 7 9 6 4 4 4 6 5 0 9 8 7\n", - " 5 0 5 0 8 8 0 0 4 7 9 0 0 5 4 7 0 9 7 5 0 5 5 6 7 5 6 8 5 9 4 7 4 0 0 5 6\n", - " 6 0 5 5 0 5 9 0 7 5 6 5 5 6 7 8 5 7 5 8 0 9 9 6 9 4 0 5 7 8 0 5 9 6 9 4 5\n", - " 5 7 8 0 5 0 6 9 4 5 5 7 8 7 9 9 4 9 8 7 5 7 5 6 0 0 0 7 9 0 8 8 5 5 0 5 5\n", - " 8 0 5 4 7 7 5 7 7 6 4 4 4 6 7 0 9 8 7 9 5 5 5 8 8 0 5 4 7 5 0 5 7 0 7 7 0\n", - " 0 4 5 6 0 0 6 8 9 0 4 7 4 0 5 5 6 6 5 9 5 9 5 9 9 6 5 5 6 7 8 5 7 5 8 5 9\n", - " 7 6 7 4 2 7 0 8 0 9 7 6 5 4 0 5 0 8 5 9 7 6 7 4 2 5 7 8 7 7 7 4 7 8 7 5 7\n", - " 5 6 5 0 0 7 7 5 8 8 9 5 0 7 5 8 0 9 4 9 7 2 7 7 6 4 4 4 6 0 5 5 8 0 7 9 7\n", - " 9 8 8 5 0 4 7 5 5 0 6 4 7 5 9 0 5 5 4 5 6 9 5 6 8 7 7 4 0 4 5 5 9 6 6 0 9\n", - " 7 9 9 7 0 0 9 6 7 7 6 0 8 5 0 7 8 0 1 7 6 5 4 2 0 7 8 5 1 7 6 5 4 2 5 7 8\n", - " 0 9 7 6 5 4 2 0 7 8 7 7 5 4 7 8 7 7 7 5 6 0 2 2 7 0 5 8 8 1 9 2 7 9 8 1 7\n", - " 4 7 7 2 7 7 6 4 4 4 6 7 0 0 8 7 5 1 0 9 8 8 5 2 4 7 9 5 0 6 4 7 0 7 7 0 0\n", - " 4 7 6 9 5 6 8 9 7 4 7 5 0 2 5 6 6 2 9 7 9 0 5 0 7 0 6 5 5 6 0 8 7 7 7 8 1\n", - " 1 7 6 7 4 5 0 7 8 1 5 7 6 7 4 0 5 7 8 1 1 7 6 7 4 0 5 7 8 5 7 7 4 7 8 7 0\n", - " 7 5 6 1 0 0 7 7 1 8 8 3 1 0 4 1 8 1 1 4 7 7 0 7 7 6 4 4 4 6 7 1 7 8 7 7 1\n", - " 4 9 8 8 1 0 4 1 1 1 0 6 4 7 1 7 7 1 0 4 5 6 7 5 6 8 7 7 4 7 4 1 0 7 6 6 5\n", - " 1 0 9 1 0 0 7 7 6 0 0 6 7 8 0 7 0 8 5 9 7 6 0 4 0 7 2 8 5 5 7 6 5 4 0 5 0\n", - " 8 5 7 7 6 5 4 2 7 0 8 0 6 5 4 5 8 0 5 0 5 6 0 2 0 7 5 0 8 8 7 5 0 5 5 8 5\n", - " 5 4 7 7 0 7 7 6 4 4 4 6 0 5 5 8 0 0 5 7 5 8 8 0 0 4 7 9 5 2 6 4 7 5 7 0 5\n", - " 0 4 5 6 7 5 0 8 7 
7 4 0 4 5 0 5 6 2 0 5 5 5 5 5 0 2 9 6 7 5 0 2 8 5 0 5 8\n", - " 5 9 7 6 9 4 0 4 7 8 5 9 7 6 9 8 5 0 7 8 5 9 7 6 9 4 0 5 7 8 7 5 5 4 5 8 7\n", - " 7 7 7 6 5 5 0 7 5 5 8 8 5 9 5 0 9 8 1 9 4 7 7 0 0 0 6 4 4 4 6 7 5 5 8 7 9\n", - " 9 5 9 8 8 5 0 4 7 5 5 0 6 4 7 5 7 7 5 5 4 5 6 7 5 6 8 5 7 4 7 4 0 5 9 6 6\n", - " 0 9 0 0 5 5 5 7 9 6 0 7 6 7 8 5 7 5 1 9 7 6 7 4 0 7 7 8 1 9 7 6 7 4 5 7 7\n", - " 8 1 9 7 6 7 4 0 7 7 8 7 7 7 4 7 8 7 7 7 5 6 1 0 0 7 7 1 9 0 7 1 8 1 1 4 7\n", - " 7 0 7 7 6 4 4 4 6 7 1 7 8 7 7 1 7 1 8 8 1 0 4 7 9 1 6 4 7 1 7 7 1 0 4 7 2\n", - " 7 1 6 8 9 7 4 7 4 1 2 7 6 2 0 1 7 1 9 7 0 7 7 2 2 7 8 7 7 7 8 5 9 7 2 5 4\n", - " 0 0 7 8 5 5 7 2 5 4 0 5 7 8 5 1 7 6 7 4 0 9 7 8 7 5 5 4 5 8 7 9 7 5 2 5 0\n", - " 0 7 7 5 8 8 0 5 1 8 5 1 4 3 7 5 7 7 6 4 4 4 0 7 5 0 8 7 9 1 7 1 8 8 5 0 1\n", - " 7 1 5 0 6 4 7 5 0 7 5 0 4 5 6 7 5 6 8 5 7 4 7 4 5 0 7 6 6 5 9 5 1 1 5 0 7\n", - " 5 6 5 7 6 7 8 9 7 9 8 5 9 7 6 7 4 0 7 7 8 0 1 7 6 7 4 0 5 7 8 5 9 7 6 9 4\n", - " 0 5 7 8 7 7 9 4 9 8 7 9 7 5 6 5 0 0 0 4 5 8 8 9 5 0 5 9 8 5 9 4 7 7 0 7 7\n", - " 6 4 4 4 6 7 5 5 8 7 7 9 9 7 8 8 5 0 4 9 9 5 5 6 4 7 5 7 7 5 5 4 5 6 7 5 6\n", - " 8 9 9 4 7 4 3 0 5 6 6 0 9 3 9 9 9 0 7 9 6 5 5 6 7 8 5 7 5 8 0 1 9 6 7 4 9\n", - " 5 7 8 5 1 7 6 9 4 0 5 7 8 0 5 7 6 5 4 0 5 7 8 7 9 5 4 5 8 7 5 7 5 6 5 0 0\n", - " 7 7 5 8 8 1 1 0 5 1 8 0 1 4 7 7 0 7 7 6 4 4 4 6 7 5 7 8 7 4 9 5 7 8 8 0 0\n", - " 4 7 1 5 5 6 4 7 5 7 7 5 0 4 5 6 7 5 6 8 5 7 4 7 4 5 5 5 6 6 0 1 5 1 1 4 0\n", - " 7 7 6 5 5 6 7 8 5 8 0 9 7 6 7 4 0 7 7 8 0 9 7 6 7 4 0 7 7 8 0 9 7 6 7 4 0\n", - " 7 7 8 7 7 7 4 7 8 7 7 7 7 6 0 0 0 7 7 0 8 8 9 9 2 0 9 8 0 9 4 7 7 0 7 7 6\n", - " 4 4 4 6 7 0 7 8 7 7 9 0 9 8 8 0 0 4 7 9 0 0 6 4 7 0 7 7 0 0 4 0 6 7 0 6 8\n", - " 7 7 4 7 4 0 0 7 6 6 0 9 0 9 9 7 0 7 7 6 0 0 6 7 8 0 7 0 8 5 1 7 6 5 4 9 5\n", - " 0 8 5 1 0 6 5 4 0 8 5 1 7 6 5 4 5 5 2 6 0 5 9 4 5 8 0 0 5 5 6 5 0 9 5 9 5\n", - " 4 8 1 9 9 5 1 8 5 9 4 5 5 9 5 5 6 4 4 4 5 0 5 5 8 7 5 1 0 8 5 5 4 7 9 5 9\n", - " 5 4 7 5 5 0 5 9 4 5 6 7 5 6 8 5 7 4 7 5 5 0 5 6 6 
5 1 7 5 9 7 5 5 6 5 8 5\n", - " 0 0 8 5 9 7 6 7 4 0 7 7 8 5 5 5 6 7 4 0 9 7 8 5 5 9 6 6 4 0 5 7 8 7 5 0 4\n", - " 5 8 7 7 7 9 2 5 0 0 0 0 5 8 8 9 9 0 5 9 8 5 9 4 5 0 5 7 0 6 4 4 4 6 7 5 7\n", - " 8 7 7 9 5 9 8 8 5 6 4 0 5 5 6 6 4 7 5 7 7 5 0 4 5 6 5 5 6 8 7 7 4 7 4 5 5\n", - " 7 6 6 0 9 5 9 5 7 5 7 7 6 5 5 6 7 8 5 7 7]\n", - "------Test------\n", - "random \t0.074s\t69952\t0.545\t0.616\t0.578\t0.415\t0.574\t0.117\n", - "------Test------\n", - "estimator[-1].labels_ [1 2 2 7 0 8 0 5 2 8 1 9 6 7 0 4 0 5 2 8 1 9 2 7 9 4 0 9 2 8 1 8 4 3 0 4 1\n", - " 8 2 8 2 0 9 5 5 7 4 9 1 1 2 2 5 2 9 1 9 2 0 7 7 5 7 7 0 0 0 0 0 2 9 4 1 8\n", - " 4 2 2 9 1 1 9 5 0 7 3 2 5 4 0 7 9 7 8 2 5 2 2 0 7 2 0 1 4 3 0 8 2 9 5 4 0\n", - " 0 5 2 2 2 2 4 5 8 4 2 2 2 0 7 1 2 8 2 1 9 3 7 9 3 0 5 2 8 1 9 3 7 0 3 0 5\n", - " 2 8 1 9 2 7 0 4 0 5 2 8 1 8 4 3 0 4 1 8 2 8 2 0 9 5 5 7 3 9 1 1 3 3 5 2 3\n", - " 1 9 2 0 7 7 5 7 7 0 0 0 0 0 8 9 4 1 8 3 2 2 2 1 1 9 5 0 7 2 9 5 7 9 7 8 9\n", - " 5 0 2 0 7 9 0 1 4 5 0 8 0 9 5 4 0 0 5 3 2 3 2 3 3 9 2 2 0 8 1 2 8 2 1 2 3\n", - " 7 0 8 0 5 8 4 1 2 3 7 0 4 0 5 2 5 1 2 3 7 0 4 0 5 2 8 1 8 4 4 0 8 1 8 2 8\n", - " 2 0 2 5 5 7 4 2 1 1 3 2 5 8 3 1 9 3 0 7 7 9 7 7 0 0 0 0 0 5 2 4 1 5 4 3 8\n", - " 3 1 1 2 5 0 7 7 2 5 0 0 7 2 7 5 2 5 0 2 0 7 2 5 1 8 7 0 5 0 2 5 4 0 0 5 3\n", - " 8 3 3 8 5 9 3 0 8 8 0 9 1 2 5 7 1 9 6 7 0 4 0 9 2 8 1 2 6 7 0 4 0 9 2 8 1\n", - " 9 3 8 0 4 0 9 8 8 1 8 4 4 0 8 1 8 8 8 2 0 9 9 5 7 4 2 1 1 6 3 9 8 3 1 6 7\n", - " 0 7 8 9 8 8 0 0 0 0 0 8 9 4 1 8 4 6 2 3 1 1 2 9 0 7 3 2 5 9 0 7 9 7 8 9 4\n", - " 0 8 0 7 2 0 1 4 7 0 8 2 9 9 4 9 0 9 3 8 3 2 4 5 8 4 9 2 2 0 8 1 8 8 8 1 6\n", - " 6 8 0 4 0 5 2 8 1 6 2 8 5 8 0 5 2 8 1 6 6 8 0 4 0 5 2 8 1 2 4 8 0 8 1 8 2\n", - " 8 2 0 6 5 5 8 4 6 1 1 6 6 5 0 6 1 6 6 0 8 8 5 8 8 0 0 0 0 0 8 6 4 1 8 8 6\n", - " 2 3 1 1 6 5 0 6 6 6 5 0 0 8 6 8 8 6 5 0 2 0 8 2 0 1 8 8 0 8 0 6 5 4 0 0 5\n", - " 6 2 3 6 4 5 8 4 0 9 2 5 8 1 2 8 2 1 2 3 7 0 4 0 5 8 9 1 2 7 8 0 4 0 5 2 9\n", - " 1 2 7 7 9 4 0 9 8 9 1 9 0 4 0 4 1 9 2 9 8 9 2 9 9 8 
4 2 1 1 7 7 5 2 2 1 2\n", - " 8 0 7 7 5 7 7 9 0 0 0 0 9 2 4 1 9 4 7 8 2 1 1 2 5 0 7 3 2 9 0 0 7 2 7 9 2\n", - " 5 0 2 0 7 2 5 1 4 7 0 9 0 2 5 4 0 9 5 2 2 2 7 4 5 9 4 0 8 2 5 9 1 8 9 2 1\n", - " 2 3 7 0 4 0 5 2 8 1 2 3 7 0 4 1 5 9 8 1 2 3 7 0 4 0 5 2 8 1 8 4 4 0 4 1 8\n", - " 8 8 8 0 2 5 5 7 4 2 1 1 2 3 5 8 3 1 6 3 0 7 7 5 7 7 0 0 0 0 0 8 2 4 1 8 4\n", - " 3 2 3 1 1 2 5 0 7 2 2 5 0 0 7 2 7 8 2 5 0 2 0 7 2 0 1 4 7 0 8 0 2 5 4 0 0\n", - " 5 3 2 5 7 4 5 8 4 0 8 8 0 8 1 2 8 2 6 3 8 0 4 0 5 8 8 1 6 3 7 0 8 0 5 8 8\n", - " 1 6 3 7 9 8 0 5 7 8 1 8 8 8 0 8 1 8 8 8 2 9 6 5 5 7 8 6 3 5 8 6 1 6 6 0 7\n", - " 7 5 8 8 9 0 0 0 9 8 6 8 1 8 8 6 8 6 1 1 6 5 0 8 3 6 0 0 8 6 8 8 6 5 0 8 9\n", - " 7 6 9 1 4 7 0 8 0 6 5 8 0 9 5 6 8 6 3 8 5 8 7 9 9 8 1 8 8 8 1 2 3 7 9 4 0\n", - " 5 5 7 1 2 2 8 9 4 0 5 2 8 1 2 6 7 9 4 0 5 3 8 1 8 4 4 0 4 1 8 3 8 2 9 2 5\n", - " 5 7 4 2 1 1 5 2 6 1 2 6 0 7 7 5 7 7 9 0 0 0 5 8 2 4 1 4 4 6 7 6 1 1 2 5 6\n", - " 7 6 2 5 9 0 7 2 5 8 2 5 0 2 9 7 2 0 1 4 7 0 8 0 2 5 4 0 9 5 3 2 6 6 4 5 8\n", - " 4 0 8 7 0 8 1 3 8 3 1 2 3 7 0 4 0 5 7 8 1 9 6 7 0 4 0 5 2 8 1 2 3 7 0 3 0\n", - " 5 2 8 1 8 4 4 0 4 1 8 4 8 4 0 2 5 5 4 4 2 1 1 3 7 5 4 3 1 2 3 0 7 7 5 7 7\n", - " 0 0 0 0 0 8 2 4 1 8 4 3 4 7 1 1 2 5 0 7 7 2 5 0 0 7 2 7 8 2 5 0 2 0 7 2 0\n", - " 1 3 3 0 8 0 4 5 4 0 0 5 3 3 3 3 3 5 8 3 0 2 2 0 8 1 2 8 2 1 9 6 7 0 8 0 3\n", - " 2 8 1 2 6 7 0 4 0 5 2 8 1 9 2 7 0 4 6 5 2 8 1 8 4 4 0 4 1 8 2 8 2 0 2 5 5\n", - " 8 4 2 1 1 6 6 5 2 6 1 9 6 0 7 7 5 8 8 0 0 0 0 0 8 2 4 1 8 0 3 2 8 1 1 9 5\n", - " 0 7 6 2 5 0 6 7 2 7 8 2 5 0 2 0 8 2 0 1 4 7 0 8 0 2 5 4 0 0 5 6 2 6 6 8 5\n", - " 8 4 0 2 2 0 8 1 2 1 9 3 7 0 4 0 5 8 8 1 9 3 7 0 8 0 5 8 8 1 9 3 7 0 8 0 5\n", - " 8 8 1 8 8 4 0 8 1 8 8 8 8 0 9 5 5 7 8 9 1 1 3 3 9 9 3 1 9 3 0 7 7 5 7 7 0\n", - " 0 0 0 0 8 9 8 1 7 4 3 8 3 1 1 9 5 0 7 3 9 5 0 0 7 9 7 8 9 5 0 8 0 8 9 0 1\n", - " 8 7 0 8 0 9 5 8 0 0 5 3 8 7 3 4 5 8 4 0 9 2 0 8 1 9 8 2 1 2 6 7 0 4 0 3 2\n", - " 9 1 2 6 7 0 4 0 9 1 2 6 7 0 4 0 5 4 9 0 9 4 3 0 4 1 9 2 4 2 0 2 5 3 7 3 2\n", - " 0 
1 6 3 3 2 6 1 2 3 0 2 5 3 5 2 5 0 0 0 2 9 2 4 0 8 2 6 2 1 2 5 0 7 3 2 3\n", - " 2 0 7 2 7 5 2 3 0 2 0 7 2 0 1 4 7 0 8 0 2 5 4 0 5 5 6 8 4 3 7 4 4 0 4 1 2\n", - " 5 5 1 2 3 7 0 4 0 5 7 8 1 2 2 7 0 4 0 5 3 8 1 2 3 3 0 0 0 5 2 8 1 8 4 4 0\n", - " 4 1 8 8 8 3 9 2 5 5 5 4 2 1 1 3 3 5 2 3 1 2 3 0 2 2 5 8 2 9 0 0 0 0 8 2 4\n", - " 1 8 0 3 2 3 1 1 2 5 0 7 2 2 5 0 0 7 2 7 8 2 5 0 2 0 4 2 0 1 8 7 0 8 0 2 5\n", - " 8 0 0 5 3 2 3 3 8 5 8 8 0 2 2 0 8 1 2 8 8]\n", - "------Test------\n", - "PCA-based\t0.030s\t72686\t0.636\t0.658\t0.647\t0.521\t0.643\t0.138\n", - "__________________________________________________________________________________\n" - ] - } - ], + "outputs": [], "source": [ "from sklearn.cluster import KMeans\n", "from sklearn.decomposition import PCA\n", @@ -452,20 +218,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", From ba0df386a4e25b5e957283fd796fac316cddd107 Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Sat, 20 Feb 2021 22:33:20 -0800 Subject: [PATCH 03/16] v-measure for single user on above cutoff bins --- .../v-measurel_cutoff_bins_single_user.ipynb | 271 ++++++++++++++++++ 1 file changed, 271 insertions(+) create mode 100644 tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb diff --git a/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb b/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb new file mode 100644 index 0000000..d4b5552 --- /dev/null +++ b/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb @@ -0,0 +1,271 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "introductory-sunglasses", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "# Our imports\n", + "import emission.core.get_database as edb\n", + "import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline\n", + "import emission.analysis.modelling.tour_model.similarity as similarity\n", + "import emission.analysis.modelling.tour_model.featurization as featurization\n", + "import emission.analysis.modelling.tour_model.representatives as representatives\n", + "import emission.storage.decorations.analysis_timeseries_queries as esda\n", + "import pandas as pd\n", + "from numpy import *\n", + "# import confirmed_trips_eval_bins_clusters as evaluation\n", + "from sklearn import metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "under-sample", + "metadata": {}, + "outputs": [], + "source": [ + "logger = logging.getLogger()\n", + "logger.setLevel(logging.DEBUG)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "genetic-drinking", + "metadata": {}, + "outputs": [], + "source": [ + "participant_uuid_obj = 
list(edb.get_profile_db().find({\"install_group\": \"participant\"}, {\"user_id\": 1, \"_id\": 0}))\n", + "all_users = [u[\"user_id\"] for u in participant_uuid_obj]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "rubber-montgomery", + "metadata": {}, + "outputs": [], + "source": [ + "radius = 300" + ] + }, + { + "cell_type": "markdown", + "id": "thorough-nomination", + "metadata": {}, + "source": [ + "## Choose one user for experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "revolutionary-sussex", + "metadata": {}, + "outputs": [], + "source": [ + "user = all_users[4]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "crude-compatibility", + "metadata": {}, + "outputs": [], + "source": [ + "#read the data from the database. We choose key='confirmed_trip' to get confirmed trips here\n", + "trips = pipeline.read_data(uuid=user,key=esda.CONFIRMED_TRIP_KEY)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "right-graham", + "metadata": {}, + "outputs": [], + "source": [ + "# select trips that have user_input to analyze\n", + "non_empty_trips = [t for t in trips if t[\"data\"][\"user_input\"] != {}]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "infrared-accordance", + "metadata": {}, + "outputs": [], + "source": [ + "len(non_empty_trips), non_empty_trips" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "generic-glasgow", + "metadata": {}, + "outputs": [], + "source": [ + "bin_trips, bins = pipeline.remove_noise(non_empty_trips, radius)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "thick-commissioner", + "metadata": {}, + "outputs": [], + "source": [ + "logging.debug('The list of bins is %s' % bins)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "macro-round", + "metadata": {}, + "outputs": [], + "source": [ + "# show all user labels in all bins\n", + "for bin in 
bins:\n", + " bin_user_input = (non_empty_trips[i].data[\"user_input\"] for i in bin)\n", + " bin_df = pd.DataFrame(data = bin_user_input)\n", + " print(bin_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "tired-athens", + "metadata": {}, + "outputs": [], + "source": [ + "# turn all user_input into list without binning\n", + "bin_trips_user_input_ls = pd.DataFrame(data=[bin_trips[i][\"data\"][\"user_input\"] for i in range(len(bin_trips))]).values.tolist()\n", + "bin_trips_user_input_ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "veterinary-courage", + "metadata": {}, + "outputs": [], + "source": [ + "for i in bin_trips_user_input_ls:\n", + " if i[2] == \"same_mode\":\n", + " print(i)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "coastal-fiction", + "metadata": {}, + "outputs": [], + "source": [ + "# drop duplicate user_input\n", + "no_dup_df=pd.DataFrame(data=[bin_trips[i][\"data\"][\"user_input\"] for i in range(len(bin_trips))]).drop_duplicates()\n", + "no_dup_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "stainless-ancient", + "metadata": {}, + "outputs": [], + "source": [ + "# turn non-duplicate user_input into list\n", + "no_dup_list = no_dup_df.values.tolist()\n", + "no_dup_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "narrative-reducing", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_true based on user_input\n", + "labels_true =[]\n", + "for trip in bin_trips_user_input_ls:\n", + " if trip in no_dup_list:\n", + " labels_true.append(no_dup_list.index(trip))\n", + "labels_true" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "framed-sector", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_pred based on bins\n", + "labels_pred = []\n", + "for i in range(len(bins)):\n", + " for trip in bins[i]:\n", + " labels_pred.append(i)\n", + "labels_pred" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "id": "unlimited-waste", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.homogeneity_score(labels_true, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "grateful-finance", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.completeness_score(labels_true, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "removable-rating", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.v_measure_score(labels_true, labels_pred)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 4a3c8a551b95d2da4549b4fcebb6244cea901e0e Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Mon, 22 Feb 2021 09:18:03 -0800 Subject: [PATCH 04/16] modified v-measurel_cutoff_bins_single_user, add one file for clusters for a single user --- .../v-measurel_cutoff_bins_single_user.ipynb | 401 +++++++++++-- ...measurel_cutoff_clusters_single_user.ipynb | 564 ++++++++++++++++++ 2 files changed, 930 insertions(+), 35 deletions(-) create mode 100644 tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb diff --git a/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb b/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb index d4b5552..6a55dd8 100644 --- a/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb +++ b/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "introductory-sunglasses", + "id": "severe-married", "metadata": {}, "outputs": [], "source": [ @@ -18,14 +18,15 @@ "import 
emission.storage.decorations.analysis_timeseries_queries as esda\n", "import pandas as pd\n", "from numpy import *\n", - "# import confirmed_trips_eval_bins_clusters as evaluation\n", - "from sklearn import metrics" + "import confirmed_trips_eval_bins_clusters as evaluation\n", + "from sklearn import metrics\n", + "from pandas.testing import assert_frame_equal" ] }, { "cell_type": "code", "execution_count": null, - "id": "under-sample", + "id": "knowing-price", "metadata": {}, "outputs": [], "source": [ @@ -36,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "genetic-drinking", + "id": "graphic-determination", "metadata": {}, "outputs": [], "source": [ @@ -47,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "rubber-montgomery", + "id": "breathing-description", "metadata": {}, "outputs": [], "source": [ @@ -65,28 +66,28 @@ { "cell_type": "code", "execution_count": null, - "id": "revolutionary-sussex", + "id": "gorgeous-retailer", "metadata": {}, "outputs": [], "source": [ - "user = all_users[4]" + "user = all_users[0]" ] }, { "cell_type": "code", "execution_count": null, - "id": "crude-compatibility", + "id": "homeless-father", "metadata": {}, "outputs": [], "source": [ - "#read the data from the database. We choose key='confirmed_trip' to get confirmed trips here\n", + "#read the data from the database. 
We choose key=esda.CONFIRMED_TRIP_KEY to get confirmed trips here\n", "trips = pipeline.read_data(uuid=user,key=esda.CONFIRMED_TRIP_KEY)" ] }, { "cell_type": "code", "execution_count": null, - "id": "right-graham", + "id": "arranged-cleaning", "metadata": {}, "outputs": [], "source": [ @@ -97,7 +98,7 @@ { "cell_type": "code", "execution_count": null, - "id": "infrared-accordance", + "id": "super-spencer", "metadata": {}, "outputs": [], "source": [ @@ -107,7 +108,7 @@ { "cell_type": "code", "execution_count": null, - "id": "generic-glasgow", + "id": "higher-advocate", "metadata": {}, "outputs": [], "source": [ @@ -117,7 +118,7 @@ { "cell_type": "code", "execution_count": null, - "id": "thick-commissioner", + "id": "beautiful-parcel", "metadata": {}, "outputs": [], "source": [ @@ -127,8 +128,10 @@ { "cell_type": "code", "execution_count": null, - "id": "macro-round", - "metadata": {}, + "id": "comic-norfolk", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# show all user labels in all bins\n", @@ -138,47 +141,58 @@ " print(bin_df)" ] }, + { + "cell_type": "markdown", + "id": "expensive-forestry", + "metadata": {}, + "source": [ + "### Original output" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "tired-athens", + "id": "corporate-situation", "metadata": {}, "outputs": [], "source": [ - "# turn all user_input into list without binning\n", - "bin_trips_user_input_ls = pd.DataFrame(data=[bin_trips[i][\"data\"][\"user_input\"] for i in range(len(bin_trips))]).values.tolist()\n", - "bin_trips_user_input_ls" + "bin_trips_df = pd.DataFrame(data=[i[\"data\"][\"user_input\"] for i in bin_trips])\n", + "bin_trips_df" ] }, { "cell_type": "code", "execution_count": null, - "id": "veterinary-courage", + "id": "historic-russian", "metadata": {}, "outputs": [], "source": [ - "for i in bin_trips_user_input_ls:\n", - " if i[2] == \"same_mode\":\n", - " print(i)" + "# turn all user_input into list without binning\n", + "bin_trips_user_input_ls 
= bin_trips_df.values.tolist()\n", + "bin_trips_user_input_ls" ] }, { "cell_type": "code", "execution_count": null, - "id": "coastal-fiction", - "metadata": {}, + "id": "jewish-bristol", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# drop duplicate user_input\n", - "no_dup_df=pd.DataFrame(data=[bin_trips[i][\"data\"][\"user_input\"] for i in range(len(bin_trips))]).drop_duplicates()\n", + "no_dup_df=bin_trips_df.drop_duplicates()\n", "no_dup_df" ] }, { "cell_type": "code", "execution_count": null, - "id": "stainless-ancient", - "metadata": {}, + "id": "wooden-postage", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# turn non-duplicate user_input into list\n", @@ -189,7 +203,7 @@ { "cell_type": "code", "execution_count": null, - "id": "narrative-reducing", + "id": "wicked-serial", "metadata": {}, "outputs": [], "source": [ @@ -204,7 +218,7 @@ { "cell_type": "code", "execution_count": null, - "id": "framed-sector", + "id": "surgical-stadium", "metadata": {}, "outputs": [], "source": [ @@ -216,10 +230,55 @@ "labels_pred" ] }, + { + "cell_type": "markdown", + "id": "first-campbell", + "metadata": {}, + "source": [ + "Note: the trips order in labels_true and labels_pred should be the same. 
Using timestamp to compare the trips in bin_trips and those in bins" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "unlimited-waste", + "id": "dental-universal", + "metadata": {}, + "outputs": [], + "source": [ + "bin_trips_ts = pd.DataFrame(data=[i[\"data\"][\"start_ts\"]for i in bin_trips])\n", + "len(bin_trips_ts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dynamic-prize", + "metadata": {}, + "outputs": [], + "source": [ + "bin_ls =[]\n", + "for bin in bins:\n", + " for index in bin:\n", + " bin_ls.append(index)\n", + "bins_ts = pd.DataFrame(data=[non_empty_trips[i][\"data\"][\"start_ts\"]for i in bin_ls])\n", + "len(bins_ts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "republican-rabbit", + "metadata": {}, + "outputs": [], + "source": [ + "# compare two data frames, return nothing if two data frames are the same\n", + "assert_frame_equal(bins_ts,bin_trips_ts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "concerned-congo", "metadata": {}, "outputs": [], "source": [ @@ -229,7 +288,7 @@ { "cell_type": "code", "execution_count": null, - "id": "grateful-finance", + "id": "alleged-alabama", "metadata": {}, "outputs": [], "source": [ @@ -239,12 +298,284 @@ { "cell_type": "code", "execution_count": null, - "id": "removable-rating", - "metadata": {}, + "id": "material-genetics", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "metrics.v_measure_score(labels_true, labels_pred)" ] + }, + { + "cell_type": "markdown", + "id": "aware-friday", + "metadata": {}, + "source": [ + "### After changing language" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "native-sight", + "metadata": {}, + "outputs": [], + "source": [ + "span_eng_dict = {'revisado_bike':'test ride with bike','placas_de carro':'car plates','aseguranza':'insurance',\n", + " 'iglesia':'church','curso':'course','mi_hija recién aliviada':'my daughter just had a new 
baby',\n", + " 'servicio_comunitario':'community service','pago_de aseguranza':'insurance payment',\n", + " 'grupo_comunitario':'community group','caminata_comunitaria':'community walk'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "threaded-focus", + "metadata": {}, + "outputs": [], + "source": [ + "# use dict to replace the values in Spanish in the bin(this step just for showing the trips in each bin)\n", + "for bin in bins:\n", + " bin_user_input = (non_empty_trips[i].data[\"user_input\"] for i in bin)\n", + " bin_df = pd.DataFrame(data = bin_user_input)\n", + " sp2en_bin_df = bin_df.replace(span_eng_dict)\n", + " print(sp2en_bin_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "imported-reasoning", + "metadata": {}, + "outputs": [], + "source": [ + "# turn all user_input into list without binning\n", + "bin_trips_user_input_sp2en_ls = pd.DataFrame(data=[i[\"data\"][\"user_input\"] for i in bin_trips]).replace(span_eng_dict).values.tolist()\n", + "bin_trips_user_input_sp2en_ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "athletic-reasoning", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# drop duplicate user_input\n", + "no_dup_sp2en_df=pd.DataFrame(data=[i[\"data\"][\"user_input\"] for i in bin_trips]).replace(span_eng_dict).drop_duplicates()\n", + "no_dup_sp2en_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "false-fields", + "metadata": {}, + "outputs": [], + "source": [ + "# turn non-duplicate user_input into list\n", + "no_dup_sp2en_list = no_dup_sp2en_df.values.tolist()\n", + "no_dup_sp2en_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "formal-belgium", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_true based on user_input\n", + "labels_true_sp2en =[]\n", + "for trip in bin_trips_user_input_sp2en_ls:\n", + " if trip in no_dup_sp2en_list:\n", + " 
labels_true_sp2en.append(no_dup_sp2en_list.index(trip))\n", + "labels_true_sp2en" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "indirect-lafayette", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_pred based on bins\n", + "labels_pred = []\n", + "for i in range(len(bins)):\n", + " for trip in bins[i]:\n", + " labels_pred.append(i)\n", + "labels_pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cooked-mineral", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.homogeneity_score(labels_true_sp2en, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "caring-calcium", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.completeness_score(labels_true_sp2en, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sunset-frequency", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.v_measure_score(labels_true_sp2en, labels_pred)" + ] + }, + { + "cell_type": "markdown", + "id": "included-alberta", + "metadata": {}, + "source": [ + "### After converting purposes and mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "outer-hammer", + "metadata": {}, + "outputs": [], + "source": [ + "map_pur_dict = {'course':'school','work_- lunch break':'lunch_break','on_the way home':'home',\n", + " 'insurance_payment':'insurance'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "developing-socket", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_rows', 200)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "partial-prerequisite", + "metadata": {}, + "outputs": [], + "source": [ + "# convert purpose\n", + "bin_trips_user_input_sp2en = pd.DataFrame(data=[bin_trips[i][\"data\"][\"user_input\"] for i in range(len(bin_trips))]).replace(span_eng_dict)\n", + "bin_trips_cvt_pur_df = 
bin_trips_user_input_sp2en.replace(map_pur_dict)\n", + "# convert mode\n", + "bin_trips_cvt_pur_mo_df = bin_trips_cvt_pur_df\n", + "for i in range(len(bin_trips_cvt_pur_mo_df)):\n", + " if bin_trips_cvt_pur_mo_df.iloc[i][\"replaced_mode\"] == \"same_mode\":\n", + " print(bin_trips_cvt_pur_mo_df.iloc[i]) # to see which row will be converted\n", + " bin_trips_cvt_pur_mo_df.iloc[i][\"replaced_mode\"] = bin_trips_cvt_pur_mo_df.iloc[i]['mode_confirm']\n", + "print(bin_trips_cvt_pur_mo_df)\n", + "bin_trips_cvt_pur_mode_ls = bin_trips_cvt_pur_mo_df.values.tolist()\n", + "bin_trips_cvt_pur_mode_ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "classical-berkeley", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# drop duplicate user_input\n", + "no_dup_cvt_pur_mode_df = bin_trips_cvt_pur_mo_df.drop_duplicates()\n", + "no_dup_cvt_pur_mode_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "opening-equity", + "metadata": {}, + "outputs": [], + "source": [ + "# turn non-duplicate user_input into list\n", + "no_dup_cvt_pur_mo_ls = no_dup_cvt_pur_mode_df.values.tolist()\n", + "no_dup_cvt_pur_mo_ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "occasional-evanescence", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_true based on user_input\n", + "labels_true_cvt =[]\n", + "for trip in bin_trips_cvt_pur_mode_ls:\n", + " if trip in no_dup_cvt_pur_mo_ls:\n", + " labels_true_cvt.append(no_dup_cvt_pur_mo_ls.index(trip))\n", + "labels_true_cvt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fancy-barbados", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_pred based on bins\n", + "labels_pred = []\n", + "for i in range(len(bins)):\n", + " for trip in bins[i]:\n", + " labels_pred.append(i)\n", + "labels_pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "placed-carry", + "metadata": {}, + 
"outputs": [], + "source": [ + "metrics.homogeneity_score(labels_true_cvt, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "worse-shift", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.completeness_score(labels_true_cvt, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "immediate-series", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.v_measure_score(labels_true_cvt, labels_pred)" + ] } ], "metadata": { diff --git a/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb b/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb new file mode 100644 index 0000000..c99060e --- /dev/null +++ b/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb @@ -0,0 +1,564 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "worst-baseball", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "# Our imports\n", + "import emission.core.get_database as edb\n", + "import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline\n", + "import emission.analysis.modelling.tour_model.similarity as similarity\n", + "import emission.analysis.modelling.tour_model.featurization as featurization\n", + "import emission.analysis.modelling.tour_model.representatives as representatives\n", + "import emission.storage.decorations.analysis_timeseries_queries as esda\n", + "import pandas as pd\n", + "from numpy import *\n", + "import confirmed_trips_eval_bins_clusters as evaluation\n", + "from sklearn import metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "behavioral-generation", + "metadata": {}, + "outputs": [], + "source": [ + "logger = logging.getLogger()\n", + "logger.setLevel(logging.DEBUG)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "powered-import", + "metadata": {}, + "outputs": [], + "source": [ + "participant_uuid_obj = 
list(edb.get_profile_db().find({\"install_group\": \"participant\"}, {\"user_id\": 1, \"_id\": 0}))\n", + "all_users = [u[\"user_id\"] for u in participant_uuid_obj]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "living-connection", + "metadata": {}, + "outputs": [], + "source": [ + "radius = 300" + ] + }, + { + "cell_type": "markdown", + "id": "thorough-nomination", + "metadata": {}, + "source": [ + "## Choose one user for experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "necessary-letter", + "metadata": {}, + "outputs": [], + "source": [ + "user = all_users[10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "exclusive-think", + "metadata": {}, + "outputs": [], + "source": [ + "#read the data from the database. We choose key=esda.CONFIRMED_TRIP_KEY to get confirmed trips here\n", + "trips = pipeline.read_data(uuid=user,key=esda.CONFIRMED_TRIP_KEY)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "funded-canadian", + "metadata": {}, + "outputs": [], + "source": [ + "# select trips that have user_input to analyze\n", + "non_empty_trips = [t for t in trips if t[\"data\"][\"user_input\"] != {}]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "iraqi-account", + "metadata": {}, + "outputs": [], + "source": [ + "len(non_empty_trips), non_empty_trips" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "owned-advocate", + "metadata": {}, + "outputs": [], + "source": [ + "bin_trips, bins = pipeline.remove_noise(non_empty_trips, radius)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "healthy-wildlife", + "metadata": {}, + "outputs": [], + "source": [ + "logging.debug('The list of bins is %s' % bins)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "promotional-batch", + "metadata": {}, + "outputs": [], + "source": [ + "# clusters,labels,cluster_trips, points = 
pipeline.cluster(bin_trips, len(bins))\n", + "# clustering the data only based on sil score (min_cluster = 0) instead of bins number (len(bins))\n", + "feat = featurization.featurization(bin_trips)\n", + "min = 0\n", + "max = int(math.ceil(1.5 * len(bins)))\n", + "feat.cluster(min_clusters=min, max_clusters=max)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "attempted-afghanistan", + "metadata": {}, + "outputs": [], + "source": [ + "logging.debug('number of clusters: %d' % feat.clusters)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fatty-manner", + "metadata": {}, + "outputs": [], + "source": [ + "logging.debug('labels list is: %s' % feat.labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "internal-mobility", + "metadata": {}, + "outputs": [], + "source": [ + "cluster_trips = feat.data\n", + "cluster_trips" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "alone-rochester", + "metadata": {}, + "outputs": [], + "source": [ + "cluster_user_input_df = pd.DataFrame(data=[i[\"data\"][\"user_input\"] for i in cluster_trips])\n", + "cluster_user_input_df" + ] + }, + { + "cell_type": "markdown", + "id": "sixth-living", + "metadata": {}, + "source": [ + "### Original output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "amber-westminster", + "metadata": {}, + "outputs": [], + "source": [ + "# turn cluster_trips to list without any changes\n", + "cluster_user_input_ls = cluster_user_input_df.values.tolist()\n", + "cluster_user_input_ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "informative-donor", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# drop duplicate user_input\n", + "no_dup_df=cluster_user_input_df.drop_duplicates()\n", + "no_dup_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "computational-punishment", + "metadata": { + "scrolled": true + }, + 
"outputs": [], + "source": [ + "# turn non-duplicate user_input into list\n", + "no_dup_list = no_dup_df.values.tolist()\n", + "no_dup_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "electric-contamination", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_true based on user_input\n", + "labels_true =[]\n", + "for trip in cluster_user_input_ls:\n", + " if trip in no_dup_list:\n", + " labels_true.append(no_dup_list.index(trip))\n", + "labels_true" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hindu-scholarship", + "metadata": {}, + "outputs": [], + "source": [ + "labels_pred = feat.labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ecological-wales", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.homogeneity_score(labels_true, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "metric-budget", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.completeness_score(labels_true, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "responsible-newport", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "metrics.v_measure_score(labels_true, labels_pred)" + ] + }, + { + "cell_type": "markdown", + "id": "level-semiconductor", + "metadata": {}, + "source": [ + "### After changing language" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "distinct-airport", + "metadata": {}, + "outputs": [], + "source": [ + "span_eng_dict = {'revisado_bike':'test ride with bike','placas_de carro':'car plates','aseguranza':'insurance',\n", + " 'iglesia':'church','curso':'course','mi_hija recién aliviada':'my daughter just had a new baby',\n", + " 'servicio_comunitario':'community service','pago_de aseguranza':'insurance payment',\n", + " 'grupo_comunitario':'community group','caminata_comunitaria':'community walk'}" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "enabling-equity", + "metadata": {}, + "outputs": [], + "source": [ + "# change language and turn data frame to list\n", + "cluster_sp2en_ls = cluster_user_input_df.replace(span_eng_dict).values.tolist()\n", + "cluster_sp2en_ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "preliminary-spider", + "metadata": {}, + "outputs": [], + "source": [ + "# drop duplicate user_input\n", + "no_dup_sp2en_df=cluster_user_input_df.replace(span_eng_dict).drop_duplicates()\n", + "no_dup_sp2en_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "saved-consensus", + "metadata": {}, + "outputs": [], + "source": [ + "# turn non-duplicate user_input into list\n", + "no_dup_sp2en_list = no_dup_sp2en_df.values.tolist()\n", + "no_dup_sp2en_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "available-coaching", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_true based on user_input\n", + "labels_true_sp2en =[]\n", + "for trip in cluster_sp2en_ls:\n", + " if trip in no_dup_sp2en_list:\n", + " labels_true_sp2en.append(no_dup_sp2en_list.index(trip))\n", + "labels_true_sp2en" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "average-gauge", + "metadata": {}, + "outputs": [], + "source": [ + "labels_pred = labels_pred = feat.labels\n", + "labels_pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fiscal-camping", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.homogeneity_score(labels_true_sp2en, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "color-repeat", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.completeness_score(labels_true_sp2en, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "whole-wrong", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.v_measure_score(labels_true_sp2en, labels_pred)" + ] + }, + 
{ + "cell_type": "markdown", + "id": "dangerous-creek", + "metadata": {}, + "source": [ + "### After converting purposes and mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "certain-italy", + "metadata": {}, + "outputs": [], + "source": [ + "map_pur_dict = {'course':'school','work_- lunch break':'lunch_break','on_the way home':'home',\n", + " 'insurance_payment':'insurance'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "swiss-initial", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_rows', 200)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "naughty-vegetarian", + "metadata": {}, + "outputs": [], + "source": [ + "# convert purpose\n", + "cluster_sp2en_df = cluster_user_input_df.replace(span_eng_dict)\n", + "cluster_cvt_pur_df = cluster_sp2en_df.replace(map_pur_dict)\n", + "# convert mode\n", + "cluster_cvt_pur_mo_df = cluster_cvt_pur_df\n", + "for i in range(len(cluster_cvt_pur_mo_df)):\n", + " if cluster_cvt_pur_mo_df.iloc[i][\"replaced_mode\"] == \"same_mode\":\n", + " print(cluster_cvt_pur_mo_df.iloc[i]) # to see which row will be converted\n", + " cluster_cvt_pur_mo_df.iloc[i][\"replaced_mode\"] = cluster_cvt_pur_mo_df.iloc[i]['mode_confirm']\n", + "print(cluster_cvt_pur_mo_df)\n", + "cluster_cvt_pur_mo_ls = cluster_cvt_pur_mo_df.values.tolist()\n", + "cluster_cvt_pur_mo_ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "approximate-lightweight", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# drop duplicate user_input\n", + "no_dup_cvt_pur_mode_df = cluster_cvt_pur_mo_df.drop_duplicates()\n", + "no_dup_cvt_pur_mode_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "active-westminster", + "metadata": {}, + "outputs": [], + "source": [ + "# turn non-duplicate user_input into list\n", + "no_dup_cvt_pur_mo_ls = no_dup_cvt_pur_mode_df.values.tolist()\n", + "no_dup_cvt_pur_mo_ls" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "documented-arlington", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_true based on user_input\n", + "labels_true_cvt =[]\n", + "for trip in cluster_cvt_pur_mo_ls:\n", + " if trip in no_dup_cvt_pur_mo_ls:\n", + " labels_true_cvt.append(no_dup_cvt_pur_mo_ls.index(trip))\n", + "labels_true_cvt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "noted-worst", + "metadata": {}, + "outputs": [], + "source": [ + "labels_pred = labels_pred = feat.labels\n", + "labels_pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "thousand-palestine", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.homogeneity_score(labels_true_cvt, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "demanding-immigration", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.completeness_score(labels_true_cvt, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "racial-theory", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.v_measure_score(labels_true_cvt, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "precise-woman", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 0951484174399ca7c31988a9f667461332f33e45 Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Mon, 22 Feb 2021 23:50:18 -0800 Subject: [PATCH 05/16] modified evaluation code, put all bins and bins above cutoff evaluation in one notebook, 
evaluated clusters above cutoff, showed process of single user on all bins, bins above cutoff, and clusters above cutoff --- .../confirmed_trips_eval_bins_clusters.py | 205 ++++++ .../v-measurel_all_bins_single_user.ipynb | 630 ++++++++++++++++++ .../v-measurel_bins_all_user.ipynb | 314 +++++++++ ...urel_clusters_above_cutoff_all_users.ipynb | 195 ++++++ ...measurel_cutoff_clusters_single_user.ipynb | 85 ++- 5 files changed, 1410 insertions(+), 19 deletions(-) create mode 100644 tour_model_eval/v-measurel_all_bins_single_user.ipynb create mode 100644 tour_model_eval/v-measurel_bins_all_user.ipynb create mode 100644 tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb diff --git a/tour_model_eval/confirmed_trips_eval_bins_clusters.py b/tour_model_eval/confirmed_trips_eval_bins_clusters.py index 9f58337..e57e4a7 100644 --- a/tour_model_eval/confirmed_trips_eval_bins_clusters.py +++ b/tour_model_eval/confirmed_trips_eval_bins_clusters.py @@ -9,6 +9,8 @@ import emission.storage.decorations.analysis_timeseries_queries as esda import pandas as pd from numpy import * +from sklearn import metrics +from pandas.testing import assert_frame_equal # Spanish words to English span_eng_dict = {'revisado_bike':'test ride with bike','placas_de carro':'car plates','aseguranza':'insurance', @@ -71,3 +73,206 @@ def precision_bin_all_users(all_users,radius,sp2en=None,cvt_purpose=None): all_bins_preci = precision_bins(all_bins_preci, sim.bins, non_empty_trips, sp2en, cvt_purpose) all_users_preci.append(round(mean(all_bins_preci), 2)) return all_users_preci + + +# v_measure_bins takes 5 parameters +# - sp2en=True: change Spanish to English +# - cvt_pur_mo=True: convert purposes and replaced mode +# - cutoff=True: choose to analyze bins above cutoff +# - cutoff=None: analyze all bins +# Note: for sp2en and cvt_pur_mo, set either one to be True as needed. 
cvt_pur_mo will change language first +def v_measure_bins(all_users,radius,sp2en=None,cvt_pur_mo=None,cutoff=None): + homo_score = [] + comp_score = [] + v_score = [] + for i in range(len(all_users)): + user = all_users[i] + trips = pipeline.read_data(uuid=user, key=esda.CONFIRMED_TRIP_KEY) + non_empty_trips = [t for t in trips if t["data"]["user_input"] != {}] + valid_trips = [t for t in non_empty_trips if 'mode_confirm' in t["data"]["user_input"] and + 'purpose_confirm' in t["data"]["user_input"] and 'replaced_mode' in t["data"]["user_input"]] + sim = similarity.similarity(valid_trips, radius) + filter_trips = sim.data + + # filter out users that haven't enough trips (at least 10) to analyze + if len(filter_trips) < 10: + homo_score.append(NaN) + comp_score.append(NaN) + v_score.append(NaN) + continue + sim.bin_data() + if cutoff is None: + trip_index_ls = [] + bins = sim.bins + for bin in bins: + for index in bin: + trip_index_ls.append(index) + bin_trips = [filter_trips[num] for num in trip_index_ls] + + elif cutoff: + sim.delete_bins() + bin_trips = sim.newdata + bins = sim.bins + + if len(bin_trips) < 10: + homo_score.append(NaN) + comp_score.append(NaN) + v_score.append(NaN) + continue + bin_trips_df = pd.DataFrame(data=[trip["data"]["user_input"] for trip in bin_trips]) + + if sp2en: + bin_trips_df = bin_trips_df.replace(span_eng_dict) + elif cvt_pur_mo: + bin_trips_df = bin_trips_df.replace(span_eng_dict) + bin_trips_df = bin_trips_df.replace(map_pur_dict) + for a in range(len(bin_trips_df)): + if bin_trips_df.iloc[a]["replaced_mode"] == "same_mode": + # to see which row will be converted + logging.debug("The following rows will be changed: %s", bin_trips_df.iloc[a]) + bin_trips_df.iloc[a]["replaced_mode"] = bin_trips_df.iloc[a]['mode_confirm'] + + # turn all user_input into list without binning + bin_trips_user_input_ls = bin_trips_df.values.tolist() + # drop duplicate user_input + no_dup_df = bin_trips_df.drop_duplicates() + # turn non-duplicate 
user_input into list + no_dup_list = no_dup_df.values.tolist() + + # collect labels_true based on user_input + labels_true = [] + for trip in bin_trips_user_input_ls: + if trip in no_dup_list: + labels_true.append(no_dup_list.index(trip)) + + # collect labels_pred based on bins + labels_pred = [] + for b in range(len(bins)): + for trip in bins[b]: + labels_pred.append(b) + + # compare the trips order in bins and those in valid_trips using timestamp + bin_trips_ts = pd.DataFrame(data=[trip["data"]["start_ts"] for trip in bin_trips]) + bin_ls = [] + for bin in bins: + for index in bin: + bin_ls.append(index) + bins_ts = pd.DataFrame(data=[filter_trips[i]["data"]["start_ts"] for i in bin_ls]) + # compare two data frames, return nothing if two data frames are the same + assert_frame_equal(bins_ts, bin_trips_ts) + homo = metrics.homogeneity_score(labels_true, labels_pred) + homo_score.append(float('%.3f' % homo)) + comp = metrics.completeness_score(labels_true, labels_pred) + comp_score.append(float('%.3f' % comp)) + v = metrics.v_measure_score(labels_true, labels_pred) + v_score.append(float('%.3f' % v)) + + return homo_score, comp_score, v_score + + +# - sp2en=True: change Spanish to English +# - cvt_pur_mo=True: convert purposes and replaced mode +# - cutoff=True: choose to analyze bins above cutoff +# - cutoff=None: analyze all bins +# Note: for sp2en and cvt_pur_mo, set either one to be True as needed. 
def _round3(value):
    """Return value rounded to three decimals via '%.3f' formatting."""
    return float('%.3f' % value)


def _normalize_user_input(user_input_df, sp2en=None, cvt_pur_mo=None):
    """Return the user_input frame with the requested label clean-up applied.

    - sp2en: translate labels with span_eng_dict.
    - cvt_pur_mo: translate first, then map purposes with map_pur_dict and
      replace a "same_mode" replaced_mode by the trip's mode_confirm.
    sp2en takes precedence when both flags are truthy; with neither flag the
    frame is returned unchanged.
    """
    if sp2en:
        return user_input_df.replace(span_eng_dict)
    if cvt_pur_mo:
        converted = user_input_df.replace(span_eng_dict).replace(map_pur_dict)
        # One boolean-mask assignment on the frame itself. Writing through
        # `df.iloc[row][col] = ...` assigns to a temporary row object and is
        # not guaranteed to reach the frame.
        same_mode = converted["replaced_mode"] == "same_mode"
        converted.loc[same_mode, "replaced_mode"] = converted.loc[same_mode, "mode_confirm"]
        return converted
    return user_input_df


def _labels_from_rows(rows):
    """Label each row with the rank of its first occurrence among distinct rows.

    Every row receives a label, so the result always has len(rows) entries.
    """
    first_seen = {}
    labels = []
    for row in rows:
        key = tuple(row)
        if key not in first_seen:
            first_seen[key] = len(first_seen)
        labels.append(first_seen[key])
    return labels


def v_measure_clusters(all_users, radius, sp2en=None, cvt_pur_mo=None):
    """Score the clusters of every user against the user-confirmed labels.

    Parameters:
    - all_users: iterable of user ids passed to pipeline.read_data.
    - radius: radius handed to similarity.similarity.
    - sp2en: truthy to translate Spanish labels to English before scoring.
    - cvt_pur_mo: truthy to translate, then convert purposes and replaced
      mode before scoring. Set either sp2en or cvt_pur_mo as needed.

    Returns three parallel lists (homogeneity, completeness, v-measure) with
    one entry per user, each rounded to three decimals. A user with fewer
    than 10 trips before or after binning gets NaN in all three lists.

    Raises AssertionError (from assert_frame_equal) if the coordinates of the
    clustered trips do not line up with feat.points, because labels_true and
    labels_pred would then describe different trip orders.
    """
    homo_score = []
    comp_score = []
    v_score = []
    required_labels = ('mode_confirm', 'purpose_confirm', 'replaced_mode')
    min_trips = 10

    for user in all_users:
        trips = pipeline.read_data(uuid=user, key=esda.CONFIRMED_TRIP_KEY)
        # keep only fully labeled trips (an empty user_input has none of the keys)
        valid_trips = [t for t in trips
                       if all(label in t["data"]["user_input"] for label in required_labels)]

        sim = similarity.similarity(valid_trips, radius)
        enough_trips = len(sim.data) >= min_trips
        if enough_trips:
            sim.bin_data()
            sim.delete_bins()
            enough_trips = len(sim.newdata) >= min_trips
        if not enough_trips:
            # filter out users that haven't enough trips (at least 10) to analyze;
            # a plain float NaN keeps the three lists aligned with all_users
            for scores in (homo_score, comp_score, v_score):
                scores.append(float("nan"))
            continue
        bin_trips = sim.newdata
        bins = sim.bins

        # clustering the data only based on sil score (min_clusters = 0)
        # instead of bins number (len(bins))
        feat = featurization.featurization(bin_trips)
        max_clusters = int(math.ceil(1.5 * len(bins)))
        feat.cluster(min_clusters=0, max_clusters=max_clusters)
        cluster_trips = feat.data

        cluster_user_input_df = pd.DataFrame(
            data=[trip["data"]["user_input"] for trip in cluster_trips])
        cluster_user_input_df = _normalize_user_input(cluster_user_input_df, sp2en, cvt_pur_mo)

        # labels_true: trips with identical user_input share a label
        labels_true = _labels_from_rows(cluster_user_input_df.values.tolist())
        labels_pred = feat.labels

        # compare the points in cluster_trips and those in feat.points;
        # assert_frame_equal returns nothing if two data frames are the same
        cluster_ps_df = pd.DataFrame(data=[
            [trip["data"]["start_loc"]["coordinates"][0],
             trip["data"]["start_loc"]["coordinates"][1],
             trip["data"]["end_loc"]["coordinates"][0],
             trip["data"]["end_loc"]["coordinates"][1]]
            for trip in cluster_trips])
        label_ps_df = pd.DataFrame(data=feat.points)
        assert_frame_equal(cluster_ps_df, label_ps_df)

        homo_score.append(_round3(metrics.homogeneity_score(labels_true, labels_pred)))
        comp_score.append(_round3(metrics.completeness_score(labels_true, labels_pred)))
        v_score.append(_round3(metrics.v_measure_score(labels_true, labels_pred)))

    return homo_score, comp_score, v_score
as similarity\n", + "import emission.analysis.modelling.tour_model.featurization as featurization\n", + "import emission.analysis.modelling.tour_model.representatives as representatives\n", + "import emission.storage.decorations.analysis_timeseries_queries as esda\n", + "import pandas as pd\n", + "from numpy import *\n", + "import confirmed_trips_eval_bins_clusters as evaluation\n", + "from sklearn import metrics\n", + "from pandas.testing import assert_frame_equal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "together-twenty", + "metadata": {}, + "outputs": [], + "source": [ + "logger = logging.getLogger()\n", + "logger.setLevel(logging.DEBUG)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sorted-lloyd", + "metadata": {}, + "outputs": [], + "source": [ + "participant_uuid_obj = list(edb.get_profile_db().find({\"install_group\": \"participant\"}, {\"user_id\": 1, \"_id\": 0}))\n", + "all_users = [u[\"user_id\"] for u in participant_uuid_obj]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "missing-psychology", + "metadata": {}, + "outputs": [], + "source": [ + "radius = 300" + ] + }, + { + "cell_type": "markdown", + "id": "thorough-nomination", + "metadata": {}, + "source": [ + "## Choose one user for experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "august-valley", + "metadata": {}, + "outputs": [], + "source": [ + "user = all_users[6]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "diverse-running", + "metadata": {}, + "outputs": [], + "source": [ + "#read the data from the database. 
We choose key=esda.CONFIRMED_TRIP_KEY to get confirmed trips here\n", + "trips = pipeline.read_data(uuid=user,key=esda.CONFIRMED_TRIP_KEY)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "crude-postage", + "metadata": {}, + "outputs": [], + "source": [ + "# select trips that have user_input to analyze\n", + "non_empty_trips = [t for t in trips if t[\"data\"][\"user_input\"] != {}]\n", + "len(non_empty_trips)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "copyrighted-delta", + "metadata": {}, + "outputs": [], + "source": [ + "# filter out trips that are not fully labeled(contain NaN in user_input)\n", + "valid_trips = [t for t in non_empty_trips if 'mode_confirm' in t[\"data\"][\"user_input\"] and \n", + " 'purpose_confirm'in t[\"data\"][\"user_input\"] and 'replaced_mode' in t[\"data\"][\"user_input\"]]\n", + "len(valid_trips),valid_trips" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "competent-australia", + "metadata": {}, + "outputs": [], + "source": [ + "sim = similarity.similarity(valid_trips, radius)\n", + "sim.data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "standing-serial", + "metadata": {}, + "outputs": [], + "source": [ + "sim.bin_data()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "annoying-antique", + "metadata": {}, + "outputs": [], + "source": [ + "filter_trips = sim.data\n", + "trip_index_ls = []\n", + "bins = sim.bins\n", + "for bin in bins:\n", + " for index in bin:\n", + " trip_index_ls.append(index)\n", + "bin_trips = [filter_trips[num]for num in trip_index_ls]\n", + "\n", + "print(len(bin_trips),len(bins))\n", + "bin_trips" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "latest-contemporary", + "metadata": {}, + "outputs": [], + "source": [ + "logging.debug('The list of bins is %s' % bins)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "thrown-hello", + "metadata": 
{}, + "outputs": [], + "source": [ + "# show all user labels in all bins\n", + "for bin in bins:\n", + " bin_user_input = (filter_trips[i].data[\"user_input\"] for i in bin)\n", + " bin_df = pd.DataFrame(data = bin_user_input)\n", + " print(bin_df)" + ] + }, + { + "cell_type": "markdown", + "id": "strange-inventory", + "metadata": {}, + "source": [ + "### Original output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "streaming-taiwan", + "metadata": {}, + "outputs": [], + "source": [ + "bin_trips_df = pd.DataFrame(data=[trip[\"data\"][\"user_input\"] for trip in bin_trips])\n", + "bin_trips_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "destroyed-filter", + "metadata": {}, + "outputs": [], + "source": [ + "# turn all user_input into list without binning\n", + "bin_trips_user_input_ls = bin_trips_df.values.tolist()\n", + "bin_trips_user_input_ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "mature-italy", + "metadata": {}, + "outputs": [], + "source": [ + "# drop duplicate user_input\n", + "no_dup_df=bin_trips_df.drop_duplicates()\n", + "no_dup_df,len(no_dup_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "illegal-collect", + "metadata": {}, + "outputs": [], + "source": [ + "# turn non-duplicate user_input into list\n", + "no_dup_list = no_dup_df.values.tolist()\n", + "no_dup_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sapphire-hughes", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_rows', 300)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fuzzy-marathon", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_true based on user_input\n", + "labels_true =[]\n", + "for trip in bin_trips_user_input_ls:\n", + " if trip in no_dup_list:\n", + " labels_true.append(no_dup_list.index(trip))\n", + "labels_true" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "considerable-restoration", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_pred based on bins\n", + "labels_pred = []\n", + "for i in range(len(bins)):\n", + " for trip in bins[i]:\n", + " labels_pred.append(i)\n", + "labels_pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "domestic-august", + "metadata": {}, + "outputs": [], + "source": [ + "bin_trips_ts = pd.DataFrame(data=[trip[\"data\"][\"start_ts\"]for trip in bin_trips])\n", + "bin_input = pd.DataFrame(data=[trip[\"data\"][\"user_input\"]for trip in bin_trips])\n", + "len(bin_trips_ts)\n", + "bin_input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "married-catholic", + "metadata": {}, + "outputs": [], + "source": [ + "bins_ts = pd.DataFrame(data=[filter_trips[num][\"data\"][\"start_ts\"]for num in trip_index_ls])\n", + "bins_input = pd.DataFrame(data=[filter_trips[num][\"data\"][\"user_input\"]for num in trip_index_ls])\n", + "len(trip_index_ls)\n", + "bins_input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "requested-organic", + "metadata": {}, + "outputs": [], + "source": [ + "# compare two data frames, return nothing if two data frames are the same\n", + "assert_frame_equal(bins_ts,bin_trips_ts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "economic-nitrogen", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.homogeneity_score(labels_true, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "postal-trademark", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.completeness_score(labels_true, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "southern-evidence", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.v_measure_score(labels_true, labels_pred)" + ] + }, + { + "cell_type": "markdown", + "id": "domestic-regression", + "metadata": {}, + "source": 
[ + "### After changing language" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "amazing-jersey", + "metadata": {}, + "outputs": [], + "source": [ + "span_eng_dict = {'revisado_bike':'test ride with bike','placas_de carro':'car plates','aseguranza':'insurance',\n", + " 'iglesia':'church','curso':'course','mi_hija recién aliviada':'my daughter just had a new baby',\n", + " 'servicio_comunitario':'community service','pago_de aseguranza':'insurance payment',\n", + " 'grupo_comunitario':'community group','caminata_comunitaria':'community walk'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "impressive-philadelphia", + "metadata": {}, + "outputs": [], + "source": [ + "# use dict to replace the values in Spanish in the bin(this step just for showing the trips in each bin)\n", + "for bin in bins:\n", + " bin_user_input = (valid_trips[i].data[\"user_input\"] for i in bin)\n", + " bin_df = pd.DataFrame(data = bin_user_input)\n", + " sp2en_bin_df = bin_df.replace(span_eng_dict)\n", + " print(sp2en_bin_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hybrid-eugene", + "metadata": {}, + "outputs": [], + "source": [ + "# turn all user_input into list without binning\n", + "bin_trips_sp2en_df = bin_trips_df.replace(span_eng_dict)\n", + "bin_trips_sp2en_ls = bin_trips_sp2en_df.values.tolist()\n", + "bin_trips_sp2en_ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "viral-intermediate", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# drop duplicate user_input\n", + "no_dup_sp2en_df=bin_trips_sp2en_df.drop_duplicates()\n", + "no_dup_sp2en_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "conceptual-branch", + "metadata": {}, + "outputs": [], + "source": [ + "# turn non-duplicate user_input into list\n", + "no_dup_sp2en_list = no_dup_sp2en_df.values.tolist()\n", + "no_dup_sp2en_list" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "wanted-management", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_true based on user_input\n", + "labels_true_sp2en =[]\n", + "for trip in bin_trips_sp2en_ls:\n", + " if trip in no_dup_sp2en_list:\n", + " labels_true_sp2en.append(no_dup_sp2en_list.index(trip))\n", + "labels_true_sp2en" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "otherwise-vault", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_pred based on bins\n", + "labels_pred = []\n", + "for i in range(len(bins)):\n", + " for trip in bins[i]:\n", + " labels_pred.append(i)\n", + "labels_pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adolescent-style", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.homogeneity_score(labels_true_sp2en, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "signal-backing", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.completeness_score(labels_true_sp2en, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "upper-contribution", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.v_measure_score(labels_true_sp2en, labels_pred)" + ] + }, + { + "cell_type": "markdown", + "id": "bearing-communications", + "metadata": {}, + "source": [ + "### After converting purposes and mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "completed-testing", + "metadata": {}, + "outputs": [], + "source": [ + "map_pur_dict = {'course':'school','work_- lunch break':'lunch_break','on_the way home':'home',\n", + " 'insurance_payment':'insurance'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "quality-board", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_rows', 200)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "falling-arbor", + "metadata": {}, + "outputs": [], + 
"source": [ + "# convert purpose\n", + "bin_trips_cvt_pur_df = bin_trips_sp2en_df.replace(map_pur_dict)\n", + "# convert mode\n", + "bin_trips_cvt_pur_mo_df = bin_trips_cvt_pur_df\n", + "for i in range(len(bin_trips_cvt_pur_mo_df)):\n", + " if bin_trips_cvt_pur_mo_df.iloc[i][\"replaced_mode\"] == \"same_mode\":\n", + " print(bin_trips_cvt_pur_mo_df.iloc[i]) # to see which row will be converted\n", + " bin_trips_cvt_pur_mo_df.iloc[i][\"replaced_mode\"] = bin_trips_cvt_pur_mo_df.iloc[i]['mode_confirm']\n", + "print(bin_trips_cvt_pur_mo_df)\n", + "bin_trips_cvt_pur_mode_ls = bin_trips_cvt_pur_mo_df.values.tolist()\n", + "bin_trips_cvt_pur_mode_ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "electrical-liechtenstein", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# drop duplicate user_input\n", + "no_dup_cvt_pur_mode_df = bin_trips_cvt_pur_mo_df.drop_duplicates()\n", + "no_dup_cvt_pur_mode_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cleared-valley", + "metadata": {}, + "outputs": [], + "source": [ + "# turn non-duplicate user_input into list\n", + "no_dup_cvt_pur_mo_ls = no_dup_cvt_pur_mode_df.values.tolist()\n", + "no_dup_cvt_pur_mo_ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "printable-fusion", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_true based on user_input\n", + "labels_true_cvt =[]\n", + "for trip in bin_trips_cvt_pur_mode_ls:\n", + " if trip in no_dup_cvt_pur_mo_ls:\n", + " labels_true_cvt.append(no_dup_cvt_pur_mo_ls.index(trip))\n", + "labels_true_cvt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hundred-zealand", + "metadata": {}, + "outputs": [], + "source": [ + "# collect labels_pred based on bins\n", + "labels_pred = []\n", + "for i in range(len(bins)):\n", + " for trip in bins[i]:\n", + " labels_pred.append(i)\n", + "labels_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "false-lithuania", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.homogeneity_score(labels_true_cvt, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "documentary-power", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.completeness_score(labels_true_cvt, labels_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "elect-animation", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.v_measure_score(labels_true_cvt, labels_pred)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tour_model_eval/v-measurel_bins_all_user.ipynb b/tour_model_eval/v-measurel_bins_all_user.ipynb new file mode 100644 index 0000000..12a2d49 --- /dev/null +++ b/tour_model_eval/v-measurel_bins_all_user.ipynb @@ -0,0 +1,314 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "computational-national", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "# Our imports\n", + "import emission.core.get_database as edb\n", + "import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline\n", + "import emission.analysis.modelling.tour_model.similarity as similarity\n", + "import emission.analysis.modelling.tour_model.featurization as featurization\n", + "import emission.analysis.modelling.tour_model.representatives as representatives\n", + "import emission.storage.decorations.analysis_timeseries_queries as esda\n", + "import pandas as pd\n", + "from numpy import *\n", + "import confirmed_trips_eval_bins_clusters as 
evaluation\n", + "from sklearn import metrics\n", + "from pandas.testing import assert_frame_equal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "preceding-yugoslavia", + "metadata": {}, + "outputs": [], + "source": [ + "participant_uuid_obj = list(edb.get_profile_db().find({\"install_group\": \"participant\"}, {\"user_id\": 1, \"_id\": 0}))\n", + "all_users = [u[\"user_id\"] for u in participant_uuid_obj]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "perceived-impossible", + "metadata": {}, + "outputs": [], + "source": [ + "radius = 300" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "smooth-variety", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('max_colwidth',100)\n", + "pd.set_option('display.max_rows', None)" + ] + }, + { + "cell_type": "markdown", + "id": "medical-biodiversity", + "metadata": {}, + "source": [ + "## Bins above cutoff" + ] + }, + { + "cell_type": "markdown", + "id": "corporate-heather", + "metadata": {}, + "source": [ + "### Original user input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "chief-fundamentals", + "metadata": {}, + "outputs": [], + "source": [ + "homo_score_ori, comp_score_ori, v_score_ori = evaluation.v_measure_bins(all_users,radius,cutoff=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "educated-philippines", + "metadata": {}, + "outputs": [], + "source": [ + "mean_v_ori=round(mean([x for x in v_score_ori if str(x) != 'nan']),3)" + ] + }, + { + "cell_type": "markdown", + "id": "welcome-homeless", + "metadata": {}, + "source": [ + "### After changing language" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "preceding-money", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "homo_score_sp2en, comp_score_sp2en, v_score_sp2en = evaluation.v_measure_bins(all_users,radius,sp2en=True,cutoff=True)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "laden-plate", + "metadata": {}, + "outputs": [], + "source": [ + "mean_v_sp2en=round(mean([x for x in v_score_sp2en if str(x) != 'nan']),3)" + ] + }, + { + "cell_type": "markdown", + "id": "sticky-denver", + "metadata": {}, + "source": [ + "### After converting purposes and replaced mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "agreed-moment", + "metadata": {}, + "outputs": [], + "source": [ + "homo_score_cvt, comp_score_cvt, v_score_cvt = evaluation.v_measure_bins(all_users,radius,cvt_pur_mo=True,cutoff=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "handmade-dairy", + "metadata": {}, + "outputs": [], + "source": [ + "mean_v_cvt=round(mean([x for x in v_score_cvt if str(x) != 'nan']),3)" + ] + }, + { + "cell_type": "markdown", + "id": "funky-potato", + "metadata": {}, + "source": [ + "### DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "technological-reservation", + "metadata": {}, + "outputs": [], + "source": [ + "cutoff_df = pd.DataFrame(data={'homogeneity_score':[homo_score_ori,homo_score_sp2en,homo_score_cvt],\n", + " 'completeness_score':[comp_score_ori,comp_score_sp2en,comp_score_cvt],\n", + " 'v_measure_score':[v_score_ori,v_score_sp2en,v_score_cvt],\n", + " 'mean v_measure_score':[mean_v_ori,mean_v_sp2en,mean_v_cvt]},\n", + " index = ['original user input','after translation','after converting purposes and replaced mode'])\n", + "cutoff_df" + ] + }, + { + "cell_type": "markdown", + "id": "strange-badge", + "metadata": {}, + "source": [ + "## All bins" + ] + }, + { + "cell_type": "markdown", + "id": "rural-virtue", + "metadata": {}, + "source": [ + "### Original user input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "listed-entrance", + "metadata": {}, + "outputs": [], + "source": [ + "ab_homo_score_ori, ab_comp_score_ori, ab_v_score_ori = evaluation.v_measure_bins(all_users,radius)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "known-calculation", + "metadata": {}, + "outputs": [], + "source": [ + "ab_mean_v_ori=round(mean([x for x in ab_v_score_ori if str(x) != 'nan']),3)" + ] + }, + { + "cell_type": "markdown", + "id": "broadband-expression", + "metadata": {}, + "source": [ + "### After changing language" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "alone-authorization", + "metadata": {}, + "outputs": [], + "source": [ + "ab_homo_score_sp2en, ab_comp_score_sp2en, ab_v_score_sp2en = evaluation.v_measure_bins(all_users,radius,sp2en=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "viral-rebel", + "metadata": {}, + "outputs": [], + "source": [ + "ab_mean_v_sp2en=round(mean([x for x in ab_v_score_sp2en if str(x) != 'nan']),3)" + ] + }, + { + "cell_type": "markdown", + "id": "infectious-armstrong", + "metadata": {}, + "source": [ + "### After converting purposes and replaced mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "supreme-plain", + "metadata": {}, + "outputs": [], + "source": [ + "ab_homo_score_cvt, ab_comp_score_cvt, ab_v_score_cvt = evaluation.v_measure_bins(all_users,radius,cvt_pur_mo=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "corresponding-blind", + "metadata": {}, + "outputs": [], + "source": [ + "ab_mean_v_cvt=round(mean([x for x in ab_v_score_cvt if str(x) != 'nan']),3)" + ] + }, + { + "cell_type": "markdown", + "id": "aquatic-password", + "metadata": {}, + "source": [ + "### DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "instant-broadcast", + "metadata": {}, + "outputs": [], + "source": [ + "all_df = pd.DataFrame(data={'homogeneity_score':[ab_homo_score_ori,ab_homo_score_sp2en,ab_homo_score_cvt],\n", + " 'completeness_score':[ab_comp_score_ori,ab_comp_score_sp2en,ab_comp_score_cvt],\n", + " 'v_measure_score':[ab_v_score_ori,ab_v_score_sp2en,ab_v_score_cvt],\n", + " 
'mean v_measure_score':[ab_mean_v_ori,ab_mean_v_sp2en,ab_mean_v_cvt]},\n", + " index = ['original user input','after translation','after converting purposes and replaced mode'])\n", + "all_df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb b/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb new file mode 100644 index 0000000..f653b5a --- /dev/null +++ b/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "important-humanity", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "# Our imports\n", + "import emission.core.get_database as edb\n", + "import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline\n", + "import emission.analysis.modelling.tour_model.similarity as similarity\n", + "import emission.analysis.modelling.tour_model.featurization as featurization\n", + "import emission.analysis.modelling.tour_model.representatives as representatives\n", + "import emission.storage.decorations.analysis_timeseries_queries as esda\n", + "import pandas as pd\n", + "from numpy import *\n", + "import confirmed_trips_eval_bins_clusters as evaluation\n", + "from sklearn import metrics\n", + "from pandas.testing import assert_frame_equal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "physical-improvement", + "metadata": {}, + "outputs": [], + "source": [ + "participant_uuid_obj = list(edb.get_profile_db().find({\"install_group\": 
\"participant\"}, {\"user_id\": 1, \"_id\": 0}))\n", + "all_users = [u[\"user_id\"] for u in participant_uuid_obj]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "figured-buddy", + "metadata": {}, + "outputs": [], + "source": [ + "radius = 300" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "equal-release", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('max_colwidth',100)\n", + "pd.set_option('display.max_rows', None)" + ] + }, + { + "cell_type": "markdown", + "id": "standard-savannah", + "metadata": {}, + "source": [ + "## Evaluate clusters above cutoff based on silhouette_score" + ] + }, + { + "cell_type": "markdown", + "id": "formed-guitar", + "metadata": {}, + "source": [ + "### Original user input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "official-victor", + "metadata": {}, + "outputs": [], + "source": [ + "homo_score_ori, comp_score_ori, v_score_ori = evaluation.v_measure_clusters(all_users,radius)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "loved-isolation", + "metadata": {}, + "outputs": [], + "source": [ + "mean_v_ori=round(mean([x for x in v_score_ori if str(x) != 'nan']),3)" + ] + }, + { + "cell_type": "markdown", + "id": "demanding-rendering", + "metadata": {}, + "source": [ + "### After changing language" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "satisfied-enough", + "metadata": {}, + "outputs": [], + "source": [ + "homo_score_sp2en, comp_score_sp2en, v_score_sp2en = evaluation.v_measure_clusters(all_users,radius,sp2en=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hundred-pasta", + "metadata": {}, + "outputs": [], + "source": [ + "mean_v_sp2en=round(mean([x for x in v_score_sp2en if str(x) != 'nan']),3)" + ] + }, + { + "cell_type": "markdown", + "id": "hungarian-wallpaper", + "metadata": {}, + "source": [ + "### After converting purposes and replaced mode" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "particular-scope", + "metadata": {}, + "outputs": [], + "source": [ + "homo_score_cvt, comp_score_cvt, v_score_cvt = evaluation.v_measure_clusters(all_users,radius,cvt_pur_mo=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "neither-shooting", + "metadata": {}, + "outputs": [], + "source": [ + "mean_v_cvt=round(mean([x for x in v_score_cvt if str(x) != 'nan']),3)" + ] + }, + { + "cell_type": "markdown", + "id": "continuing-absorption", + "metadata": {}, + "source": [ + "### DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "retained-citizenship", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(data={'homogeneity_score':[homo_score_ori,homo_score_sp2en,homo_score_cvt],\n", + " 'completeness_score':[comp_score_ori,comp_score_sp2en,comp_score_cvt],\n", + " 'v_measure_score':[v_score_ori,v_score_sp2en,v_score_cvt],\n", + " 'mean v_measure_score':[mean_v_ori,mean_v_sp2en,mean_v_cvt]},\n", + " index = ['original user input','after translation','after converting purposes and replaced mode'])\n", + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb b/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb index c99060e..e6fe2e0 100644 --- a/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb +++ b/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb @@ -19,7 +19,8 @@ "import pandas as pd\n", "from numpy import *\n", "import confirmed_trips_eval_bins_clusters 
as evaluation\n", - "from sklearn import metrics" + "from sklearn import metrics\n", + "from pandas.testing import assert_frame_equal" ] }, { @@ -69,7 +70,7 @@ "metadata": {}, "outputs": [], "source": [ - "user = all_users[10]" + "user = all_users[6]" ] }, { @@ -91,17 +92,21 @@ "outputs": [], "source": [ "# select trips that have user_input to analyze\n", - "non_empty_trips = [t for t in trips if t[\"data\"][\"user_input\"] != {}]" + "non_empty_trips = [t for t in trips if t[\"data\"][\"user_input\"] != {}]\n", + "len(non_empty_trips)" ] }, { "cell_type": "code", "execution_count": null, - "id": "iraqi-account", + "id": "fabulous-definition", "metadata": {}, "outputs": [], "source": [ - "len(non_empty_trips), non_empty_trips" + "# filter out trips that are not fully labeled(contain NaN in user_input)\n", + "valid_trips = [t for t in non_empty_trips if 'mode_confirm' in t[\"data\"][\"user_input\"] and \n", + " 'purpose_confirm'in t[\"data\"][\"user_input\"] and 'replaced_mode' in t[\"data\"][\"user_input\"]]\n", + "len(valid_trips)" ] }, { @@ -111,7 +116,7 @@ "metadata": {}, "outputs": [], "source": [ - "bin_trips, bins = pipeline.remove_noise(non_empty_trips, radius)" + "bin_trips, bins = pipeline.remove_noise(valid_trips, radius)" ] }, { @@ -127,11 +132,10 @@ { "cell_type": "code", "execution_count": null, - "id": "promotional-batch", + "id": "valid-morocco", "metadata": {}, "outputs": [], "source": [ - "# clusters,labels,cluster_trips, points = pipeline.cluster(bin_trips, len(bins))\n", "# clustering the data only based on sil score (min_cluster = 0) instead of bins number (len(bins))\n", "feat = featurization.featurization(bin_trips)\n", "min = 0\n", @@ -142,7 +146,7 @@ { "cell_type": "code", "execution_count": null, - "id": "attempted-afghanistan", + "id": "bridal-breakfast", "metadata": {}, "outputs": [], "source": [ @@ -152,7 +156,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fatty-manner", + "id": "following-heating", "metadata": {}, 
"outputs": [], "source": [ @@ -162,8 +166,10 @@ { "cell_type": "code", "execution_count": null, - "id": "internal-mobility", - "metadata": {}, + "id": "beginning-crowd", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "cluster_trips = feat.data\n", @@ -173,8 +179,10 @@ { "cell_type": "code", "execution_count": null, - "id": "alone-rochester", - "metadata": {}, + "id": "incorporated-strengthening", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "cluster_user_input_df = pd.DataFrame(data=[i[\"data\"][\"user_input\"] for i in cluster_trips])\n", @@ -212,7 +220,7 @@ "source": [ "# drop duplicate user_input\n", "no_dup_df=cluster_user_input_df.drop_duplicates()\n", - "no_dup_df" + "no_dup_df,len(no_dup_df)" ] }, { @@ -254,6 +262,45 @@ "labels_pred = feat.labels" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "fallen-tulsa", + "metadata": {}, + "outputs": [], + "source": [ + "cluster_ps=[]\n", + "for trip in cluster_trips:\n", + " cluster_ps.append([trip[\"data\"][\"start_loc\"][\"coordinates\"][0],\n", + " trip[\"data\"][\"start_loc\"][\"coordinates\"][1],\n", + " trip[\"data\"][\"end_loc\"][\"coordinates\"][0],\n", + " trip[\"data\"][\"end_loc\"][\"coordinates\"][1]])\n", + "cluster_ps_df = pd.DataFrame(data=cluster_ps)\n", + "cluster_ps_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dangerous-discussion", + "metadata": {}, + "outputs": [], + "source": [ + "label_ps_df = pd.DataFrame(data=feat.points)\n", + "label_ps_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "progressive-profit", + "metadata": {}, + "outputs": [], + "source": [ + "# compare two data frames, return nothing if two data frames are the same\n", + "assert_frame_equal(cluster_ps_df,label_ps_df)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -315,7 +362,8 @@ "outputs": [], "source": [ "# change language and turn data frame to list\n", - "cluster_sp2en_ls = 
cluster_user_input_df.replace(span_eng_dict).values.tolist()\n", + "cluster_sp2en_df = cluster_user_input_df.replace(span_eng_dict)\n", + "cluster_sp2en_ls = cluster_sp2en_df.values.tolist()\n", "cluster_sp2en_ls" ] }, @@ -327,7 +375,7 @@ "outputs": [], "source": [ "# drop duplicate user_input\n", - "no_dup_sp2en_df=cluster_user_input_df.replace(span_eng_dict).drop_duplicates()\n", + "no_dup_sp2en_df=cluster_sp2en_df.drop_duplicates()\n", "no_dup_sp2en_df" ] }, @@ -436,7 +484,6 @@ "outputs": [], "source": [ "# convert purpose\n", - "cluster_sp2en_df = cluster_user_input_df.replace(span_eng_dict)\n", "cluster_cvt_pur_df = cluster_sp2en_df.replace(map_pur_dict)\n", "# convert mode\n", "cluster_cvt_pur_mo_df = cluster_cvt_pur_df\n", @@ -534,7 +581,7 @@ { "cell_type": "code", "execution_count": null, - "id": "precise-woman", + "id": "common-manhattan", "metadata": {}, "outputs": [], "source": [] From 4b99639aba91bce46face2c94616bf4cd61b763a Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Tue, 23 Feb 2021 15:03:42 -0800 Subject: [PATCH 06/16] delete example codes from sklearn, clean up files --- .../confirmed_trips_eval_bins_clusters.py | 52 ---- .../plot_document_clustering.ipynb | 285 ----------------- tour_model_eval/plot_kmeans_digits.ipynb | 286 ------------------ .../v-measurel_cutoff_bins_single_user.ipynb | 154 ++++++---- 4 files changed, 89 insertions(+), 688 deletions(-) delete mode 100644 tour_model_eval/plot_document_clustering.ipynb delete mode 100644 tour_model_eval/plot_kmeans_digits.ipynb diff --git a/tour_model_eval/confirmed_trips_eval_bins_clusters.py b/tour_model_eval/confirmed_trips_eval_bins_clusters.py index e57e4a7..1bcc370 100644 --- a/tour_model_eval/confirmed_trips_eval_bins_clusters.py +++ b/tour_model_eval/confirmed_trips_eval_bins_clusters.py @@ -22,58 +22,6 @@ map_pur_dict = {'course':'school','work_- lunch break':'lunch_break','on_the way home':'home', 'insurance_payment':'insurance'} -# precision_bins takes five parameters -# - 
all_bins_preci: the list that collects precision of each bin, should pass in an empty list -# - sp2en=None means no need to translate language -# sp2en='True' will use span_eng_dict to change Spanish to English -# -# - cvt_purpose=None means no need to convert purposes -# cvt_purpose='True' will use map_pur_dict to convert purposes -# using this parameter should also set sp2en='True' -def precision_bins (all_bins_preci,bins,non_empty_trips,sp2en=None,cvt_purpose=None): - for bin in bins: - bin_user_input = (non_empty_trips[i].data["user_input"] for i in bin if - non_empty_trips[i].data["user_input"] != {}) - bin_df = pd.DataFrame(data=bin_user_input) - if sp2en == 'True': - bin_df = bin_df.replace(span_eng_dict) - if cvt_purpose == 'True': - bin_df = bin_df.replace(map_pur_dict) - duplic_trips = bin_df[bin_df.duplicated(keep=False)] - - # for bin that doesn't have duplicate trips, assign 0 as precision - if duplic_trips.empty and len(bin_df) > 1: - all_bins_preci.append(0) - # for bin only has one trip, assign 1.0 as precision - elif len(bin_df) == 1: - all_bins_preci.append(1.0) - else: - duplic = duplic_trips.groupby(duplic_trips.columns.tolist()).apply(lambda x: tuple(x.index)).tolist() - max_duplic = max(duplic, key=lambda i: len(i)) - precision = round(len(max_duplic) / len(bin), 2) - all_bins_preci.append(precision) - return all_bins_preci - - -# precision_all_users takes four parameters -# - all_users: pass in all participants' data -# - sp2en: default None, no need to change language -# - cvt_purpose: default None, no need to convert purpose -def precision_bin_all_users(all_users,radius,sp2en=None,cvt_purpose=None): - all_users_preci = [] - for i in range(len(all_users)): - user = all_users[i] - trips = pipeline.read_data(uuid=user, key=esda.CONFIRMED_TRIP_KEY) - all_bins_preci = [] - non_empty_trips = [t for t in trips if t["data"]["user_input"] != {}] - if non_empty_trips != {}: - sim = similarity.similarity(non_empty_trips, radius) - if sim.data: - 
sim.bin_data() - all_bins_preci = precision_bins(all_bins_preci, sim.bins, non_empty_trips, sp2en, cvt_purpose) - all_users_preci.append(round(mean(all_bins_preci), 2)) - return all_users_preci - # v_measure_bins takes 5 parameters # - sp2en=True: change Spanish to English diff --git a/tour_model_eval/plot_document_clustering.ipynb b/tour_model_eval/plot_document_clustering.ipynb deleted file mode 100644 index 828ce5a..0000000 --- a/tour_model_eval/plot_document_clustering.ipynb +++ /dev/null @@ -1,285 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# Clustering text documents using k-means\n", - "\n", - "This is an example showing how the scikit-learn can be used to cluster\n", - "documents by topics using a bag-of-words approach. This example uses\n", - "a scipy.sparse matrix to store the features instead of standard numpy arrays.\n", - "\n", - "Two feature extraction methods can be used in this example:\n", - "\n", - " - TfidfVectorizer uses a in-memory vocabulary (a python dict) to map the most\n", - " frequent words to features indices and hence compute a word occurrence\n", - " frequency (sparse) matrix. The word frequencies are then reweighted using\n", - " the Inverse Document Frequency (IDF) vector collected feature-wise over\n", - " the corpus.\n", - "\n", - " - HashingVectorizer hashes word occurrences to a fixed dimensional space,\n", - " possibly with collisions. The word count vectors are then normalized to\n", - " each have l2-norm equal to one (projected to the euclidean unit-ball) which\n", - " seems to be important for k-means to work in high dimensional space.\n", - "\n", - " HashingVectorizer does not provide IDF weighting as this is a stateless\n", - " model (the fit method does nothing). 
When IDF weighting is needed it can\n", - " be added by pipelining its output to a TfidfTransformer instance.\n", - "\n", - "Two algorithms are demoed: ordinary k-means and its more scalable cousin\n", - "minibatch k-means.\n", - "\n", - "Additionally, latent semantic analysis can also be used to reduce\n", - "dimensionality and discover latent patterns in the data.\n", - "\n", - "It can be noted that k-means (and minibatch k-means) are very sensitive to\n", - "feature scaling and that in this case the IDF weighting helps improve the\n", - "quality of the clustering by quite a lot as measured against the \"ground truth\"\n", - "provided by the class label assignments of the 20 newsgroups dataset.\n", - "\n", - "This improvement is not visible in the Silhouette Coefficient which is small\n", - "for both as this measure seem to suffer from the phenomenon called\n", - "\"Concentration of Measure\" or \"Curse of Dimensionality\" for high dimensional\n", - "datasets such as text data. Other measures such as V-measure and Adjusted Rand\n", - "Index are information theoretic based evaluation scores: as they are only based\n", - "on cluster assignments rather than distances, hence not affected by the curse\n", - "of dimensionality.\n", - "\n", - "Note: as k-means is optimizing a non-convex objective function, it will likely\n", - "end up in a local optimum. 
Several runs with independent random init might be\n", - "necessary to get a good convergence.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Author: Peter Prettenhofer \n", - "# Lars Buitinck\n", - "# License: BSD 3 clause\n", - "from sklearn.datasets import fetch_20newsgroups\n", - "from sklearn.decomposition import TruncatedSVD\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from sklearn.feature_extraction.text import HashingVectorizer\n", - "from sklearn.feature_extraction.text import TfidfTransformer\n", - "from sklearn.pipeline import make_pipeline\n", - "from sklearn.preprocessing import Normalizer\n", - "from sklearn import metrics\n", - "\n", - "from sklearn.cluster import KMeans, MiniBatchKMeans\n", - "\n", - "import logging\n", - "from optparse import OptionParser\n", - "import sys\n", - "from time import time\n", - "\n", - "import numpy as np\n", - "\n", - "\n", - "# Display progress logs on stdout\n", - "logging.basicConfig(level=logging.INFO,\n", - " format='%(asctime)s %(levelname)s %(message)s')\n", - "\n", - "# parse commandline arguments\n", - "op = OptionParser()\n", - "op.add_option(\"--lsa\",\n", - " dest=\"n_components\", type=\"int\",\n", - " help=\"Preprocess documents with latent semantic analysis.\")\n", - "op.add_option(\"--no-minibatch\",\n", - " action=\"store_false\", dest=\"minibatch\", default=True,\n", - " help=\"Use ordinary k-means algorithm (in batch mode).\")\n", - "op.add_option(\"--no-idf\",\n", - " action=\"store_false\", dest=\"use_idf\", default=True,\n", - " help=\"Disable Inverse Document Frequency feature weighting.\")\n", - "op.add_option(\"--use-hashing\",\n", - " action=\"store_true\", default=False,\n", - " help=\"Use a hashing feature vectorizer\")\n", - "op.add_option(\"--n-features\", type=int, default=10000,\n", - " help=\"Maximum number of features (dimensions)\"\n", - " \" to extract from text.\")\n", - 
"op.add_option(\"--verbose\",\n", - " action=\"store_true\", dest=\"verbose\", default=False,\n", - " help=\"Print progress reports inside k-means algorithm.\")\n", - "\n", - "print(__doc__)\n", - "op.print_help()\n", - "\n", - "\n", - "def is_interactive():\n", - " return not hasattr(sys.modules['__main__'], '__file__')\n", - "\n", - "\n", - "# work-around for Jupyter notebook and IPython console\n", - "argv = [] if is_interactive() else sys.argv[1:]\n", - "(opts, args) = op.parse_args(argv)\n", - "if len(args) > 0:\n", - " op.error(\"this script takes no arguments.\")\n", - " sys.exit(1)\n", - "\n", - "\n", - "# #############################################################################\n", - "# Load some categories from the training set\n", - "categories = [\n", - " 'alt.atheism',\n", - " 'talk.religion.misc',\n", - " 'comp.graphics',\n", - " 'sci.space',\n", - "]\n", - "# Uncomment the following to do the analysis on all the categories\n", - "# categories = None\n", - "\n", - "print(\"Loading 20 newsgroups dataset for categories:\")\n", - "print(categories)\n", - "\n", - "dataset = fetch_20newsgroups(subset='all', categories=categories,\n", - " shuffle=True, random_state=42)\n", - "\n", - "print(\"%d documents\" % len(dataset.data))\n", - "print(\"%d categories\" % len(dataset.target_names))\n", - "print()\n", - "\n", - "labels = dataset.target\n", - "true_k = np.unique(labels).shape[0]\n", - "\n", - "print(\"Extracting features from the training dataset \"\n", - " \"using a sparse vectorizer\")\n", - "t0 = time()\n", - "if opts.use_hashing:\n", - " if opts.use_idf:\n", - " # Perform an IDF normalization on the output of HashingVectorizer\n", - " hasher = HashingVectorizer(n_features=opts.n_features,\n", - " stop_words='english', alternate_sign=False,\n", - " norm=None)\n", - " vectorizer = make_pipeline(hasher, TfidfTransformer())\n", - " else:\n", - " vectorizer = HashingVectorizer(n_features=opts.n_features,\n", - " stop_words='english',\n", - " 
alternate_sign=False, norm='l2')\n", - "else:\n", - " vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,\n", - " min_df=2, stop_words='english',\n", - " use_idf=opts.use_idf)\n", - "X = vectorizer.fit_transform(dataset.data)\n", - "\n", - "print(\"done in %fs\" % (time() - t0))\n", - "print(\"n_samples: %d, n_features: %d\" % X.shape)\n", - "print()\n", - "\n", - "if opts.n_components:\n", - " print(\"Performing dimensionality reduction using LSA\")\n", - " t0 = time()\n", - " # Vectorizer results are normalized, which makes KMeans behave as\n", - " # spherical k-means for better results. Since LSA/SVD results are\n", - " # not normalized, we have to redo the normalization.\n", - " svd = TruncatedSVD(opts.n_components)\n", - " normalizer = Normalizer(copy=False)\n", - " lsa = make_pipeline(svd, normalizer)\n", - "\n", - " X = lsa.fit_transform(X)\n", - "\n", - " print(\"done in %fs\" % (time() - t0))\n", - "\n", - " explained_variance = svd.explained_variance_ratio_.sum()\n", - " print(\"Explained variance of the SVD step: {}%\".format(\n", - " int(explained_variance * 100)))\n", - "\n", - " print()\n", - "\n", - "\n", - "# #############################################################################\n", - "# Do the actual clustering\n", - "\n", - "if opts.minibatch:\n", - " km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,\n", - " init_size=1000, batch_size=1000, verbose=opts.verbose)\n", - "else:\n", - " km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,\n", - " verbose=opts.verbose)\n", - "\n", - "print(\"Clustering sparse data with %s\" % km)\n", - "t0 = time()\n", - "km.fit(X)\n", - "print(\"done in %0.3fs\" % (time() - t0))\n", - "print()\n", - "\n", - "# Test what labels are\n", - "np.set_printoptions(threshold=np.inf)\n", - "print(\"----------Test---------------\")\n", - "print('labels is %s' % labels)\n", - "print('The type of labels is %s' % type(labels))\n", - "print('the shape of labels is 
n_samples %s' % labels.shape)\n", - "print('km.labels_ is %s' % km.labels_)\n", - "print('The type of km.labels_ is %s' % type(km.labels_))\n", - "print('the shape of km.labels_ is n_samples %s' % km.labels_.shape)\n", - "print('true_k is %s'% true_k)\n", - "print('np.unique(labels) is %s' % np.unique(labels))\n", - "print('np.unique(labels).shape is %s' % np.unique(labels).shape)\n", - "print(\"----------Test---------------\")\n", - "print()\n", - "\n", - "\n", - "\n", - "print(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels, km.labels_))\n", - "print(\"Completeness: %0.3f\" % metrics.completeness_score(labels, km.labels_))\n", - "print(\"V-measure: %0.3f\" % metrics.v_measure_score(labels, km.labels_))\n", - "print(\"Adjusted Rand-Index: %.3f\"\n", - " % metrics.adjusted_rand_score(labels, km.labels_))\n", - "print(\"Silhouette Coefficient: %0.3f\"\n", - " % metrics.silhouette_score(X, km.labels_, sample_size=1000))\n", - "\n", - "print()\n", - "\n", - "\n", - "if not opts.use_hashing:\n", - " print(\"Top terms per cluster:\")\n", - "\n", - " if opts.n_components:\n", - " original_space_centroids = svd.inverse_transform(km.cluster_centers_)\n", - " order_centroids = original_space_centroids.argsort()[:, ::-1]\n", - " else:\n", - " order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n", - "\n", - " terms = vectorizer.get_feature_names()\n", - " for i in range(true_k):\n", - " print(\"Cluster %d:\" % i, end='')\n", - " for ind in order_centroids[i, :10]:\n", - " print(' %s' % terms[ind], end='')\n", - " print()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git 
a/tour_model_eval/plot_kmeans_digits.ipynb b/tour_model_eval/plot_kmeans_digits.ipynb deleted file mode 100644 index e88eb15..0000000 --- a/tour_model_eval/plot_kmeans_digits.ipynb +++ /dev/null @@ -1,286 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# A demo of K-Means clustering on the handwritten digits data\n", - "\n", - "In this example we compare the various initialization strategies for K-means in\n", - "terms of runtime and quality of the results.\n", - "\n", - "As the ground truth is known here, we also apply different cluster quality\n", - "metrics to judge the goodness of fit of the cluster labels to the ground truth.\n", - "\n", - "Cluster quality metrics evaluated (see `clustering_evaluation` for\n", - "definitions and discussions of the metrics):\n", - "\n", - "=========== ========================================================\n", - "Shorthand full name\n", - "=========== ========================================================\n", - "homo homogeneity score\n", - "compl completeness score\n", - "v-meas V measure\n", - "ARI adjusted Rand index\n", - "AMI adjusted mutual information\n", - "silhouette silhouette coefficient\n", - "=========== ========================================================\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(__doc__)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load the dataset\n", - "\n", - "We will start by loading the `digits` dataset. This dataset contains\n", - "handwritten digits from 0 to 9. 
In the context of clustering, one would like\n", - "to group images such that the handwritten digits on the image are the same.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from sklearn.datasets import load_digits\n", - "\n", - "data, labels = load_digits(return_X_y=True)\n", - "(n_samples, n_features), n_digits = data.shape, np.unique(labels).size\n", - "\n", - "print(\n", - " f\"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}\"\n", - ")\n", - "\n", - "print('------Test------')\n", - "np.set_printoptions(threshold=np.inf)\n", - "print('data.shape is ',data.shape)\n", - "print(\"labels is %s\" % labels)\n", - "print('------Test------')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define our evaluation benchmark\n", - "\n", - "We will first our evaluation benchmark. During this benchmark, we intend to\n", - "compare different initialization methods for KMeans. Our benchmark will:\n", - "\n", - "* create a pipeline which will scale the data using a\n", - " :class:`~sklearn.preprocessing.StandardScaler`;\n", - "* train and time the pipeline fitting;\n", - "* measure the performance of the clustering obtained via different metrics.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from time import time\n", - "from sklearn import metrics\n", - "from sklearn.pipeline import make_pipeline\n", - "from sklearn.preprocessing import StandardScaler\n", - "\n", - "\n", - "def bench_k_means(kmeans, name, data, labels):\n", - " \"\"\"Benchmark to evaluate the KMeans initialization methods.\n", - "\n", - " Parameters\n", - " ----------\n", - " kmeans : KMeans instance\n", - " A :class:`~sklearn.cluster.KMeans` instance with the initialization\n", - " already set.\n", - " name : str\n", - " Name given to the strategy. 
It will be used to show the results in a\n", - " table.\n", - " data : ndarray of shape (n_samples, n_features)\n", - " The data to cluster.\n", - " labels : ndarray of shape (n_samples,)\n", - " The labels used to compute the clustering metrics which requires some\n", - " supervision.\n", - " \"\"\"\n", - " t0 = time()\n", - " estimator = make_pipeline(StandardScaler(), kmeans).fit(data)\n", - " fit_time = time() - t0\n", - " results = [name, fit_time, estimator[-1].inertia_]\n", - "\n", - " # Define the metrics which require only the true labels and estimator\n", - " # labels\n", - " clustering_metrics = [\n", - " metrics.homogeneity_score,\n", - " metrics.completeness_score,\n", - " metrics.v_measure_score,\n", - " metrics.adjusted_rand_score,\n", - " metrics.adjusted_mutual_info_score,\n", - " ]\n", - " results += [m(labels, estimator[-1].labels_) for m in clustering_metrics]\n", - " \n", - " print('------Test------')\n", - " print('estimator[-1].labels_', estimator[-1].labels_)\n", - " print('------Test------')\n", - "\n", - " # The silhouette score requires the full dataset\n", - " results += [\n", - " metrics.silhouette_score(data, estimator[-1].labels_,\n", - " metric=\"euclidean\", sample_size=300,)\n", - " ]\n", - "\n", - " # Show the results\n", - " formatter_result = (\"{:9s}\\t{:.3f}s\\t{:.0f}\\t{:.3f}\\t{:.3f}\"\n", - " \"\\t{:.3f}\\t{:.3f}\\t{:.3f}\\t{:.3f}\")\n", - " print(formatter_result.format(*results))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Run the benchmark\n", - "\n", - "We will compare three approaches:\n", - "\n", - "* an initialization using `kmeans++`. This method is stochastic and we will\n", - " run the initialization 4 times;\n", - "* a random initialization. This method is stochastic as well and we will run\n", - " the initialization 4 times;\n", - "* an initialization based on a :class:`~sklearn.decomposition.PCA`\n", - " projection. 
Indeed, we will use the components of the\n", - " :class:`~sklearn.decomposition.PCA` to initialize KMeans. This method is\n", - " deterministic and a single initialization suffice.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.cluster import KMeans\n", - "from sklearn.decomposition import PCA\n", - "\n", - "print(82 * '_')\n", - "print('init\\t\\ttime\\tinertia\\thomo\\tcompl\\tv-meas\\tARI\\tAMI\\tsilhouette')\n", - "\n", - "kmeans = KMeans(init=\"k-means++\", n_clusters=n_digits, n_init=4,\n", - " random_state=0)\n", - "bench_k_means(kmeans=kmeans, name=\"k-means++\", data=data, labels=labels)\n", - "\n", - "kmeans = KMeans(init=\"random\", n_clusters=n_digits, n_init=4, random_state=0)\n", - "bench_k_means(kmeans=kmeans, name=\"random\", data=data, labels=labels)\n", - "\n", - "pca = PCA(n_components=n_digits).fit(data)\n", - "kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)\n", - "bench_k_means(kmeans=kmeans, name=\"PCA-based\", data=data, labels=labels)\n", - "\n", - "print(82 * '_')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Visualize the results on PCA-reduced data\n", - "\n", - ":class:`~sklearn.decomposition.PCA` allows to project the data from the\n", - "original 64-dimensional space into a lower dimensional space. Subsequently,\n", - "we can use :class:`~sklearn.decomposition.PCA` to project into a\n", - "2-dimensional space and plot the data and the clusters in this new space.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "reduced_data = PCA(n_components=2).fit_transform(data)\n", - "kmeans = KMeans(init=\"k-means++\", n_clusters=n_digits, n_init=4)\n", - "kmeans.fit(reduced_data)\n", - "\n", - "# Step size of the mesh. 
Decrease to increase the quality of the VQ.\n", - "h = .02 # point in the mesh [x_min, x_max]x[y_min, y_max].\n", - "\n", - "# Plot the decision boundary. For that, we will assign a color to each\n", - "x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1\n", - "y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1\n", - "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n", - "\n", - "# Obtain labels for each point in mesh. Use last trained model.\n", - "Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])\n", - "\n", - "# Put the result into a color plot\n", - "Z = Z.reshape(xx.shape)\n", - "plt.figure(1)\n", - "plt.clf()\n", - "plt.imshow(Z, interpolation=\"nearest\",\n", - " extent=(xx.min(), xx.max(), yy.min(), yy.max()),\n", - " cmap=plt.cm.Paired, aspect=\"auto\", origin=\"lower\")\n", - "\n", - "plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)\n", - "# Plot the centroids as a white X\n", - "centroids = kmeans.cluster_centers_\n", - "plt.scatter(centroids[:, 0], centroids[:, 1], marker=\"x\", s=169, linewidths=3,\n", - " color=\"w\", zorder=10)\n", - "plt.title(\"K-means clustering on the digits dataset (PCA-reduced data)\\n\"\n", - " \"Centroids are marked with white cross\")\n", - "plt.xlim(x_min, x_max)\n", - "plt.ylim(y_min, y_max)\n", - "plt.xticks(())\n", - "plt.yticks(())\n", - "plt.show()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb b/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb index 6a55dd8..fb50a58 100644 
--- a/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb +++ b/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "severe-married", + "id": "outer-institute", "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ { "cell_type": "code", "execution_count": null, - "id": "knowing-price", + "id": "precious-chassis", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "graphic-determination", + "id": "demonstrated-presentation", "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "breathing-description", + "id": "according-salem", "metadata": {}, "outputs": [], "source": [ @@ -66,17 +66,17 @@ { "cell_type": "code", "execution_count": null, - "id": "gorgeous-retailer", + "id": "dynamic-wallace", "metadata": {}, "outputs": [], "source": [ - "user = all_users[0]" + "user = all_users[6]" ] }, { "cell_type": "code", "execution_count": null, - "id": "homeless-father", + "id": "proved-vintage", "metadata": {}, "outputs": [], "source": [ @@ -87,7 +87,7 @@ { "cell_type": "code", "execution_count": null, - "id": "arranged-cleaning", + "id": "postal-lecture", "metadata": {}, "outputs": [], "source": [ @@ -98,27 +98,45 @@ { "cell_type": "code", "execution_count": null, - "id": "super-spencer", + "id": "governmental-chamber", "metadata": {}, "outputs": [], "source": [ - "len(non_empty_trips), non_empty_trips" + "len(non_empty_trips),non_empty_trips" ] }, { "cell_type": "code", "execution_count": null, - "id": "higher-advocate", + "id": "complicated-movie", "metadata": {}, "outputs": [], "source": [ - "bin_trips, bins = pipeline.remove_noise(non_empty_trips, radius)" + "# filter out trips that are not fully labeled(contain NaN in user_input)\n", + "valid_trips = [t for t in non_empty_trips if 'mode_confirm' in t[\"data\"][\"user_input\"] and \n", + " 
'purpose_confirm'in t[\"data\"][\"user_input\"] and 'replaced_mode' in t[\"data\"][\"user_input\"]]\n", + "len(valid_trips),valid_trips" ] }, { "cell_type": "code", "execution_count": null, - "id": "beautiful-parcel", + "id": "waiting-courage", + "metadata": {}, + "outputs": [], + "source": [ + "sim = similarity.similarity(valid_trips, radius)\n", + "filter_trips = sim.data\n", + "sim.bin_data()\n", + "sim.delete_bins()\n", + "bin_trips = sim.newdata\n", + "bins = sim.bins" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "organized-multiple", "metadata": {}, "outputs": [], "source": [ @@ -128,7 +146,7 @@ { "cell_type": "code", "execution_count": null, - "id": "comic-norfolk", + "id": "packed-redhead", "metadata": { "scrolled": true }, @@ -136,14 +154,14 @@ "source": [ "# show all user labels in all bins\n", "for bin in bins:\n", - " bin_user_input = (non_empty_trips[i].data[\"user_input\"] for i in bin)\n", + " bin_user_input = (filter_trips[i].data[\"user_input\"] for i in bin)\n", " bin_df = pd.DataFrame(data = bin_user_input)\n", " print(bin_df)" ] }, { "cell_type": "markdown", - "id": "expensive-forestry", + "id": "egyptian-sessions", "metadata": {}, "source": [ "### Original output" @@ -152,7 +170,7 @@ { "cell_type": "code", "execution_count": null, - "id": "corporate-situation", + "id": "proved-slave", "metadata": {}, "outputs": [], "source": [ @@ -163,8 +181,10 @@ { "cell_type": "code", "execution_count": null, - "id": "historic-russian", - "metadata": {}, + "id": "small-tattoo", + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ "# turn all user_input into list without binning\n", @@ -175,7 +195,7 @@ { "cell_type": "code", "execution_count": null, - "id": "jewish-bristol", + "id": "greek-management", "metadata": { "scrolled": true }, @@ -183,13 +203,13 @@ "source": [ "# drop duplicate user_input\n", "no_dup_df=bin_trips_df.drop_duplicates()\n", - "no_dup_df" + "no_dup_df,len(no_dup_df)" ] }, { "cell_type": "code", 
"execution_count": null, - "id": "wooden-postage", + "id": "discrete-secretary", "metadata": { "scrolled": true }, @@ -203,7 +223,7 @@ { "cell_type": "code", "execution_count": null, - "id": "wicked-serial", + "id": "economic-madison", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +238,7 @@ { "cell_type": "code", "execution_count": null, - "id": "surgical-stadium", + "id": "plain-lotus", "metadata": {}, "outputs": [], "source": [ @@ -232,7 +252,7 @@ }, { "cell_type": "markdown", - "id": "first-campbell", + "id": "saved-coral", "metadata": {}, "source": [ "Note: the trips order in labels_true and labels_pred should be the same. Using timestamp to compare the trips in bin_trips and those in bins" @@ -241,18 +261,20 @@ { "cell_type": "code", "execution_count": null, - "id": "dental-universal", + "id": "prescribed-former", "metadata": {}, "outputs": [], "source": [ - "bin_trips_ts = pd.DataFrame(data=[i[\"data\"][\"start_ts\"]for i in bin_trips])\n", - "len(bin_trips_ts)" + "bin_trips_ts = pd.DataFrame(data=[trip[\"data\"][\"start_ts\"]for trip in bin_trips])\n", + "bin_input = pd.DataFrame(data=[trip[\"data\"][\"user_input\"]for trip in bin_trips])\n", + "len(bin_trips_ts)\n", + "bin_input" ] }, { "cell_type": "code", "execution_count": null, - "id": "dynamic-prize", + "id": "supposed-style", "metadata": {}, "outputs": [], "source": [ @@ -260,14 +282,16 @@ "for bin in bins:\n", " for index in bin:\n", " bin_ls.append(index)\n", - "bins_ts = pd.DataFrame(data=[non_empty_trips[i][\"data\"][\"start_ts\"]for i in bin_ls])\n", - "len(bins_ts)" + "bins_ts = pd.DataFrame(data=[filter_trips[i][\"data\"][\"start_ts\"]for i in bin_ls])\n", + "bins_input = pd.DataFrame(data=[filter_trips[num][\"data\"][\"user_input\"]for num in bin_ls])\n", + "len(bins_ts)\n", + "bins_input" ] }, { "cell_type": "code", "execution_count": null, - "id": "republican-rabbit", + "id": "functioning-triangle", "metadata": {}, "outputs": [], "source": [ @@ -278,7 +302,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "concerned-congo", + "id": "distinguished-chapter", "metadata": {}, "outputs": [], "source": [ @@ -288,7 +312,7 @@ { "cell_type": "code", "execution_count": null, - "id": "alleged-alabama", + "id": "public-anime", "metadata": {}, "outputs": [], "source": [ @@ -298,7 +322,7 @@ { "cell_type": "code", "execution_count": null, - "id": "material-genetics", + "id": "single-watson", "metadata": { "scrolled": true }, @@ -309,7 +333,7 @@ }, { "cell_type": "markdown", - "id": "aware-friday", + "id": "chief-finish", "metadata": {}, "source": [ "### After changing language" @@ -318,7 +342,7 @@ { "cell_type": "code", "execution_count": null, - "id": "native-sight", + "id": "noticed-operations", "metadata": {}, "outputs": [], "source": [ @@ -331,13 +355,13 @@ { "cell_type": "code", "execution_count": null, - "id": "threaded-focus", + "id": "worse-print", "metadata": {}, "outputs": [], "source": [ "# use dict to replace the values in Spanish in the bin(this step just for showing the trips in each bin)\n", "for bin in bins:\n", - " bin_user_input = (non_empty_trips[i].data[\"user_input\"] for i in bin)\n", + " bin_user_input = (filter_trips[i].data[\"user_input\"] for i in bin)\n", " bin_df = pd.DataFrame(data = bin_user_input)\n", " sp2en_bin_df = bin_df.replace(span_eng_dict)\n", " print(sp2en_bin_df)" @@ -346,33 +370,34 @@ { "cell_type": "code", "execution_count": null, - "id": "imported-reasoning", + "id": "tutorial-graphic", "metadata": {}, "outputs": [], "source": [ "# turn all user_input into list without binning\n", - "bin_trips_user_input_sp2en_ls = pd.DataFrame(data=[i[\"data\"][\"user_input\"] for i in bin_trips]).replace(span_eng_dict).values.tolist()\n", - "bin_trips_user_input_sp2en_ls" + "bin_trips_sp2en_df = bin_trips_df.replace(span_eng_dict)\n", + "bin_trips_sp2en_ls = bin_trips_sp2en_df.values.tolist()\n", + "bin_trips_sp2en_ls" ] }, { "cell_type": "code", "execution_count": null, - "id": "athletic-reasoning", + "id": 
"verbal-makeup", "metadata": { - "scrolled": true + "scrolled": false }, "outputs": [], "source": [ "# drop duplicate user_input\n", - "no_dup_sp2en_df=pd.DataFrame(data=[i[\"data\"][\"user_input\"] for i in bin_trips]).replace(span_eng_dict).drop_duplicates()\n", + "no_dup_sp2en_df=bin_trips_sp2en_df.drop_duplicates()\n", "no_dup_sp2en_df" ] }, { "cell_type": "code", "execution_count": null, - "id": "false-fields", + "id": "forced-relaxation", "metadata": {}, "outputs": [], "source": [ @@ -384,13 +409,13 @@ { "cell_type": "code", "execution_count": null, - "id": "formal-belgium", + "id": "invisible-assist", "metadata": {}, "outputs": [], "source": [ "# collect labels_true based on user_input\n", "labels_true_sp2en =[]\n", - "for trip in bin_trips_user_input_sp2en_ls:\n", + "for trip in bin_trips_sp2en_ls:\n", " if trip in no_dup_sp2en_list:\n", " labels_true_sp2en.append(no_dup_sp2en_list.index(trip))\n", "labels_true_sp2en" @@ -399,7 +424,7 @@ { "cell_type": "code", "execution_count": null, - "id": "indirect-lafayette", + "id": "operating-myrtle", "metadata": {}, "outputs": [], "source": [ @@ -414,7 +439,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cooked-mineral", + "id": "dietary-clone", "metadata": {}, "outputs": [], "source": [ @@ -424,7 +449,7 @@ { "cell_type": "code", "execution_count": null, - "id": "caring-calcium", + "id": "breathing-garbage", "metadata": {}, "outputs": [], "source": [ @@ -434,7 +459,7 @@ { "cell_type": "code", "execution_count": null, - "id": "sunset-frequency", + "id": "fabulous-thanksgiving", "metadata": {}, "outputs": [], "source": [ @@ -443,7 +468,7 @@ }, { "cell_type": "markdown", - "id": "included-alberta", + "id": "comparable-student", "metadata": {}, "source": [ "### After converting purposes and mode" @@ -452,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "outer-hammer", + "id": "average-contrary", "metadata": {}, "outputs": [], "source": [ @@ -463,7 +488,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "developing-socket", + "id": "collect-eating", "metadata": {}, "outputs": [], "source": [ @@ -473,13 +498,12 @@ { "cell_type": "code", "execution_count": null, - "id": "partial-prerequisite", + "id": "valued-requirement", "metadata": {}, "outputs": [], "source": [ "# convert purpose\n", - "bin_trips_user_input_sp2en = pd.DataFrame(data=[bin_trips[i][\"data\"][\"user_input\"] for i in range(len(bin_trips))]).replace(span_eng_dict)\n", - "bin_trips_cvt_pur_df = bin_trips_user_input_sp2en.replace(map_pur_dict)\n", + "bin_trips_cvt_pur_df = bin_trips_sp2en_df.replace(map_pur_dict)\n", "# convert mode\n", "bin_trips_cvt_pur_mo_df = bin_trips_cvt_pur_df\n", "for i in range(len(bin_trips_cvt_pur_mo_df)):\n", @@ -494,7 +518,7 @@ { "cell_type": "code", "execution_count": null, - "id": "classical-berkeley", + "id": "intimate-microphone", "metadata": { "scrolled": true }, @@ -508,7 +532,7 @@ { "cell_type": "code", "execution_count": null, - "id": "opening-equity", + "id": "naked-turkish", "metadata": {}, "outputs": [], "source": [ @@ -520,7 +544,7 @@ { "cell_type": "code", "execution_count": null, - "id": "occasional-evanescence", + "id": "governing-treat", "metadata": {}, "outputs": [], "source": [ @@ -535,7 +559,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fancy-barbados", + "id": "outstanding-credit", "metadata": {}, "outputs": [], "source": [ @@ -550,7 +574,7 @@ { "cell_type": "code", "execution_count": null, - "id": "placed-carry", + "id": "bearing-generator", "metadata": {}, "outputs": [], "source": [ @@ -560,7 +584,7 @@ { "cell_type": "code", "execution_count": null, - "id": "worse-shift", + "id": "periodic-barrier", "metadata": {}, "outputs": [], "source": [ @@ -570,7 +594,7 @@ { "cell_type": "code", "execution_count": null, - "id": "immediate-series", + "id": "sixth-nowhere", "metadata": {}, "outputs": [], "source": [ From ed7ac2c9e047018a567824d2e899a84319289861 Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Tue, 
23 Feb 2021 17:25:04 -0800 Subject: [PATCH 07/16] clean up evaluation code --- .../confirmed_trips_eval_bins_clusters.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tour_model_eval/confirmed_trips_eval_bins_clusters.py b/tour_model_eval/confirmed_trips_eval_bins_clusters.py index 1bcc370..4f9652d 100644 --- a/tour_model_eval/confirmed_trips_eval_bins_clusters.py +++ b/tour_model_eval/confirmed_trips_eval_bins_clusters.py @@ -23,6 +23,16 @@ 'insurance_payment':'insurance'} +def filter_data(user,radius): + trips = pipeline.read_data(uuid=user, key=esda.CONFIRMED_TRIP_KEY) + non_empty_trips = [t for t in trips if t["data"]["user_input"] != {}] + valid_trips = [t for t in non_empty_trips if 'mode_confirm' in t["data"]["user_input"] and + 'purpose_confirm' in t["data"]["user_input"] and 'replaced_mode' in t["data"]["user_input"]] + sim = similarity.similarity(valid_trips, radius) + filter_trips = sim.data + return filter_trips,sim + + # v_measure_bins takes 5 parameters # - sp2en=True: change Spanish to English # - cvt_pur_mo=True: convert purposes and replaced mode @@ -35,12 +45,7 @@ def v_measure_bins(all_users,radius,sp2en=None,cvt_pur_mo=None,cutoff=None): v_score = [] for i in range(len(all_users)): user = all_users[i] - trips = pipeline.read_data(uuid=user, key=esda.CONFIRMED_TRIP_KEY) - non_empty_trips = [t for t in trips if t["data"]["user_input"] != {}] - valid_trips = [t for t in non_empty_trips if 'mode_confirm' in t["data"]["user_input"] and - 'purpose_confirm' in t["data"]["user_input"] and 'replaced_mode' in t["data"]["user_input"]] - sim = similarity.similarity(valid_trips, radius) - filter_trips = sim.data + filter_trips,sim = filter_data(user,radius) # filter out users that haven't enough trips (at least 10) to analyze if len(filter_trips) < 10: @@ -129,12 +134,8 @@ def v_measure_clusters(all_users,radius,sp2en=None,cvt_pur_mo=None): v_score = [] for i in range(len(all_users)): user = all_users[i] - trips = 
pipeline.read_data(uuid=user, key=esda.CONFIRMED_TRIP_KEY) - non_empty_trips = [t for t in trips if t["data"]["user_input"] != {}] - valid_trips = [t for t in non_empty_trips if 'mode_confirm' in t["data"]["user_input"] and - 'purpose_confirm' in t["data"]["user_input"] and 'replaced_mode' in t["data"]["user_input"]] - sim = similarity.similarity(valid_trips, radius) - filter_trips = sim.data + filter_trips,sim = filter_data(user,radius) + # filter out users that haven't enough trips (at least 10) to analyze if len(filter_trips) < 10: homo_score.append(NaN) From 5aa469f2df7545c113b39add1ab82731146fd56a Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Tue, 23 Feb 2021 23:19:21 -0800 Subject: [PATCH 08/16] change radius, add query times notebook --- tour_model_eval/query_times_all_users.ipynb | 265 ++++++++++++++++++ .../v-measurel_all_bins_single_user.ipynb | 100 +++---- .../v-measurel_bins_all_user.ipynb | 58 ++-- ...urel_clusters_above_cutoff_all_users.ipynb | 34 +-- .../v-measurel_cutoff_bins_single_user.ipynb | 98 +++---- ...measurel_cutoff_clusters_single_user.ipynb | 28 +- 6 files changed, 420 insertions(+), 163 deletions(-) create mode 100644 tour_model_eval/query_times_all_users.ipynb diff --git a/tour_model_eval/query_times_all_users.ipynb b/tour_model_eval/query_times_all_users.ipynb new file mode 100644 index 0000000..8877cfe --- /dev/null +++ b/tour_model_eval/query_times_all_users.ipynb @@ -0,0 +1,265 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "expanded-implement", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "# Our imports\n", + "import emission.core.get_database as edb\n", + "import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline\n", + "import emission.analysis.modelling.tour_model.similarity as similarity\n", + "import emission.analysis.modelling.tour_model.featurization as featurization\n", + "import emission.analysis.modelling.tour_model.representatives 
as representatives\n", + "import emission.storage.decorations.analysis_timeseries_queries as esda\n", + "import pandas as pd\n", + "from numpy import *\n", + "import confirmed_trips_eval_bins_clusters as evaluation\n", + "from sklearn import metrics\n", + "from pandas.testing import assert_frame_equal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "functional-birthday", + "metadata": {}, + "outputs": [], + "source": [ + "# logger = logging.getLogger()\n", + "# logger.setLevel(logging.DEBUG)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "turned-watch", + "metadata": {}, + "outputs": [], + "source": [ + "participant_uuid_obj = list(edb.get_profile_db().find({\"install_group\": \"participant\"}, {\"user_id\": 1, \"_id\": 0}))\n", + "all_users = [u[\"user_id\"] for u in participant_uuid_obj]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "encouraging-killer", + "metadata": {}, + "outputs": [], + "source": [ + "radius = 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "tender-volleyball", + "metadata": {}, + "outputs": [], + "source": [ + "query_day = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "perfect-promotion", + "metadata": {}, + "outputs": [], + "source": [ + "query_month = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ordered-juvenile", + "metadata": {}, + "outputs": [], + "source": [ + "def match_day(trip,bin):\n", + " if bin:\n", + " t = filter_trips[bin[0]]\n", + " if trip['data']['start_local_dt']['year']==t['data']['start_local_dt']['year']and trip['data']['start_local_dt']['month']==t['data']['start_local_dt']['month']and trip['data']['start_local_dt']['day']==t['data']['start_local_dt']['day']:\n", + " return True\n", + " return False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "raising-brisbane", + "metadata": {}, + "outputs": [], + "source": [ + "def 
match_month(trip,bin):\n", + " if bin:\n", + " t = filter_trips[bin[0]]\n", + " if trip['data']['start_local_dt']['year']==t['data']['start_local_dt']['year']and trip['data']['start_local_dt']['month']==t['data']['start_local_dt']['month']:\n", + " return True\n", + " return False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "comic-apache", + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(len(all_users)):\n", + " user = all_users[i]\n", + " filter_trips,sim = evaluation.filter_data(user,radius)\n", + " logging.debug(\"len(filter_trips)is %s \"% len(filter_trips))\n", + "\n", + " # filter out users that don't have valid labeled trips\n", + " if len(filter_trips) == 0:\n", + " query_day.append(NaN)\n", + " query_month.append(NaN)\n", + " continue\n", + " \n", + " sim.bin_data()\n", + " sim.delete_bins()\n", + " # bins below cutoff\n", + " bl_bins = sim.below_cutoff\n", + " \n", + " # a list of trip indices from bl_bins\n", + " bl_trip_ls = []\n", + " for bin in bl_bins:\n", + " for index in bin:\n", + " bl_trip_ls.append(index)\n", + "\n", + " # collect all the trips that below cutoff\n", + " bl_trips = [filter_trips[num]for num in bl_trip_ls]\n", + " \n", + " \n", + " # collect query times in a day\n", + " bin_day = []\n", + " for trip_index in bl_trip_ls:\n", + " added = False\n", + " trip = filter_trips[trip_index]\n", + " for bin in bin_day:\n", + " if match_day(trip,bin):\n", + " bin.append(trip_index)\n", + " added = True\n", + " if not added:\n", + " bin_day.append([trip_index])\n", + " query_day_ls = []\n", + " for bin in bin_day:\n", + " query_day_ls.append(len(bin))\n", + "\n", + " # average query times for a day\n", + " query_day.append(math.ceil(mean(query_day_ls)))\n", + " \n", + " \n", + " # collect query times in a month\n", + " bin_month = []\n", + " for trip_index in bl_trip_ls:\n", + " added = False\n", + " trip = filter_trips[trip_index]\n", + " for bin in bin_month:\n", + " if match_month(trip,bin):\n", 
+ " bin.append(trip_index)\n", + " added = True\n", + " if not added:\n", + " bin_month.append([trip_index])\n", + " query_month_ls = []\n", + " for bin in bin_month:\n", + " query_month_ls.append(len(bin))\n", + "\n", + " # average query times for a month\n", + " query_month.append(math.ceil(mean(query_month_ls)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "alternative-dialogue", + "metadata": {}, + "outputs": [], + "source": [ + "query_day" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "tracked-pattern", + "metadata": {}, + "outputs": [], + "source": [ + "query_month" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "instant-somerset", + "metadata": {}, + "outputs": [], + "source": [ + "mean_day = math.ceil(mean([x for x in query_day if str(x) != 'nan']))\n", + "mean_day" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "documented-municipality", + "metadata": {}, + "outputs": [], + "source": [ + "mean_month = math.ceil(mean([x for x in query_month if str(x) != 'nan']))\n", + "mean_month" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "burning-elephant", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(data = {'query times in a day':query_day,'query times in a month':query_month})\n", + "df.loc['mean'] = [mean_day,mean_month]\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "incoming-gibson", + "metadata": {}, + "outputs": [], + "source": [ + "df.plot(kind = 'bar')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} 
diff --git a/tour_model_eval/v-measurel_all_bins_single_user.ipynb b/tour_model_eval/v-measurel_all_bins_single_user.ipynb index 4eb2013..cf2aa3a 100644 --- a/tour_model_eval/v-measurel_all_bins_single_user.ipynb +++ b/tour_model_eval/v-measurel_all_bins_single_user.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "modified-diary", + "id": "republican-pleasure", "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ { "cell_type": "code", "execution_count": null, - "id": "together-twenty", + "id": "decent-passion", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "sorted-lloyd", + "id": "preliminary-example", "metadata": {}, "outputs": [], "source": [ @@ -48,11 +48,11 @@ { "cell_type": "code", "execution_count": null, - "id": "missing-psychology", + "id": "executive-heather", "metadata": {}, "outputs": [], "source": [ - "radius = 300" + "radius = 100" ] }, { @@ -66,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "august-valley", + "id": "genuine-shipping", "metadata": {}, "outputs": [], "source": [ @@ -76,7 +76,7 @@ { "cell_type": "code", "execution_count": null, - "id": "diverse-running", + "id": "sorted-juvenile", "metadata": {}, "outputs": [], "source": [ @@ -87,7 +87,7 @@ { "cell_type": "code", "execution_count": null, - "id": "crude-postage", + "id": "primary-friendly", "metadata": {}, "outputs": [], "source": [ @@ -99,7 +99,7 @@ { "cell_type": "code", "execution_count": null, - "id": "copyrighted-delta", + "id": "medical-spider", "metadata": {}, "outputs": [], "source": [ @@ -112,7 +112,7 @@ { "cell_type": "code", "execution_count": null, - "id": "competent-australia", + "id": "latest-reconstruction", "metadata": {}, "outputs": [], "source": [ @@ -123,7 +123,7 @@ { "cell_type": "code", "execution_count": null, - "id": "standing-serial", + "id": "stone-omega", "metadata": {}, "outputs": [], "source": [ @@ -133,7 +133,7 @@ { "cell_type": 
"code", "execution_count": null, - "id": "annoying-antique", + "id": "individual-insert", "metadata": {}, "outputs": [], "source": [ @@ -152,7 +152,7 @@ { "cell_type": "code", "execution_count": null, - "id": "latest-contemporary", + "id": "continued-borough", "metadata": {}, "outputs": [], "source": [ @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "thrown-hello", + "id": "outside-fairy", "metadata": {}, "outputs": [], "source": [ @@ -175,7 +175,7 @@ }, { "cell_type": "markdown", - "id": "strange-inventory", + "id": "extraordinary-penalty", "metadata": {}, "source": [ "### Original output" @@ -184,7 +184,7 @@ { "cell_type": "code", "execution_count": null, - "id": "streaming-taiwan", + "id": "advised-wiring", "metadata": {}, "outputs": [], "source": [ @@ -195,7 +195,7 @@ { "cell_type": "code", "execution_count": null, - "id": "destroyed-filter", + "id": "careful-sally", "metadata": {}, "outputs": [], "source": [ @@ -207,7 +207,7 @@ { "cell_type": "code", "execution_count": null, - "id": "mature-italy", + "id": "elect-hardwood", "metadata": {}, "outputs": [], "source": [ @@ -219,7 +219,7 @@ { "cell_type": "code", "execution_count": null, - "id": "illegal-collect", + "id": "dynamic-grace", "metadata": {}, "outputs": [], "source": [ @@ -231,7 +231,7 @@ { "cell_type": "code", "execution_count": null, - "id": "sapphire-hughes", + "id": "romance-chrome", "metadata": {}, "outputs": [], "source": [ @@ -241,7 +241,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fuzzy-marathon", + "id": "further-moscow", "metadata": {}, "outputs": [], "source": [ @@ -256,7 +256,7 @@ { "cell_type": "code", "execution_count": null, - "id": "considerable-restoration", + "id": "welsh-trustee", "metadata": {}, "outputs": [], "source": [ @@ -271,7 +271,7 @@ { "cell_type": "code", "execution_count": null, - "id": "domestic-august", + "id": "minimal-fever", "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "married-catholic", + "id": "corporate-missile", "metadata": {}, "outputs": [], "source": [ @@ -297,7 +297,7 @@ { "cell_type": "code", "execution_count": null, - "id": "requested-organic", + "id": "radio-meter", "metadata": {}, "outputs": [], "source": [ @@ -308,7 +308,7 @@ { "cell_type": "code", "execution_count": null, - "id": "economic-nitrogen", + "id": "fancy-mount", "metadata": {}, "outputs": [], "source": [ @@ -318,7 +318,7 @@ { "cell_type": "code", "execution_count": null, - "id": "postal-trademark", + "id": "brown-traveler", "metadata": {}, "outputs": [], "source": [ @@ -328,7 +328,7 @@ { "cell_type": "code", "execution_count": null, - "id": "southern-evidence", + "id": "chemical-rider", "metadata": {}, "outputs": [], "source": [ @@ -337,7 +337,7 @@ }, { "cell_type": "markdown", - "id": "domestic-regression", + "id": "excessive-spell", "metadata": {}, "source": [ "### After changing language" @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "amazing-jersey", + "id": "christian-legislature", "metadata": {}, "outputs": [], "source": [ @@ -359,7 +359,7 @@ { "cell_type": "code", "execution_count": null, - "id": "impressive-philadelphia", + "id": "ready-delay", "metadata": {}, "outputs": [], "source": [ @@ -374,7 +374,7 @@ { "cell_type": "code", "execution_count": null, - "id": "hybrid-eugene", + "id": "blind-router", "metadata": {}, "outputs": [], "source": [ @@ -387,7 +387,7 @@ { "cell_type": "code", "execution_count": null, - "id": "viral-intermediate", + "id": "suited-institution", "metadata": { "scrolled": true }, @@ -401,7 +401,7 @@ { "cell_type": "code", "execution_count": null, - "id": "conceptual-branch", + "id": "incomplete-confusion", "metadata": {}, "outputs": [], "source": [ @@ -413,7 +413,7 @@ { "cell_type": "code", "execution_count": null, - "id": "wanted-management", + "id": "ahead-hawaiian", "metadata": {}, "outputs": [], "source": [ @@ -428,7 +428,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "otherwise-vault", + "id": "collectible-australia", "metadata": {}, "outputs": [], "source": [ @@ -443,7 +443,7 @@ { "cell_type": "code", "execution_count": null, - "id": "adolescent-style", + "id": "changed-pollution", "metadata": {}, "outputs": [], "source": [ @@ -453,7 +453,7 @@ { "cell_type": "code", "execution_count": null, - "id": "signal-backing", + "id": "judicial-printer", "metadata": {}, "outputs": [], "source": [ @@ -463,7 +463,7 @@ { "cell_type": "code", "execution_count": null, - "id": "upper-contribution", + "id": "female-confidentiality", "metadata": {}, "outputs": [], "source": [ @@ -472,7 +472,7 @@ }, { "cell_type": "markdown", - "id": "bearing-communications", + "id": "foster-reading", "metadata": {}, "source": [ "### After converting purposes and mode" @@ -481,7 +481,7 @@ { "cell_type": "code", "execution_count": null, - "id": "completed-testing", + "id": "opposite-aurora", "metadata": {}, "outputs": [], "source": [ @@ -492,7 +492,7 @@ { "cell_type": "code", "execution_count": null, - "id": "quality-board", + "id": "western-commissioner", "metadata": {}, "outputs": [], "source": [ @@ -502,7 +502,7 @@ { "cell_type": "code", "execution_count": null, - "id": "falling-arbor", + "id": "vocal-saudi", "metadata": {}, "outputs": [], "source": [ @@ -522,7 +522,7 @@ { "cell_type": "code", "execution_count": null, - "id": "electrical-liechtenstein", + "id": "recreational-label", "metadata": { "scrolled": true }, @@ -536,7 +536,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cleared-valley", + "id": "adequate-notice", "metadata": {}, "outputs": [], "source": [ @@ -548,7 +548,7 @@ { "cell_type": "code", "execution_count": null, - "id": "printable-fusion", + "id": "stuck-survivor", "metadata": {}, "outputs": [], "source": [ @@ -563,7 +563,7 @@ { "cell_type": "code", "execution_count": null, - "id": "hundred-zealand", + "id": "floppy-flight", "metadata": {}, "outputs": [], "source": [ @@ -578,7 +578,7 @@ { 
"cell_type": "code", "execution_count": null, - "id": "false-lithuania", + "id": "broken-demand", "metadata": {}, "outputs": [], "source": [ @@ -588,7 +588,7 @@ { "cell_type": "code", "execution_count": null, - "id": "documentary-power", + "id": "nasty-potato", "metadata": {}, "outputs": [], "source": [ @@ -598,7 +598,7 @@ { "cell_type": "code", "execution_count": null, - "id": "elect-animation", + "id": "regular-glance", "metadata": {}, "outputs": [], "source": [ diff --git a/tour_model_eval/v-measurel_bins_all_user.ipynb b/tour_model_eval/v-measurel_bins_all_user.ipynb index 12a2d49..a4af2bd 100644 --- a/tour_model_eval/v-measurel_bins_all_user.ipynb +++ b/tour_model_eval/v-measurel_bins_all_user.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "computational-national", + "id": "hungry-polish", "metadata": { "scrolled": true }, @@ -28,7 +28,7 @@ { "cell_type": "code", "execution_count": null, - "id": "preceding-yugoslavia", + "id": "coupled-transportation", "metadata": {}, "outputs": [], "source": [ @@ -39,17 +39,17 @@ { "cell_type": "code", "execution_count": null, - "id": "perceived-impossible", + "id": "interstate-queue", "metadata": {}, "outputs": [], "source": [ - "radius = 300" + "radius = 100" ] }, { "cell_type": "code", "execution_count": null, - "id": "smooth-variety", + "id": "norwegian-victoria", "metadata": {}, "outputs": [], "source": [ @@ -59,7 +59,7 @@ }, { "cell_type": "markdown", - "id": "medical-biodiversity", + "id": "loved-estonia", "metadata": {}, "source": [ "## Bins above cutoff" @@ -67,7 +67,7 @@ }, { "cell_type": "markdown", - "id": "corporate-heather", + "id": "laughing-macintosh", "metadata": {}, "source": [ "### Original user input" @@ -76,7 +76,7 @@ { "cell_type": "code", "execution_count": null, - "id": "chief-fundamentals", + "id": "amber-photograph", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +86,7 @@ { "cell_type": "code", "execution_count": null, - "id": "educated-philippines", + "id": 
"collected-throat", "metadata": {}, "outputs": [], "source": [ @@ -95,7 +95,7 @@ }, { "cell_type": "markdown", - "id": "welcome-homeless", + "id": "headed-dating", "metadata": {}, "source": [ "### After changing language" @@ -104,7 +104,7 @@ { "cell_type": "code", "execution_count": null, - "id": "preceding-money", + "id": "blank-arnold", "metadata": { "scrolled": true }, @@ -116,7 +116,7 @@ { "cell_type": "code", "execution_count": null, - "id": "laden-plate", + "id": "maritime-twins", "metadata": {}, "outputs": [], "source": [ @@ -125,7 +125,7 @@ }, { "cell_type": "markdown", - "id": "sticky-denver", + "id": "consistent-kuwait", "metadata": {}, "source": [ "### After converting purposes and replaced mode" @@ -134,7 +134,7 @@ { "cell_type": "code", "execution_count": null, - "id": "agreed-moment", + "id": "confused-sensitivity", "metadata": {}, "outputs": [], "source": [ @@ -144,7 +144,7 @@ { "cell_type": "code", "execution_count": null, - "id": "handmade-dairy", + "id": "advance-passing", "metadata": {}, "outputs": [], "source": [ @@ -153,7 +153,7 @@ }, { "cell_type": "markdown", - "id": "funky-potato", + "id": "bronze-operations", "metadata": {}, "source": [ "### DataFrame" @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "technological-reservation", + "id": "pacific-reflection", "metadata": {}, "outputs": [], "source": [ @@ -176,7 +176,7 @@ }, { "cell_type": "markdown", - "id": "strange-badge", + "id": "documented-vacation", "metadata": {}, "source": [ "## All bins" @@ -184,7 +184,7 @@ }, { "cell_type": "markdown", - "id": "rural-virtue", + "id": "selected-compiler", "metadata": {}, "source": [ "### Original user input" @@ -193,7 +193,7 @@ { "cell_type": "code", "execution_count": null, - "id": "listed-entrance", + "id": "tight-contract", "metadata": {}, "outputs": [], "source": [ @@ -203,7 +203,7 @@ { "cell_type": "code", "execution_count": null, - "id": "known-calculation", + "id": "regular-shadow", "metadata": {}, "outputs": [], 
"source": [ @@ -212,7 +212,7 @@ }, { "cell_type": "markdown", - "id": "broadband-expression", + "id": "brutal-trinidad", "metadata": {}, "source": [ "### After changing language" @@ -221,7 +221,7 @@ { "cell_type": "code", "execution_count": null, - "id": "alone-authorization", + "id": "decent-machinery", "metadata": {}, "outputs": [], "source": [ @@ -231,7 +231,7 @@ { "cell_type": "code", "execution_count": null, - "id": "viral-rebel", + "id": "lasting-synthetic", "metadata": {}, "outputs": [], "source": [ @@ -240,7 +240,7 @@ }, { "cell_type": "markdown", - "id": "infectious-armstrong", + "id": "sexual-opportunity", "metadata": {}, "source": [ "### After converting purposes and replaced mode" @@ -249,7 +249,7 @@ { "cell_type": "code", "execution_count": null, - "id": "supreme-plain", + "id": "periodic-freedom", "metadata": {}, "outputs": [], "source": [ @@ -259,7 +259,7 @@ { "cell_type": "code", "execution_count": null, - "id": "corresponding-blind", + "id": "stylish-thailand", "metadata": {}, "outputs": [], "source": [ @@ -268,7 +268,7 @@ }, { "cell_type": "markdown", - "id": "aquatic-password", + "id": "floppy-theory", "metadata": {}, "source": [ "### DataFrame" @@ -277,7 +277,7 @@ { "cell_type": "code", "execution_count": null, - "id": "instant-broadcast", + "id": "british-working", "metadata": {}, "outputs": [], "source": [ diff --git a/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb b/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb index f653b5a..67a0f56 100644 --- a/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb +++ b/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "important-humanity", + "id": "adverse-recipient", "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ { "cell_type": "code", "execution_count": null, - "id": "physical-improvement", + "id": "pleased-fence", "metadata": {}, "outputs": [], "source": [ 
@@ -37,17 +37,17 @@ { "cell_type": "code", "execution_count": null, - "id": "figured-buddy", + "id": "global-leave", "metadata": {}, "outputs": [], "source": [ - "radius = 300" + "radius = 100" ] }, { "cell_type": "code", "execution_count": null, - "id": "equal-release", + "id": "hundred-surge", "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ }, { "cell_type": "markdown", - "id": "standard-savannah", + "id": "funky-darkness", "metadata": {}, "source": [ "## Evaluate clusters above cutoff based on silhouette_score" @@ -65,7 +65,7 @@ }, { "cell_type": "markdown", - "id": "formed-guitar", + "id": "critical-prime", "metadata": {}, "source": [ "### Original user input" @@ -74,7 +74,7 @@ { "cell_type": "code", "execution_count": null, - "id": "official-victor", + "id": "copyrighted-vehicle", "metadata": {}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ { "cell_type": "code", "execution_count": null, - "id": "loved-isolation", + "id": "stainless-exposure", "metadata": {}, "outputs": [], "source": [ @@ -93,7 +93,7 @@ }, { "cell_type": "markdown", - "id": "demanding-rendering", + "id": "returning-skiing", "metadata": {}, "source": [ "### After changing language" @@ -102,7 +102,7 @@ { "cell_type": "code", "execution_count": null, - "id": "satisfied-enough", + "id": "authorized-complement", "metadata": {}, "outputs": [], "source": [ @@ -112,7 +112,7 @@ { "cell_type": "code", "execution_count": null, - "id": "hundred-pasta", + "id": "hispanic-drunk", "metadata": {}, "outputs": [], "source": [ @@ -121,7 +121,7 @@ }, { "cell_type": "markdown", - "id": "hungarian-wallpaper", + "id": "united-grade", "metadata": {}, "source": [ "### After converting purposes and replaced mode" @@ -130,7 +130,7 @@ { "cell_type": "code", "execution_count": null, - "id": "particular-scope", + "id": "outer-usage", "metadata": {}, "outputs": [], "source": [ @@ -140,7 +140,7 @@ { "cell_type": "code", "execution_count": null, - "id": "neither-shooting", + "id": "positive-grass", "metadata": 
{}, "outputs": [], "source": [ @@ -149,7 +149,7 @@ }, { "cell_type": "markdown", - "id": "continuing-absorption", + "id": "assured-oriental", "metadata": {}, "source": [ "### DataFrame" @@ -158,7 +158,7 @@ { "cell_type": "code", "execution_count": null, - "id": "retained-citizenship", + "id": "authentic-compression", "metadata": {}, "outputs": [], "source": [ diff --git a/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb b/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb index fb50a58..271ef9f 100644 --- a/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb +++ b/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "outer-institute", + "id": "primary-friendship", "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ { "cell_type": "code", "execution_count": null, - "id": "precious-chassis", + "id": "composed-sewing", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "demonstrated-presentation", + "id": "diagnostic-stand", "metadata": {}, "outputs": [], "source": [ @@ -48,11 +48,11 @@ { "cell_type": "code", "execution_count": null, - "id": "according-salem", + "id": "unexpected-mineral", "metadata": {}, "outputs": [], "source": [ - "radius = 300" + "radius = 100" ] }, { @@ -66,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dynamic-wallace", + "id": "stock-facing", "metadata": {}, "outputs": [], "source": [ @@ -76,7 +76,7 @@ { "cell_type": "code", "execution_count": null, - "id": "proved-vintage", + "id": "genetic-liberty", "metadata": {}, "outputs": [], "source": [ @@ -87,7 +87,7 @@ { "cell_type": "code", "execution_count": null, - "id": "postal-lecture", + "id": "patient-spirit", "metadata": {}, "outputs": [], "source": [ @@ -98,7 +98,7 @@ { "cell_type": "code", "execution_count": null, - "id": "governmental-chamber", + "id": "obvious-imaging", "metadata": {}, "outputs": [], 
"source": [ @@ -108,7 +108,7 @@ { "cell_type": "code", "execution_count": null, - "id": "complicated-movie", + "id": "sufficient-baseball", "metadata": {}, "outputs": [], "source": [ @@ -121,7 +121,7 @@ { "cell_type": "code", "execution_count": null, - "id": "waiting-courage", + "id": "inclusive-theory", "metadata": {}, "outputs": [], "source": [ @@ -136,7 +136,7 @@ { "cell_type": "code", "execution_count": null, - "id": "organized-multiple", + "id": "sticky-indianapolis", "metadata": {}, "outputs": [], "source": [ @@ -146,7 +146,7 @@ { "cell_type": "code", "execution_count": null, - "id": "packed-redhead", + "id": "recreational-diesel", "metadata": { "scrolled": true }, @@ -161,7 +161,7 @@ }, { "cell_type": "markdown", - "id": "egyptian-sessions", + "id": "complete-feeling", "metadata": {}, "source": [ "### Original output" @@ -170,7 +170,7 @@ { "cell_type": "code", "execution_count": null, - "id": "proved-slave", + "id": "innocent-circumstances", "metadata": {}, "outputs": [], "source": [ @@ -181,7 +181,7 @@ { "cell_type": "code", "execution_count": null, - "id": "small-tattoo", + "id": "lightweight-connectivity", "metadata": { "scrolled": false }, @@ -195,7 +195,7 @@ { "cell_type": "code", "execution_count": null, - "id": "greek-management", + "id": "ceramic-sense", "metadata": { "scrolled": true }, @@ -209,7 +209,7 @@ { "cell_type": "code", "execution_count": null, - "id": "discrete-secretary", + "id": "intensive-rings", "metadata": { "scrolled": true }, @@ -223,7 +223,7 @@ { "cell_type": "code", "execution_count": null, - "id": "economic-madison", + "id": "little-cuisine", "metadata": {}, "outputs": [], "source": [ @@ -238,7 +238,7 @@ { "cell_type": "code", "execution_count": null, - "id": "plain-lotus", + "id": "absent-emerald", "metadata": {}, "outputs": [], "source": [ @@ -252,7 +252,7 @@ }, { "cell_type": "markdown", - "id": "saved-coral", + "id": "agricultural-welcome", "metadata": {}, "source": [ "Note: the trips order in labels_true and labels_pred 
should be the same. Using timestamp to compare the trips in bin_trips and those in bins" @@ -261,7 +261,7 @@ { "cell_type": "code", "execution_count": null, - "id": "prescribed-former", + "id": "appropriate-convert", "metadata": {}, "outputs": [], "source": [ @@ -274,7 +274,7 @@ { "cell_type": "code", "execution_count": null, - "id": "supposed-style", + "id": "billion-yield", "metadata": {}, "outputs": [], "source": [ @@ -291,7 +291,7 @@ { "cell_type": "code", "execution_count": null, - "id": "functioning-triangle", + "id": "subjective-victim", "metadata": {}, "outputs": [], "source": [ @@ -302,7 +302,7 @@ { "cell_type": "code", "execution_count": null, - "id": "distinguished-chapter", + "id": "numeric-worth", "metadata": {}, "outputs": [], "source": [ @@ -312,7 +312,7 @@ { "cell_type": "code", "execution_count": null, - "id": "public-anime", + "id": "tired-citation", "metadata": {}, "outputs": [], "source": [ @@ -322,7 +322,7 @@ { "cell_type": "code", "execution_count": null, - "id": "single-watson", + "id": "romance-investment", "metadata": { "scrolled": true }, @@ -333,7 +333,7 @@ }, { "cell_type": "markdown", - "id": "chief-finish", + "id": "increasing-baghdad", "metadata": {}, "source": [ "### After changing language" @@ -342,7 +342,7 @@ { "cell_type": "code", "execution_count": null, - "id": "noticed-operations", + "id": "hidden-source", "metadata": {}, "outputs": [], "source": [ @@ -355,7 +355,7 @@ { "cell_type": "code", "execution_count": null, - "id": "worse-print", + "id": "reduced-hebrew", "metadata": {}, "outputs": [], "source": [ @@ -370,7 +370,7 @@ { "cell_type": "code", "execution_count": null, - "id": "tutorial-graphic", + "id": "minor-reason", "metadata": {}, "outputs": [], "source": [ @@ -383,7 +383,7 @@ { "cell_type": "code", "execution_count": null, - "id": "verbal-makeup", + "id": "acknowledged-spanish", "metadata": { "scrolled": false }, @@ -397,7 +397,7 @@ { "cell_type": "code", "execution_count": null, - "id": "forced-relaxation", + "id": 
"competent-audience", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "invisible-assist", + "id": "egyptian-request", "metadata": {}, "outputs": [], "source": [ @@ -424,7 +424,7 @@ { "cell_type": "code", "execution_count": null, - "id": "operating-myrtle", + "id": "communist-tomorrow", "metadata": {}, "outputs": [], "source": [ @@ -439,7 +439,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dietary-clone", + "id": "beneficial-freeze", "metadata": {}, "outputs": [], "source": [ @@ -449,7 +449,7 @@ { "cell_type": "code", "execution_count": null, - "id": "breathing-garbage", + "id": "corrected-processing", "metadata": {}, "outputs": [], "source": [ @@ -459,7 +459,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fabulous-thanksgiving", + "id": "overall-terrace", "metadata": {}, "outputs": [], "source": [ @@ -468,7 +468,7 @@ }, { "cell_type": "markdown", - "id": "comparable-student", + "id": "collectible-heater", "metadata": {}, "source": [ "### After converting purposes and mode" @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "average-contrary", + "id": "pacific-newport", "metadata": {}, "outputs": [], "source": [ @@ -488,7 +488,7 @@ { "cell_type": "code", "execution_count": null, - "id": "collect-eating", + "id": "unique-russia", "metadata": {}, "outputs": [], "source": [ @@ -498,7 +498,7 @@ { "cell_type": "code", "execution_count": null, - "id": "valued-requirement", + "id": "alone-external", "metadata": {}, "outputs": [], "source": [ @@ -518,7 +518,7 @@ { "cell_type": "code", "execution_count": null, - "id": "intimate-microphone", + "id": "parallel-position", "metadata": { "scrolled": true }, @@ -532,7 +532,7 @@ { "cell_type": "code", "execution_count": null, - "id": "naked-turkish", + "id": "weighted-arbitration", "metadata": {}, "outputs": [], "source": [ @@ -544,7 +544,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
"governing-treat", + "id": "severe-stable", "metadata": {}, "outputs": [], "source": [ @@ -559,7 +559,7 @@ { "cell_type": "code", "execution_count": null, - "id": "outstanding-credit", + "id": "natural-commitment", "metadata": {}, "outputs": [], "source": [ @@ -574,7 +574,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bearing-generator", + "id": "million-authentication", "metadata": {}, "outputs": [], "source": [ @@ -584,7 +584,7 @@ { "cell_type": "code", "execution_count": null, - "id": "periodic-barrier", + "id": "downtown-nelson", "metadata": {}, "outputs": [], "source": [ @@ -594,7 +594,7 @@ { "cell_type": "code", "execution_count": null, - "id": "sixth-nowhere", + "id": "understanding-equity", "metadata": {}, "outputs": [], "source": [ diff --git a/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb b/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb index e6fe2e0..c1c1ea9 100644 --- a/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb +++ b/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb @@ -52,7 +52,7 @@ "metadata": {}, "outputs": [], "source": [ - "radius = 300" + "radius = 100" ] }, { @@ -99,7 +99,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fabulous-definition", + "id": "excited-closure", "metadata": {}, "outputs": [], "source": [ @@ -132,7 +132,7 @@ { "cell_type": "code", "execution_count": null, - "id": "valid-morocco", + "id": "pleased-generic", "metadata": {}, "outputs": [], "source": [ @@ -146,7 +146,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bridal-breakfast", + "id": "north-instruction", "metadata": {}, "outputs": [], "source": [ @@ -156,7 +156,7 @@ { "cell_type": "code", "execution_count": null, - "id": "following-heating", + "id": "boring-nerve", "metadata": {}, "outputs": [], "source": [ @@ -166,7 +166,7 @@ { "cell_type": "code", "execution_count": null, - "id": "beginning-crowd", + "id": "neither-toilet", "metadata": { "scrolled": true }, @@ -179,7 
+179,7 @@ { "cell_type": "code", "execution_count": null, - "id": "incorporated-strengthening", + "id": "artificial-serum", "metadata": { "scrolled": true }, @@ -265,7 +265,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fallen-tulsa", + "id": "least-bristol", "metadata": {}, "outputs": [], "source": [ @@ -282,7 +282,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dangerous-discussion", + "id": "remarkable-alliance", "metadata": {}, "outputs": [], "source": [ @@ -293,7 +293,7 @@ { "cell_type": "code", "execution_count": null, - "id": "progressive-profit", + "id": "sticky-teaching", "metadata": {}, "outputs": [], "source": [ @@ -577,14 +577,6 @@ "source": [ "metrics.v_measure_score(labels_true_cvt, labels_pred)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "common-manhattan", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 84d640463826a99f78d085323c8dfe071590e7a2 Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Fri, 26 Feb 2021 10:08:09 -0800 Subject: [PATCH 09/16] plot all data for all valid users --- tour_model_eval/query_times_all_users.ipynb | 914 ++++++++++++++++-- .../v-measurel_bins_all_user.ipynb | 242 ++++- ...urel_clusters_above_cutoff_all_users.ipynb | 127 ++- 3 files changed, 1170 insertions(+), 113 deletions(-) diff --git a/tour_model_eval/query_times_all_users.ipynb b/tour_model_eval/query_times_all_users.ipynb index 8877cfe..20164ba 100644 --- a/tour_model_eval/query_times_all_users.ipynb +++ b/tour_model_eval/query_times_all_users.ipynb @@ -2,10 +2,21 @@ "cells": [ { "cell_type": "code", - "execution_count": null, - "id": "expanded-implement", - "metadata": {}, - "outputs": [], + "execution_count": 1, + "id": "pending-avatar", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "storage not configured, falling back to sample, default configuration\n", + "Connecting to database URL localhost\n" + ] + } + 
], "source": [ "import logging\n", "\n", @@ -20,24 +31,15 @@ "from numpy import *\n", "import confirmed_trips_eval_bins_clusters as evaluation\n", "from sklearn import metrics\n", - "from pandas.testing import assert_frame_equal" + "from pandas.testing import assert_frame_equal\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib.ticker import MaxNLocator" ] }, { "cell_type": "code", - "execution_count": null, - "id": "functional-birthday", - "metadata": {}, - "outputs": [], - "source": [ - "# logger = logging.getLogger()\n", - "# logger.setLevel(logging.DEBUG)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "turned-watch", + "execution_count": 2, + "id": "handmade-burning", "metadata": {}, "outputs": [], "source": [ @@ -47,8 +49,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "encouraging-killer", + "execution_count": 3, + "id": "outstanding-representation", "metadata": {}, "outputs": [], "source": [ @@ -57,8 +59,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "tender-volleyball", + "execution_count": 4, + "id": "needed-backing", "metadata": {}, "outputs": [], "source": [ @@ -67,8 +69,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "perfect-promotion", + "execution_count": 5, + "id": "pacific-ranking", "metadata": {}, "outputs": [], "source": [ @@ -77,8 +79,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "ordered-juvenile", + "execution_count": 6, + "id": "minute-anniversary", "metadata": {}, "outputs": [], "source": [ @@ -92,8 +94,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "raising-brisbane", + "execution_count": 7, + "id": "chief-renewal", "metadata": {}, "outputs": [], "source": [ @@ -107,8 +109,23 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "comic-apache", + "execution_count": 8, + "id": "iraqi-festival", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_day(query_day_ls): \n", + " query_day_ls_df = 
pd.DataFrame(data = query_day_ls)\n", + " query_day_df=query_day_ls_df.value_counts(sort = False).rename_axis('query times').to_frame('frequecy').reset_index()\n", + " query_day_df.set_index(['query times'], inplace=True)\n", + " query_day=query_day_df.plot(kind = 'bar',title='query times in a day')\n", + " query_day.yaxis.set_major_locator(MaxNLocator(integer=True))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "stopped-tractor", "metadata": {}, "outputs": [], "source": [ @@ -125,22 +142,43 @@ " \n", " sim.bin_data()\n", " sim.delete_bins()\n", + " bins = sim.bins\n", + " \n", + " # collect query trips indices above cutoff\n", + " ab_trip_ls = []\n", + " for bin in bins:\n", + " early_trip = filter_trips[bin[0]]\n", + " trip_index = 0\n", + " for i in range(1,len(bin)):\n", + " compare_trip = filter_trips[bin[i]]\n", + " if early_trip['data']['start_local_dt']['year']>compare_trip['data']['start_local_dt']['year']:\n", + " early_trip = compare_trip\n", + " trip_index = i\n", + " elif early_trip['data']['start_local_dt']['year']==compare_trip['data']['start_local_dt']['year'] and early_trip['data']['start_local_dt']['month']>compare_trip['data']['start_local_dt']['month']:\n", + " early_trip = compare_trip\n", + " trip_index = i\n", + " elif early_trip['data']['start_local_dt']['year']==compare_trip['data']['start_local_dt']['year'] and early_trip['data']['start_local_dt']['month']==compare_trip['data']['start_local_dt']['month'] and early_trip['data']['start_local_dt']['day']>compare_trip['data']['start_local_dt']['day']:\n", + " early_trip = compare_trip\n", + " trip_index = i\n", + " ab_trip_ls.append(bin[trip_index])\n", + "\n", + "\n", + " \n", " # bins below cutoff\n", " bl_bins = sim.below_cutoff\n", " \n", - " # a list of trip indices from bl_bins\n", + " # collect query trips indices below cutoff\n", " bl_trip_ls = []\n", " for bin in bl_bins:\n", " for index in bin:\n", " bl_trip_ls.append(index)\n", - "\n", - " # collect all the 
trips that below cutoff\n", - " bl_trips = [filter_trips[num]for num in bl_trip_ls]\n", - " \n", + " \n", + " # whole list of query trips indices\n", + " query_trips_ls=ab_trip_ls+bl_trip_ls \n", " \n", " # collect query times in a day\n", " bin_day = []\n", - " for trip_index in bl_trip_ls:\n", + " for trip_index in query_trips_ls:\n", " added = False\n", " trip = filter_trips[trip_index]\n", " for bin in bin_day:\n", @@ -153,13 +191,13 @@ " for bin in bin_day:\n", " query_day_ls.append(len(bin))\n", "\n", - " # average query times for a day\n", - " query_day.append(math.ceil(mean(query_day_ls)))\n", + " # collect query times in a day for every user\n", + " query_day.append(query_day_ls)\n", " \n", " \n", " # collect query times in a month\n", " bin_month = []\n", - " for trip_index in bl_trip_ls:\n", + " for trip_index in query_trips_ls:\n", " added = False\n", " trip = filter_trips[trip_index]\n", " for bin in bin_month:\n", @@ -172,72 +210,810 @@ " for bin in bin_month:\n", " query_month_ls.append(len(bin))\n", "\n", - " # average query times for a month\n", - " query_month.append(math.ceil(mean(query_month_ls)))\n" + " # collect query times in a month for every user\n", + " query_month.append(query_month_ls)\n" ] }, { "cell_type": "code", - "execution_count": null, - "id": "alternative-dialogue", + "execution_count": 10, + "id": "victorian-rating", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[[5,\n", + " 6,\n", + " 6,\n", + " 9,\n", + " 8,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 4,\n", + " 1,\n", + " 1,\n", + " 2,\n", + " 2,\n", + " 1,\n", + " 3,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 1,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 3,\n", + " 2,\n", + " 1,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 3,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 1,\n", + " 1,\n", + " 5,\n", + " 4,\n", + " 3,\n", + " 2,\n", + " 4,\n", + " 2,\n", + " 4,\n", + " 2,\n", + " 3,\n", + " 1],\n", 
+ " nan,\n", + " [2,\n", + " 4,\n", + " 3,\n", + " 4,\n", + " 2,\n", + " 4,\n", + " 7,\n", + " 2,\n", + " 5,\n", + " 6,\n", + " 3,\n", + " 3,\n", + " 9,\n", + " 1,\n", + " 2,\n", + " 7,\n", + " 2,\n", + " 5,\n", + " 2,\n", + " 6,\n", + " 1,\n", + " 1,\n", + " 2,\n", + " 1,\n", + " 1,\n", + " 6,\n", + " 2,\n", + " 3,\n", + " 6,\n", + " 2,\n", + " 1,\n", + " 2,\n", + " 5,\n", + " 1,\n", + " 3,\n", + " 3,\n", + " 9,\n", + " 4,\n", + " 6,\n", + " 4,\n", + " 5,\n", + " 2,\n", + " 3,\n", + " 4,\n", + " 1,\n", + " 1,\n", + " 3,\n", + " 1,\n", + " 1],\n", + " [3,\n", + " 2,\n", + " 1,\n", + " 1,\n", + " 3,\n", + " 3,\n", + " 3,\n", + " 3,\n", + " 1,\n", + " 4,\n", + " 2,\n", + " 4,\n", + " 2,\n", + " 2,\n", + " 4,\n", + " 1,\n", + " 1,\n", + " 2,\n", + " 2,\n", + " 4,\n", + " 2,\n", + " 2,\n", + " 3,\n", + " 2,\n", + " 1,\n", + " 2,\n", + " 4,\n", + " 4,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 2,\n", + " 1,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 1,\n", + " 1,\n", + " 2,\n", + " 1,\n", + " 1,\n", + " 2,\n", + " 3,\n", + " 2],\n", + " [2, 1, 2, 2],\n", + " [2,\n", + " 3,\n", + " 3,\n", + " 1,\n", + " 3,\n", + " 3,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 3,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 3,\n", + " 2,\n", + " 1,\n", + " 3,\n", + " 1,\n", + " 5,\n", + " 2,\n", + " 1,\n", + " 2,\n", + " 2,\n", + " 3,\n", + " 2,\n", + " 1,\n", + " 5,\n", + " 3,\n", + " 4,\n", + " 1,\n", + " 2,\n", + " 2,\n", + " 5,\n", + " 2,\n", + " 4,\n", + " 2,\n", + " 2,\n", + " 3,\n", + " 1,\n", + " 3,\n", + " 2,\n", + " 1,\n", + " 1,\n", + " 1],\n", + " [2, 3, 3, 1, 2, 2, 4, 2, 2, 4, 2, 2, 2, 1, 2, 1, 1, 2, 2, 3, 1, 3, 2, 1],\n", + " [3,\n", + " 8,\n", + " 8,\n", + " 9,\n", + " 1,\n", + " 7,\n", + " 2,\n", + " 8,\n", + " 12,\n", + " 8,\n", + " 11,\n", + " 4,\n", + " 6,\n", + " 9,\n", + " 4,\n", + " 6,\n", + " 4,\n", + " 5,\n", + " 4,\n", + " 5,\n", + " 5,\n", + " 8,\n", + " 11,\n", + " 4,\n", + " 9,\n", + " 9,\n", + " 6,\n", + " 3,\n", + " 1,\n", + " 3,\n", + " 1,\n", + " 12,\n", + 
" 4,\n", + " 2,\n", + " 9,\n", + " 6,\n", + " 3,\n", + " 15,\n", + " 1,\n", + " 2,\n", + " 4,\n", + " 6,\n", + " 3,\n", + " 6,\n", + " 1,\n", + " 2,\n", + " 1,\n", + " 3,\n", + " 4,\n", + " 2,\n", + " 4,\n", + " 5,\n", + " 2,\n", + " 1,\n", + " 1],\n", + " [2,\n", + " 1,\n", + " 5,\n", + " 2,\n", + " 2,\n", + " 4,\n", + " 2,\n", + " 5,\n", + " 3,\n", + " 5,\n", + " 8,\n", + " 6,\n", + " 5,\n", + " 7,\n", + " 1,\n", + " 2,\n", + " 4,\n", + " 2,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 4,\n", + " 3,\n", + " 4,\n", + " 4,\n", + " 8,\n", + " 5,\n", + " 8,\n", + " 3,\n", + " 2,\n", + " 4,\n", + " 2,\n", + " 4,\n", + " 3,\n", + " 3,\n", + " 6,\n", + " 4,\n", + " 8,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 3,\n", + " 3,\n", + " 3,\n", + " 4,\n", + " 16,\n", + " 5,\n", + " 5,\n", + " 4,\n", + " 5],\n", + " [10,\n", + " 4,\n", + " 3,\n", + " 14,\n", + " 3,\n", + " 1,\n", + " 8,\n", + " 5,\n", + " 2,\n", + " 5,\n", + " 5,\n", + " 4,\n", + " 3,\n", + " 4,\n", + " 4,\n", + " 4,\n", + " 1,\n", + " 4,\n", + " 8,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 3,\n", + " 5,\n", + " 2,\n", + " 7,\n", + " 6,\n", + " 2,\n", + " 4,\n", + " 4,\n", + " 4,\n", + " 1,\n", + " 2,\n", + " 1],\n", + " [3, 2, 3, 1, 2, 1, 6],\n", + " nan,\n", + " [10,\n", + " 5,\n", + " 8,\n", + " 6,\n", + " 3,\n", + " 5,\n", + " 4,\n", + " 9,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 2,\n", + " 6,\n", + " 5,\n", + " 7,\n", + " 3,\n", + " 3,\n", + " 3,\n", + " 4,\n", + " 10,\n", + " 7,\n", + " 2,\n", + " 4,\n", + " 4,\n", + " 8,\n", + " 10,\n", + " 7,\n", + " 6,\n", + " 7,\n", + " 9,\n", + " 6,\n", + " 3,\n", + " 5,\n", + " 7,\n", + " 3,\n", + " 5,\n", + " 6,\n", + " 13,\n", + " 4,\n", + " 5,\n", + " 6,\n", + " 6,\n", + " 8,\n", + " 7,\n", + " 4,\n", + " 4,\n", + " 5,\n", + " 2,\n", + " 6,\n", + " 3,\n", + " 8,\n", + " 15,\n", + " 10,\n", + " 7,\n", + " 4,\n", + " 6,\n", + " 5,\n", + " 3,\n", + " 5,\n", + " 7,\n", + " 5,\n", + " 8,\n", + " 2,\n", + " 
10,\n", + " 5,\n", + " 8,\n", + " 4,\n", + " 4,\n", + " 4,\n", + " 1,\n", + " 2,\n", + " 2,\n", + " 5,\n", + " 4,\n", + " 9,\n", + " 5,\n", + " 5,\n", + " 11,\n", + " 4,\n", + " 5,\n", + " 3,\n", + " 5,\n", + " 4,\n", + " 5,\n", + " 8,\n", + " 3,\n", + " 1]]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "query_day" ] }, { "cell_type": "code", - "execution_count": null, - "id": "tracked-pattern", + "execution_count": 11, + "id": "victorian-venue", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[[26, 61, 18, 7, 12],\n", + " nan,\n", + " [36, 52, 75],\n", + " [42, 28, 25],\n", + " [2, 3, 2],\n", + " [14, 38, 50],\n", + " [18, 7, 24, 1],\n", + " [131, 148, 4],\n", + " [71, 91, 28, 8],\n", + " [48, 50, 47],\n", + " [16, 2],\n", + " nan,\n", + " [153, 182, 152]]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "query_month" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "instant-somerset", + "cell_type": "markdown", + "id": "seven-istanbul", "metadata": {}, - "outputs": [], "source": [ - "mean_day = math.ceil(mean([x for x in query_day if str(x) != 'nan']))\n", - "mean_day" + "### Plot query times in a day" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "documented-municipality", + "cell_type": "markdown", + "id": "supreme-search", "metadata": {}, - "outputs": [], "source": [ - "mean_month = math.ceil(mean([x for x in query_month if str(x) != 'nan']))\n", - "mean_month" + "Note: frequecy represents the number of days have specific query times. Each graph represents query times for a user" ] }, { "cell_type": "code", - "execution_count": null, - "id": "burning-elephant", + "execution_count": 12, + "id": "corresponding-graphics", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAETCAYAAAAveV3LAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAXvElEQVR4nO3df5xVdZ3H8ddbQMEfgMGkIY1DiilqCAyyu/mDogUKVxdN+22W7ui6Jdv+is0NdavHwzZ2Cx5WLKtlrawWppn6AM3AJcVUQEwESwVWJwVxsgB/w3z2j3MGL9f5fQ9z54vv5+NxH3PuOed+z+eemXmfc7/nx1VEYGZm6dmn2gWYmVn3OMDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlALekSfqSpKurXMMiSZ+uwnInSmrs6eVa7yGfB26pkDQRuC4ihle7lt7A68O8B25VI6lvtWswS5kD3N5E0hhJqyRtk/QjSTdI+mo+7TxJ95TNH5KOzIf3kzRb0lOSNkuaJ2lAPm2ipEZJX5S0Cfi+pDWS/qKkrX6Snpd0QtkyDgAWAcMkbc8fwyRdLum6fJ66vJbPSHpa0guSLpI0XtKvJf1B0lVl7X5W0rp83jskHZ6Pl6RvSnpO0h/z1x/Xxvq6W9IFpesnXwcvSNog6YPtrOuZkp7M1/VaSdPbmXeApGvzdtcC4zvTVv47+b2k40vmfbuklyXVtLU86/0c4LYbSfsCPwX+G3gbsBA4qwtNfB04CjgBOBI4DJhVMv3QvN3DgQbgh8AnS6Z/CHg2IlaXNhoRLwIfBJ6JiAPzxzNt1DABGAl8BPgWcCnwAeBY4BxJp+bv9S+BLwFnAjXAL4Hr8zYmA6fk72Vw3lZTJ9fBBOA3wFDg34BrJKmNeZ8ETgYGAVcA10l6RxvzXgYckT+mAOX97q22FRGvAjew+3r+GHBXRGzp5Huy3igi/PBj14MstJ4hPz6Sj1sOfDUfPg+4p+w1QRbWAl4EjiiZ9qfAhnx4IvAa0L9k+jBgGzAwf34j8E9t1DYRaCwbdzlZPzBAXV7LYSXTm4CPlDz/CfC3+fAi4PySafsAL5FtXN4P/Bb4E2CfDtbZ3cAFJevniZJp++c1HdrJ9b8aOKONaeuBqSXPG8rXR1ttkW1Unm55L8AK4Jxq/735UdnDe+BWbhjwu8j/y3P/18nX1pAF1sq8u+IPwOJ8fIstEfFKy5PI9qLvBc6SNJhsL3tBJW8A2Fwy/HIrzw/Mhw8H5pTU+nuyjdBhEbEEuAr4NrBZ0nxJAzu5/E0tAxHxUj54YGszSjpX0uqSGo4j23NvzTCyEG6x2++lvbYi4n6yjeupko4m2+D+rJPvx3opB7iVexY4rOwjf23J8ItkIQ2ApENLpj1PFpDHRsTg/DEoIkrDq7XTnn5A9vH+bOC+iPhdG7UVfcrU08CFJbUOjogBEbEcICLmRsQ4sq6Xo4B/LHLheX/7fwGfA4ZExGBgDdlGpDXPAu8seb7r99LJtlrW86eAG0s3pJYmB7iVuw/YAVwiqa+kM4ETS6Y/DBwr6QRJ/cm6MACIiGayEPmmpLcDSDpM0pQOlvlTYCwwg6xPvC2bgSGSBnXxPbVlHvDPko4FkDRI0tn58HhJEyT1I9tovQLsLGi5LQ4g2yhtyZf5GbK95rb8OK/3YEnDgc93sa3/BqaThXh769kS4QC33UTEa2QH9c4DXiA7eHdTyfTfAv8K3AU8DtxT1sQXgSeAX0nams/37g6W+TJZ3/SI0mW1Mt9jZAcZ1+fdBMO68t5aae9msoOuN+S1riHrwgEYSLYxeoGsq6IJmF3J8lpZ/lrg38k2mpuB48m6k9pyRV7LB
uBOskDudFsR0QisIgv6Xxb1Pqx6fCGPdUjStWQHy/5lDy5jFnBURHyyw5mt2yR9j+xMnj32u7Se4wsprOokvQ04n6xv1vYQSXVkn67GVLcSK4q7UKyqJP0V2cHERRGxrNr17K0kfYWsi+gbEbGh2vVYMdyFYmaWKO+Bm5klygFuZpaoHj2IOXTo0Kirq+vJRZqZJW/lypXPR8SbbjzWowFeV1fHihUrenKRZmbJk9Tq7SzchWJmligHuJlZohzgZmaJ8pWYZlao119/ncbGRl55xTc77Kr+/fszfPhw+vXr16n5HeBmVqjGxkYOOugg6urqaPuLiKxcRNDU1ERjYyMjRozo1GvchWJmhXrllVcYMmSIw7uLJDFkyJAufXJxgJtZ4Rze3dPV9eYAN7O9zty5cznmmGP4xCc+Ue1S9qhe2QdeN/P2wtvceOW0wttMpU6zair6/6Qz/yPf+c53WLRo0W59yTt27KBv314Zed3mPXAz26tcdNFFrF+/ntNPP51BgwbR0NDA5MmTOffcc9myZQtnnXUW48ePZ/z48dx7b/alRU1NTUyePJkxY8Zw4YUXcvjhh/P888+zceNGjjvujW+mmz17NpdffjkATz75JFOnTmXcuHGcfPLJPPbYYwBs3ryZ6dOnM3r0aEaPHs3y5cv58pe/zJw5c3a1c+mllzJ37tyK3+vetTkys7e8efPmsXjxYpYuXcpVV13Frbfeyj333MOAAQP4+Mc/zhe+8AVOOukknnrqKaZMmcK6deu44oorOOmkk5g1axa333478+fP73A5DQ0NzJs3j5EjR3L//fdz8cUXs2TJEi655BJOPfVUbr75Znbu3Mn27dsZNmwYZ555JjNmzKC5uZkbbriBBx54oOL36gA3s73a6aefzoABAwC46667WLt27a5pW7duZdu2bSxbtoybbsq+jnXatGkcfPDB7ba5fft2li9fztlnn71r3KuvvgrAkiVL+OEPs++M7tOnD4MGDWLQoEEMGTKEhx56iM2bNzNmzBiGDBlS8XtzgJvZXu2AAw7YNdzc3Mx99923K9BLtXYGSN++fWlubt71vOUUv+bmZgYPHszq1as7XccFF1zAtddey6ZNm/jsZz/blbfQJveBm9lbxuTJk7nqqqt2PW8J4FNOOYUFCxYAsGjRIl544QUADjnkEJ577jmampp49dVXue222wAYOHAgI0aMYOHChUB2Ec7DDz8MwKRJk/jud78LwM6dO9m6dSsA06dPZ/HixTz44INMmTKlkPfjADezt4y5c+eyYsUK3vOe9zBq1CjmzZsHwGWXXcayZcsYO3Ysd955J7W1tQD069ePWbNmMWHCBE477TSOPvroXW0tWLCAa665htGjR3Psscdyyy23ADBnzhyWLl3K8ccfz7hx43j00UcB2HfffXnf+97HOeecQ58+fQp5Pz36nZj19fXRmfuBp3J6Xip1mvWkdevWccwxx1S7jIq0fHfB0KFDC2uzubmZsWPHsnDhQkaOHNnmfK2tP0krI6K+fF7vgZuZ7WFr167lyCOPZNKkSe2Gd1f5IKaZWZmNGzcW2t6oUaNYv359oW2C98DNzJLlADezwvXksbW9SVfXmwPczArVv39/mpqaHOJd1HI/8P79+3f6Ne4DN7NCDR8+nMbGRrZs2VLtUpLT8o08neUAN7NC9evXr9PfKGOV6bALRdL3JD0naU3JuLdJ+rmkx/Of7d84wMzMCteZPvBrgall42YCv4iIkcAv8udmZtaDOgzwiFgG/L5s9BnAD/LhHwB/WXBdZmbWge6ehXJIRDwLkP98e3ElmZlZZ+zx0wglNUhaIWmFj0qbmRWnuwG+WdI7APKfz7U1Y0TMj4j6iKivqanp5uLMzKxcdwP8Z8Cn8+FPA7cUU46ZmXVWZ04jvB64D3i3pEZJ5wNXAn8u6XHgz/PnZmbWgzq8kCciPtbGpEkF12JmZl3ge6GYmSXKAW5mligHuJlZohzgZmaJcoCbm
SXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSWqogCX9AVJj0paI+l6Sf2LKszMzNrX7QCXdBhwCVAfEccBfYCPFlWYmZm1r9IulL7AAEl9gf2BZyovyczMOqPbAR4RvwNmA08BzwJ/jIg7iyrMzMza17e7L5R0MHAGMAL4A7BQ0icj4rqy+RqABoDa2toKSrXuqpt5e+FtbrxyWuFtmlnXVNKF8gFgQ0RsiYjXgZuAPyufKSLmR0R9RNTX1NRUsDgzMytVSYA/BfyJpP0lCZgErCumLDMz60glfeD3AzcCq4BH8rbmF1SXmZl1oNt94AARcRlwWUG1mJlZF/hKTDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRFUU4JIGS7pR0mOS1kn606IKMzOz9vWt8PVzgMUR8WFJ+wL7F1CTmZl1QrcDXNJA4BTgPICIeA14rZiyzMysI5Xsgb8L2AJ8X9JoYCUwIyJeLJ1JUgPQAFBbW1vB4mxvVzfz9sLb3HjltMLbNOstKukD7wuMBb4bEWOAF4GZ5TNFxPyIqI+I+pqamgoWZ2ZmpSoJ8EagMSLuz5/fSBboZmbWA7od4BGxCXha0rvzUZOAtYVUZWZmHar0LJTPAwvyM1DWA5+pvCQzM+uMigI8IlYD9QXVYmZmXeArMc3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MElXp7WTNrJfyV9Tt/bwHbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiKg5wSX0kPSTptiIKMjOzziliD3wGsK6AdszMrAsqCnBJw4FpwNXFlGNmZp1V6R74t4B/ApoLqMXMzLqg21+pJuk04LmIWClpYjvzNQANALW1td1dnFmv4a8qs96ikj3w9wKnS9oI3AC8X9J15TNFxPyIqI+I+pqamgoWZ2Zmpbod4BHxzxExPCLqgI8CSyLik4VVZmZm7fJ54GZmiep2H3ipiLgbuLuItszMrHO8B25mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mlqhuB7ikd0paKmmdpEclzSiyMDMza1/fCl67A/j7iFgl6SBgpaSfR8TagmozM7N2dHsPPCKejYhV+fA2YB1wWFGFmZlZ+wrpA5dUB4wB7i+iPTMz61glXSgASDoQ+AnwtxGxtZXpDUADQG1tbaWLM7O9TN3M2wttb+OV0wptD4qvEYqps6I9cEn9yMJ7QUTc1No8ETE/Iuojor6mpqaSxZmZWYlKzkIRcA2wLiL+o7iSzMysMyrZA38v8Cng/ZJW548PFVSXmZl1oNt94BFxD6ACazEzsy7wlZhmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiH
OBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZomqKMAlTZX0G0lPSJpZVFFmZtaxbge4pD7At4EPAqOAj0kaVVRhZmbWvkr2wE8EnoiI9RHxGnADcEYxZZmZWUcUEd17ofRhYGpEXJA//xQwISI+VzZfA9CQP3038Jvul9uqocDzBbe5J7jO4qRQI7jOor2V6zw8ImrKR/atoEG1Mu5NW4OImA/Mr2A57RchrYiI+j3VflFcZ3FSqBFcZ9Fc55tV0oXSCLyz5Plw4JnKyjEzs86qJMAfBEZKGiFpX+CjwM+KKcvMzDrS7S6UiNgh6XPAHUAf4HsR8WhhlXXeHuueKZjrLE4KNYLrLJrrLNPtg5hmZlZdvhLTzCxRDnAzs0Q5wM3MEuUA3wMkHS1pkqQDy8ZPrVZNrZF0oqTx+fAoSX8n6UPVrqsjkn5Y7Ro6IumkfH1OrnYtpSRNkDQwHx4g6QpJt0r6uqRB1a6vhaRLJL2z4zmrS9IRkv5B0hxJ/y7pop5cj3vNQUxJn4mI7/eCOi4B/gZYB5wAzIiIW/JpqyJibDXrayHpMrL72PQFfg5MAO4GPgDcERFfq151b5BUfmqqgPcBSwAi4vQeL6oVkh6IiBPz4b8i+xu4GZgM3BoRV1azvhaSHgVG52eRzQdeAm4EJuXjz6xqgTlJfwReBJ4ErgcWRsSW6la1u/x//S+A/wU+BKwGXgCmAxdHxN17vIiI2CsewFPVriGv4xHgwHy4DlhBFuIAD1W7vrI6+wD7A1uBgfn4AcCvq11fSZ2rgOuAicCp+c9n8+FTq11fSZ0PlQw/CNTkwwcAj1S7vpLa1pWu27Jpq6tdX+n6JOshmAxcA2wBFgOfBg6qdn15jY8AffLh/YG78+Hanvpfr+RS+h4n6ddtTQIO6cla2tEnIrYDRMRGSROBGyUdTuu3H6iWHRGxE3hJ0pMRsRUgIl6W1Fzl2krVAzOAS4F/jIjVkl6OiP+tcl3l9pF0MFnoKPK9xYh4UdKO6pa2mzUln1YfllQfESskHQW8Xu3iSkRENAN3AndK6kf2ifFjwGzgTfcFqZK+wE5gP+AggIh4Kq+3RxaekkOAKWQfU0oJWN7z5bRqk6QTImI1QERsl3Qa8D3g+OqWtpvXJO0fES8B41pG5v13vSbA83/ib0pamP/cTO/8ux0ErCT7WwxJh0bEpvw4SG/acF8AzJH0L2Q3XLpP0tPA0/m03mK3dRYRr5Nd6f0zSQOqU9KbXA08KOlXwCnA1wEk1QC/74kCkuoDl3QN8P2IuKeVaf8TER+vQlnldQwn27vd1Mq090bEvVUo600k7RcRr7Yyfijwjoh4pApldUjSNOC9EfGlatfSGZL2Bw6JiA3VrqWUpIOAd5FtDBsjYnOVS9qNpKMi4rfVrqMjko4FjgHWRMRjPb78lALczMze4NMIzcwS5QA3M0uUA9yshKTBki4ueT5M0o3VrMmsLe4Dt7cESX3y0yY7mq8OuC0ijtvjRZlVyHvg1utIulTSbyTdJel6Sf+Qj79bUn0+PFTSxny4j6RvSHpQ0q8lXZiPnyhpqaT/AR6R9BVJM0qW87X8arpSVwJHSFqdt1knaU0+/3mSfppfer5B0ufyy+UfkvQrSW/L5ztC0mJJKyX9UtLR+fizJa2R9LCkZXt2LdpbQW88n9bewiSNI/t2pzFkf5+ryM6vbs/5wB8jYryk/YB7Jd2ZTzsROC4iNuR71zeRnQe9T76cE8vampnPf0JeT13Z9OPy2voDTwBfjIgxkr4JnAt8i+yG/hdFxOOSJgDfAd4PzAKmRMTvJA3u5Coxa5MD3Hqbk4Gb8wuMWrsXSmsmA++R9OH8+SBgJPAa8EDLOdj5lbFNksaQXRT2UEQ0dbG+pRGxDdiW36/j1nz8I3kNBwJ/BiyUdl2Lsl/+817gWkk/JtuQmFXEAW69UVsHZnbwR
rdf/5LxAj4fEXeUzpzfxuDFsjauBs4DDiW7OrarSi9+ai553kz2/7QP8IeWPfhSEXFRvkc+DVidX7Hb1Q2I2S7uA7feZhkwXdmtTg8iu9tbi428cdn/h0vG3wH8dcv9JyQdJemANtq/GZgKjM9fV24b+T0tuiO/p8wGSWfntUjS6Hz4iIi4PyJmkV3G3utvl2q9mwPcepWIWAX8iOzWnD8BflkyeTZZUC8HhpaMvxpYC6zKDzj+J218uoyI14ClwI9bOysl3yO+Nz/Y+I1uvo1PAOdLehh4FDgjH/8NSY/kNS4DHu5m+2aATyO0Xk7S5cD2iJhdUHv7kB0YPTsiHi+iTbNq8R64vWVIGkV25sgvHN62N/AeuJlZorwHbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmi/h/5N7rDipLd5gAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAETCAYAAAAf9UzqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAATkElEQVR4nO3df5TVdZ3H8dcLGBtWdLCBNEAYjj9SQHH4IdtuKkkHKI2WDNuO5Zqwo+spzf2Vm5s/tnZPrewWZEZuulhhJGpRdlDXFQ8qVjD8UH5oKbA6qQgTBligOO/94/ud6TrOOHdw7twPM8/HOffw/TWf7/t+mXl9v/fz/XEdEQIApKtPuQsAALw1ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQ4Jtr9g+ztlrmGZ7b8qw3on227o7vUiHeY6aqTG9mRJ34+IYeWuJQVsD3BEjZKz3a/cNQCHMoK6F7Nda3uN7T22f2h7se0v5/Musv1Iq+XD9vH58Dtsz7X9rO3tthfY7p/Pm2y7wfbnbb8o6b9tb7D94YK2KmzvtH1aq3UcLmmZpCG29+avIbavs/39fJmavJZP237O9i7bl9qeaPtx2y/bvrFVuxfb3pwve5/tEfl02/6a7Zds/y7/+THtbK+HbM8p3D75Nthle6vtD77Ftr7K9jP5tt5ke+ZbLNvf9sK83U2SJhbTVv5/8lvbpxQs+y7bf7A9uL31IX0EdS9l+zBJP5b0PUnvlLRE0nmdaOKrkk6UdJqk4yUNlXRNwfxj8nZHSKqT9F1JnyyY/yFJL0TEusJGI+IVSR+U9HxEDMhfz7dTwyRJJ0j6uKSvS7pa0gckjZZ0vu2z8vf6F5K+IOmjkgZLeljSD/I2pko6M38vA/O2GovcBpMkPSVpkKR/l3SLbbez7DOSzpBUJel6Sd+3/e52lr1W0nH5a5qk1v3ibbYVEfslLdYbt/MnJD0QETuKfE9IUUTw6oUvZeH0vPLzFPm0lZK+nA9fJOmRVj8TykLZkl6RdFzBvPdK2poPT5b0qqTKgvlDJO2RdGQ+fqekf2yntsmSGlpNu05ZP60k1eS1DC2Y3yjp4wXjd0n6XD68TNLsgnl9JP1e2U7kbEm/kvSnkvp0sM0ekjSnYPs8XTDvT/Kajily+6+T9JF25m2RNL1gvK719mivLWU7j+ea34uk1ZLOL/fvG6+39+KIuvcaIuk3kf815/6vyJ8drCyY6vNuhpcl3ZtPb7YjIvY1j0R2VPyopPNsD1R21Lzo7bwBSdsLhv/QxviAfHiEpHkFtf5W2c5maEQ8KOlGSd+UtN32zbaPLHL9LzYPRMTv88EBbS1o+0Lb6wpqGKPsSLwtQ5SFbbM3/L+8VVsR8QtlO9GzbJ+kbMf6kyLfDxJFUPdeL0ga2uqj+vCC4VeUhbEkyfYxBfN2KgvC0RExMH9VRURhSLV1OdFtyj6Wz5L0WET8pp3auvpSpOckXVJQ68CI6B8RKyUpIuZHxHhlXSYnSvqHrlx53h/+X5I+I6k6IgZK2qBsZ9GWFyQdWzDe8v9SZFvN2/lTku4s3GHi0ERQ916PSTog6XLb/Wx/VNLpBfPXSxpt+zTblcq6HiRJEdGkLCy+ZvtdkmR7qO1pHazzx5LGSbpCWZ91e7ZLqrZd1cn31J4Fkv7J9mhJsl1le1Y+PNH2JNsVynZO+yS93kXrbXa4sp3Pjnydn1Z2FNyeO/J6j7I9TNJnO9nW9yTNVBbWb7WdcYggqHupiHhV2cm1iyTtUnYS7e6C+b+S9C+SHpD0a0mPtGri85KelvRz27vz5d7TwTr/oKzveGThutpY7kllJ/u25B/vh3TmvbXR3o+Unfxcn
Ne6QVnXiyQdqWyns0tZF0OjpLlvZ31trH+TpP9QtnPcLukUZd1A7bk+r2WrpPuVBW/RbUVEg6Q1ygL94a56HygfbnhBC9sLlZ20+ucSruMaSSdGxCc7XBgHzfatyq6cKdn/JboPNyKg29h+p6TZyvpOUSK2a5R9WqotbyXoKnR9oFvY/mtlJ/WWRcSKctfTU9n+krKunRsiYmu560HXoOsDABLHETUAJI6gBoDEleRk4qBBg6KmpqYUTQNAj1RfX78zItp8eFZJgrqmpkarV68uRdMA0CPZbvcRDnR9AEDiCGoASBxBDQCJ67Y7E1977TU1NDRo3z4e5NUZlZWVGjZsmCoqKspdCoAy6bagbmho0BFHHKGamhq1/yUYKBQRamxsVENDg0aOHFnucgCUSYddH7Yrbf/S9nrbG21ffzAr2rdvn6qrqwnpTrCt6upqPoUAvVwxR9T7JZ0dEXvzZ/Y+YntZRPy8sysjpDuPbQagwyPqyOzNRyvy1yH5gJD58+fr5JNP1gUXXFDuUgCgaEX1UdvuK6le2fevfTP/XrbWy9Qp+xJODR8+vPXsN6m56medKrQj275yTofL3HTTTVq2bNkb+nsPHDigfv142ivS0tV/H71dMfmQsqIuz4uI1yPiNEnDJJ1u+01fIxQRN0fEhIiYMHhwm3dBltWll16qLVu2aMaMGaqqqlJdXZ2mTp2qCy+8UDt27NB5552niRMnauLEiXr00ewLMxobGzV16lTV1tbqkksu0YgRI7Rz505t27ZNY8b8cRPMnTtX1113nSTpmWee0fTp0zV+/HidccYZevLJJyVJ27dv18yZMzV27FiNHTtWK1eu1Be/+EXNmzevpZ2rr75a8+fP776NAuCQ0KnrqCPiZUkPSZpekmpKaMGCBRoyZIiWL1+uK6+8UvX19Vq6dKluv/12XXHFFbryyiu1atUq3XXXXZozZ44k6frrr9f73vc+rV27VjNmzNCzzz7b4Xrq6ur0jW98Q/X19Zo7d64uu+wySdLll1+us846S+vXr9eaNWs0evRozZ49W7fddpskqampSYsXL6ZbBsCbdPiZ3/ZgSa9FxMu2+0v6gLLvnzukzZgxQ/3795ckPfDAA9q0aVPLvN27d2vPnj1asWKF7r47+2q/c845R0cdddRbtrl3716tXLlSs2bNapm2f/9+SdKDDz6o7343+57Rvn37qqqqSlVVVaqurtbatWu1fft21dbWqrq6ukvfJ4BDXzGds++WdFveT91H0h0RcU9pyyq9ww8/vGW4qalJjz32WEtwF2rrqot+/fqpqampZbz58rmmpiYNHDhQ69atK7qOOXPmaOHChXrxxRd18cUXd+YtAOglirnq4/GIqI2IUyNiTET8S3cU1p2mTp2qG2+8sWW8OWjPPPNMLVq0SJK0bNky7dq1S5J09NFH66WXXlJjY6P279+ve+7J9ltHHnmkRo4cqSVLlkjKblhZv369JGnKlCn61re+JUl6/fXXtXv3bknSzJkzde+992rVqlWaNm1aN7xbAIcanvWh7LK91atX69RTT9WoUaO0YMECSdK1116rFStWaNy4cbr//vtbrmapqKjQNddco0mTJuncc8/VSSed1NLWokWLdMstt2js2LEaPXq0li5dKkmaN2+eli9frlNOOUXjx4/Xxo0bJUmHHXaY3v/+9+v8889X3759u/mdAzgUlOQ7EydMmBCtn0e9efNmnXzyyV2+ru7U/JztQYMGdVmbTU1NGjdunJYsWaITTjihzWV6wrZD53B5Xtc6FC7Ps10fERPamscRdRlt2rRJxx9/vKZMmdJuSAMAd3p0wrZt27q0vVGjRmnLli1d2iaAnocjagBIXLcGdSn6w3s6thmAbgvqyspKNTY2Ejyd0Pw86srKynKXAqCMuq2PetiwYWpoaNCOHTu6a5U9QvM3vADovbotqCsqKviWEgA4CJxMBIDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjq
AEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgAS12FQ2z7W9nLbm21vtH1FdxQGAMj0K2KZA5L+LiLW2D5CUr3t/4mITSWuDQCgIo6oI+KFiFiTD++RtFnS0FIXBgDIdKqP2naNpFpJv2hjXp3t1bZX79ixo2uqAwAUH9S2B0i6S9LnImJ36/kRcXNETIiICYMHD+7KGgGgVysqqG1XKAvpRRFxd2lLAgAUKuaqD0u6RdLmiPjP0pcEAChUzBH1n0v6lKSzba/LXx8qcV0AgFyHl+dFxCOS3A21AADawJ2JAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEhch0Ft+1bbL9ne0B0FAQDeqJgj6oWSppe4DgBAOzoM6ohYIem33VALAKAN/bqqIdt1kuokafjw4V3VbMnUXPWzcpfQo2z7yjnlLgHosbrsZGJE3BwREyJiwuDBg7uqWQDo9bjqAwASR1ADQOKKuTzvB5Iek/Qe2w22Z5e+LABAsw5PJkbEJ7qjEABA2+j6AIDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0Diigpq29NtP2X7adtXlbooAMAfdRjUtvtK+qakD0oaJekTtkeVujAAQKaYI+rTJT0dEVsi4lVJiyV9pLRlAQCa9StimaGSnisYb5A0qfVCtusk1eWje20/9fbLg6RBknaWu4iO+KvlrgBlwu9n1xnR3oxigtptTIs3TYi4WdLNnSgKRbC9OiImlLsOoC38fnaPYro+GiQdWzA+TNLzpSkHANBaMUG9StIJtkfaPkzSX0r6SWnLAgA067DrIyIO2P6MpPsk9ZV0a0RsLHllaEZ3ElLG72c3cMSbupsBAAnhzkQASBxBDQCJI6gBIHEENYCi2D7J9hTbA1pNn16umnoLgvoQYfvT5a4BvZftyyUtlfRZSRtsFz5G4t/KU1XvwVUfhwjbz0bE8HLXgd7J9hOS3hsRe23XSLpT0vciYp7ttRFRW9YCe7hibiFHN7H9eHuzJB3dnbUArfSNiL2SFBHbbE+WdKftEWr7MRPoQgR1Wo6WNE3SrlbTLWll95cDtHjR9mkRsU6S8iPrcyXdKumU8pbW8xHUa
blH0oDmP4ZCth/q/nKAFhdKOlA4ISIOSLrQ9rfLU1LvQR81ACSOqz4AIHEENQAkjqBGr2R7oO3LCsaH2L6znDUB7aGPGj2K7b4R8XoRy9VIuicixpS8KOBt4ogaZWP7attP2X7A9g9s/30+/SHbE/LhQba35cN9bd9ge5Xtx21fkk+fbHu57dslPWH7S7avKFjPv+Z31hX6iqTjbK/L26yxvSFf/iLbP7b9U9tbbX/G9t/aXmv757bfmS93nO17bdfbftj2Sfn0WbY32F5ve0VptyJ6Ay7PQ1nYHq/s24Jqlf0erpFU38GPzZb0u4iYaPsdkh61fX8+73RJYyJia360fLekebb75Os5vVVbV+XLn5bXU9Nq/pi8tkpJT0v6fETU2v6askvVvq7sofmXRsSvbU+SdJOksyVdI2laRPzG9sAiNwnQLoIa5XKGpB9FxO8lyXYxX+82VdKptj+Wj1dJOkHSq5J+GRFbpZY75xpt1yq7iWhtRDR2sr7lEbFH0h7bv5P003z6E3kNAyT9maQldsuNee/I/31U0kLbdyjbYQBvC0GNcmrvBMkB/bFbrrJguiV9NiLuK1w4v535lVZtfEfSRZKOUXb3XGftLxhuKhhvUvZ300fSy81H5IUi4tL8CPscSevyO/o6u6MAWtBHjXJZIWmm7f62j5D04YJ52ySNz4c/VjD9Pkl/Y7tCkmyfaPvwdtr/kaTpkibmP9faHklHHGzxEbFb0lbbs/JabHtsPnxcRPwiIq6RtFPSsQe7HkAiqFEmEbFG0g8lrZN0l6SHC2bPVRbIKyUNKpj+HUmbJK3JT/x9W+18KoyIVyUtl3RHW1eB5Ee4j+Yn/W44yLdxgaTZttdL2iip+dGfN9h+Iq9xhaT1B9k+IInL85AI29dJ2hsRc7uovT7KTlDOiohfd0WbQLlwRI0ex/YoZVdq/C8hjZ6AI2oASBxH1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBx/w+0fWgfrPx15wAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAETCAYAAAAf9UzqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAUC0lEQVR4nO3de5RV5X3G8ecRUIjAYIBoEGGoYuSi3KVpvJDQBSQaUzSYRhPrhaJ1GW16i40JapK2SaVNYBGlNCpNQkKC0Ri1oLWQhVyCAkJE0ESB6kRBmJAAKijMr3/sPXiczDC3M5x3Zr6ftc5yn733effvvIPP2ec9++KIEAAgXceUugAAwJER1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOo0SrY/qLt75S4hkW2/6IE2x1nu+JobxfpMMdRIzW2x0n6fkT0LXUtKaA/wB41WpztjqWuAWjNCOp2zPYI2+ts77X9I9sLbH8tX3al7eU11g/bp+XTx9meYfsl2ztsz7HdJV82znaF7S/Y3i7pXtsbbX+8oK1OtnfZHl5jG8dLWiSpj+19+aOP7dtsfz9fpzyv5SrbL9vebfs622Ns/9L272zPrtHu1bY35+s+art/Pt+2v2n7Ndu/z18/tI7++rntqYX9k/fBbttbbX/0CH19s+0X877eZHvyEdbtYnte3u4mSWMa0lb+N/mt7TML1n2f7Tdt965re0gfQd1O2T5W0k8lfU/SeyUtlHRJI5r4hqTTJQ2XdJqkkyVNL1h+Ut5uf0nTJH1X0mcKln9M0qsRsb6w0Yh4XdJHJb0SEV3zxyt11DBW0kBJn5L0LUm3SPpTSUMkXWr7/Py9/pmkL0q6WFJvSU9I+mHexgRJ5+XvpUfeVmUD+2CspOcl9ZL0r5Lutu061n1R0rmSyiTdLun7tt9fx7q3Sjo1f0yUVHNcvNa2IuKApAV6dz9/WtLjEbGzge8JKYoIHu3woSycXlH+O0U+b6Wkr+XTV0paXuM1oSyULel1SacWLPugpK359DhJb0nqXLC8j6S9krrnz++T9A911DZOUkWNebcpG6eVpPK8lpMLlldK+lTB859I+ut8epGkawqWHSPpDWUfIh+R9CtJfyzpmHr67OeSphb0zwsFy96T13RSA/t/vaRP1LFsi6RJBc+n1eyPutpS9uHxcvV7kbRG0qWl/vfGo3kP9qjbrz6SfhP5/825/2vga3srC6a1+TDD7yQtzudX2xkR+6ufRLZXvELSJbZ7KNtrnt+cNyBpR8H0m7U875pP95c0s6DW3yr7sDk5IpZImi3p25J22J5ru3sDt7+9eiIi3sgnu9a2ou0rbK8vqGGosj3x2vRRFrbV3vV3OVJbEbFa2Yfo+bbPUPbB+rMGvh8kiqBuv16VdHKNr+r9CqZfVxbGkiTbJxUs26UsCIdERI/8URYRhSFV2+FE/6Xsa/kUSasi4jd11FbsQ5FelnRtQa09IqJLRKyUpIiYFRGjlA2ZnC7p74u58Xw8/D8l3SCpZ0T0kLRR2YdFbV6VdErB88N/lwa2Vd3Pn5V0X+EHJlongrr9WiXpoKQbbXe0fbGkswuWb5A0xPZw252VDT1IkiKiSllYfNP2+yTJ9sm2J9azzZ9KGinpJmVj1nXZIamn7bJGvqe6zJH0j7aHSJLtMttT8ukxtsfa7qTsw2m/pENF2m6145V9+OzMt3mVsr3guvw4r/cE230lfa6RbX1P0mRlYX2kfkYrQVC3UxHxlrIf166UtFvZj2j3Fyz/laSvSHpc0q8lLa/RxBckvSDpF7b35Ot9oJ5tvqls7HhA4bZqWe85ZT/2bcm/3vdpzHurpb0HlP34uSCvdaOyo
RdJ6q7sQ2e3siGGSkkzmrO9Wra/SdK/Kftw3CHpTGXDQHW5Pa9lq6THlAVvg9uKiApJ65QF+hPFeh8oHU54wWG25yn70epLLbiN6ZJOj4jP1Lsymsz2PcqOnGmxvyWOHk5EwFFj+72SrlE2dooWYrtc2belEaWtBMXC0AeOCtt/qexHvUURsazU9bRVtr+qbGjnjojYWup6UBwMfQBA4tijBoDEEdQAkLgW+TGxV69eUV5e3hJNA0CbtHbt2l0RUevFs1okqMvLy7VmzZqWaBoA2iTbdV7CgaEPAEgcQQ0AiSOoASBxnJkIoEnefvttVVRUaP9+Ls7XGJ07d1bfvn3VqVOnBr+GoAbQJBUVFerWrZvKy8tV941tUCgiVFlZqYqKCg0YMKDBr6t36MP2KbaX5vebe9b2Tc2qFECbsH//fvXs2ZOQbgTb6tmzZ6O/hTRkj/qgpL+NiHW2uym7q8f/5JdbBNCOEdKN15Q+q3ePOiJejYh1+fReSZuV3cgUAEpq1qxZGjRokC6//PJSl9KiGjVGnV8+cYSk1bUsm6bsJpzq169fzcXNVn7zI0Vvs9i2ff2CUpfQIK2hLyX6s9hauj+L3Q8NqffOO+/UokWL3jXee/DgQXXs2LZ+fmvw4Xm2u+qdOzvvqbk8IuZGxOiIGN27d61nQQJA0Vx33XXasmWLLrroIpWVlWnatGmaMGGCrrjiCu3cuVOXXHKJxowZozFjxmjFiuwmOJWVlZowYYJGjBiha6+9Vv3799euXbu0bds2DR36zh3NZsyYodtuu02S9OKLL2rSpEkaNWqUzj33XD333HOSpB07dmjy5MkaNmyYhg0bppUrV+rLX/6yZs6cebidW265RbNmzWr2e23Qx05+P7mfSJofEXXeQgkAjpY5c+Zo8eLFWrp0qWbPnq2HHnpIy5cvV5cuXXTZZZfp85//vM455xy99NJLmjhxojZv3qzbb79d55xzjqZPn65HHnlEc+fOrXc706ZN05w5czRw4ECtXr1a119/vZYsWaIbb7xR559/vh544AEdOnRI+/btU58+fXTxxRfrpptuUlVVlRYsWKAnn3yy2e+13qDO71J9t6TNEfHvzd4iALSAiy66SF26dJEkPf7449q06Z3jHfbs2aO9e/dq2bJluv/+bF/zggsu0AknnHDENvft26eVK1dqypQph+cdOHBAkrRkyRJ997vZvYM7dOigsrIylZWVqWfPnnr66ae1Y8cOjRgxQj179mz2e2vIHvWHlN066Rnb6/N5X4yI/2721gGgSI4//vjD01VVVVq1atXh4C5U21EXHTt2VFVV1eHn1YfPVVVVqUePHlq/fv0fvKYuU6dO1bx587R9+3ZdffXVjXkLdWrIUR/LI8IRcVZEDM8fhDSAZE2YMEGzZ88+/Lw6aM877zzNnz9fkrRo0SLt3r1bknTiiSfqtddeU2VlpQ4cOKCHH35YktS9e3cNGDBACxculJSdsLJhwwZJ0vjx43XXXXdJkg4dOqQ9e7Kf7iZPnqzFixfrqaee0sSJE4vyfrjWB4A2Z9asWVqzZo3OOussDR48WHPmzJEk3XrrrVq2bJlGjhypxx577PARap06ddL06dM1duxYXXjhhTrjjDMOtzV//nzdfffdGjZsmIYMGaIHH3xQkjRz5kwtXbpUZ555pkaNGqVnn31WknTsscfqwx/+sC699FJ16NChKO+nRe6ZOHr06Cj29ahbwyFQHE5WXPRncRW7Pzdv3qxBgwYVtc2jrfra+b169Spam1VVVRo5cqQWLlyogQMH1rpObX1ne21EjK5tffaoAaBINm3apNNOO03jx4+vM6Sbom0dFQ4AjbBt27aitjd48GBt2bKlqG1K7FEDQPIIagBN1hK/cbV1TekzghpAk3Tu3FmVlZWEdSNUX4+6c+fOjXodY9QAmqRv376qqKjQzp07S11Kq1J9h5fGIKgBNEmnTp0adZcSNB1DHwCQOIIaABJHUANA4ghqA
EgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAImrN6ht32P7Ndsbj0ZBAIB3a8ge9TxJk1q4DgBAHeoN6ohYJum3R6EWAEAtGKMGgMR1LFZDtqdJmiZJ/fr1K1azANqZ8psfKXUJ9dr29QuO6vaKtkcdEXMjYnREjO7du3exmgWAdo+hDwBIXEMOz/uhpFWSPmC7wvY1LV8WAKBavWPUEfHpo1EIAKB2DH0AQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkLgGBbXtSbaft/2C7ZtbuigAwDvqDWrbHSR9W9JHJQ2W9Gnbg1u6MABApiF71GdLeiEitkTEW5IWSPpEy5YFAKjmiDjyCvYnJU2KiKn5889KGhsRN9RYb5qkafnTD0h6vvjlFlUvSbtKXUQbQn8WF/1ZXK2hP/tHRO/aFnRswItdy7w/SPeImCtpbiMLKxnbayJidKnraCvoz+KiP4urtfdnQ4Y+KiSdUvC8r6RXWqYcAEBNDQnqpyQNtD3A9rGS/lzSz1q2LABAtXqHPiLioO0bJD0qqYOkeyLi2RavrOW1mmGaVoL+LC76s7hadX/W+2MiAKC0ODMRABJHUANA4ghqAEgcQY0msX2G7fG2u9aYP6lUNbVmts+2PSafHmz7b2x/rNR1tTa2x9runk93sX277Ydsf8N2WanrayqCWpLtq0pdQ2ti+0ZJD0r6nKSNtgsvKfDPpamq9bJ9q6RZku6y/S+SZkvqKulm27eUtLjW5x5Jb+TTMyWVSfpGPu/eUhXVXBz1Icn2SxHRr9R1tBa2n5H0wYjYZ7tc0n2SvhcRM20/HREjSlpgK5P353BJx0naLqlvROyx3UXS6og4q6QFtiK2N0fEoHx6XUSMLFi2PiKGl666pmvIKeRtgu1f1rVI0olHs5Y2oENE7JOkiNhme5yk+2z3V+2XHMCRHYyIQ5LesP1iROyRpIh403ZViWtrbTbavioi7pW0wfboiFhj+
3RJb5e6uKZqN0GtLIwnStpdY74lrTz65bRq220Pj4j1kpTvWV+o7GvnmaUtrVV6y/Z7IuINSaOqZ+ZjqgR140yVNNP2l5RdhGmV7ZclvZwva5XazdCH7bsl3RsRy2tZ9oOIuKwEZbVKtvsq2wvcXsuyD0XEihKU1WrZPi4iDtQyv5ek90fEMyUoq1Wz3U3SHynbGa2IiB0lLqlZ2k1QA0BrxVEfAJA4ghoAEkdQo12y3cP29QXP+9i+r5Q1AXVhjBptiu0O+aFu9a1XLunhiBja4kUBzcQeNUrG9i22n7f9uO0f2v67fP7PbY/Op3vZ3pZPd7B9h+2nbP/S9rX5/HG2l9r+gaRnbH/V9k0F2/mn/GzKQl+XdKrt9Xmb5bY35utfafun+anHW23fkJ/S/bTtX9h+b77eqbYX215r+wnbZ+Tzp9jeaHuD7WUt24toD9rTcdRIiO1Ryu4WNELZv8N1ktbW87JrJP0+IsbYPk7SCtuP5cvOljQ0Irbme8v3Kzue9ph8O2fXaOvmfP3heT3lNZYPzWvrLOkFSV+IiBG2vynpCknfUnYx+usi4te2x0q6U9JHJE2XNDEifmO7RwO7BKgTQY1SOVfSA/lJHrLdkNu7TZB0lu1P5s/LJA2U9JakJyNiq3T4bMlK2yOUnej0dERUNrK+pRGxV9Je27+X9FA+/5m8hq6S/kTSQvvwyZjH5f9dIWme7R8r+8AAmoWgRinV9QPJQb0zLNe5YL4lfS4iHi1cOT+F/fUabXxH0pWSTlJ2xmRjFZ6AUlXwvErZ/zfHSPpdbdeOiIjr8j3sCyStz8/ibOwHBXAYY9QolWWSJueXouwm6eMFy7bpnVOpP1kw/1FJf2W7kyTZPt328XW0/4CkSZLG5K+raa+kbk0tPr8ex1bbU/JabHtYPn1qRKyOiOnKTmM+panbASSCGiUSEesk/UjSekk/kfREweIZygJ5paReBfO/I2mTpHX5D3//oTq+FUbEW5KWSvpxbUeB5Hu4K/If/e5o4tu4XNI1tjdIelZS9eVe77D9TF7jMkkbmtg+IInD85AI27dJ2hcRM4rU3jHKfqCcEhG/LkabQKmwR402x/ZgZUdq/C8hjbaAPWoASBx71ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBx/w9Umq0adTwQDwAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "for i in range (len(query_day)):\n", + " if query_day[i] is not NaN:\n", + " plot_day(query_day[i])" + ] + }, + { + "cell_type": "markdown", + "id": "worst-musical", + "metadata": {}, + "source": [ + "### Plot query times in a month" + ] + }, + { + "cell_type": "markdown", + "id": "adverse-bruce", "metadata": {}, - "outputs": [], "source": [ - "df = pd.DataFrame(data = {'query times in a day':query_day,'query times in a month':query_month})\n", - "df.loc['mean'] = [mean_day,mean_month]\n", - "df" + "Note: not all users have data on the same number of months. Each graph represents query times for a user in a month." ] }, { "cell_type": "code", - "execution_count": null, - "id": "incoming-gibson", + "execution_count": 13, + "id": "removed-sheffield", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAETCAYAAAAmkv2xAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAZbElEQVR4nO3de5RV5Z3m8e/DLSSC0EJ1K4VF0QGNigqIqIto42VU7BCj8ZqLrWmHiZdpzWrpGE0UWaaj6Yw9GhLRaW2COjKNiQ6J1yQ2E21bQC4iilE0lVANXkBFUCFcfvPH3kUfD6fqnIJTdeCt57PWWey93/fs/atd1FO73rMvigjMzGzP163WBZiZWXU40M3MEuFANzNLhAPdzCwRDnQzs0Q40M3MEuFAtz2GpGsk/VONa3hU0l/VsoaOJqlRUkjqUetarH3k89BtdyRpPHBvRAyudS2pk9QEXBwRv8rnG4HfAT0jYkvtKrP28hG6dQof7Zl1PAd6FydplKRFktZL+j+SZkm6MW+7UNLTRf1D0rB8+hOSfiDpD5LelDRd0ifztvGSmiV9U9IbwD9LWiZpYsG6ekpaI2lk0Tb2Ah4FBknakL8GSZoi6d68T8uwwEWSVkp6V9LXJR0paamk9yRNK1rv1yQtz/s+LmlIvlyS/lHSW5LW5e8f0cr+mivp4sL9k++DdyX9TtKENvb11ZJey/f1S5LOaKPvFEmzJd2b939B0gGSvpXXuVLSyQX9B0maI+kdSSsk/deidf2LpJn5ul6UNCZvuwdoAH6e7+e/Kyjjy/n3do2ka1ur1XYfDvQuTFIv4CHgHmAfYDbwxXas4mbgAGAkMAyoB64raN83X+8QYBIwE/hKQftpwOqIWFK40oj4AJgArIqIPvlrVSs1HAUMB84F/idwLXAScAhwjqS/yL/WLwDXAGcCdcBTwP35Ok4Gjsu/lv75utZWuA+OAn4LDAS+D9wlSa30fQ04FugH3ADcK2m/NtY9kex78yfAYuBxsp/ZemAqcEdB3/uBZmAQcBbw95JOLGj/PDAr//rmANMAIuKrwB+Aifl+/n7Bez4LHAicCFwn6aA2arXdQUT41UVfZCG2ivyzlHzZM8CN+fSFwNNF7wmy8BbwAfDpgrZjgN/l0+OBPwK9C9oHAeuBvfP5B4C/a6W28UBz0bIpZOPqAI15LfUF7WuBcwvmfwpcmU8/Cvx1QVs34EOyXzYnAK8ARwPdyuyzuWTjzS37Z0VB26fymvatcP8vAU5vpW0K8MuC+YnABqB7Pt8331Z/YH9gK9C3oP/3gBkF6/pVQdvBwEcF803ASQXzLft2cMGy+cB5tf4/61fbLx+hd22DgP+I/Cc29/sK31tHFmAL8+GN94DH8uUt3o6IjS0zkR1l/xvwRUn9yY7C79uVLwB4s2D6oxLzffLpIcCtBbW+Q/ZLqT4iniQ7Yv0R8KakOyXtXeH232iZiIgP88k+pTpKukDSkoIaRpAd2Vf6ta2JiK0F8y3bGgS8ExHrC/r/nuxIfoc6yX6R9a7gc43i95T8umz34UDv2lYD9UVDBA0F0x+QhTYAkvYtaFtDFiqHRET//NUvIgp/6EudQvUTsmGXs4F/j4j/aKW2ap9+tRL4bwW19o+IT0bEMwARcVtEHEE2VHMAMLmaG8/H6/8XcDkwICL6A8vIfqnsqlXAPpL6FixrAFrbt8V8qlsiHOhd278DW4C/kdRD0pnA2IL254FDJI2U1JvsT3cAImIbWUD9o6Q/BZBUL+mUMtt8CBgNXEE2pt6aN4EBkvq182tqzXTgW5IOAZDUT9LZ+fSRko6S1JPsl9hGsiGMatqLLDjfzrd5EdkR+i6Li
JVkQ2Xfk9Rb0mHAX1P5Xz9vAn9ejVqsthzoXVhE/JHsQ8ILgXfJPgz8WUH7K2Qfvv0KeBV4umgV3wRWAM9Kej/vd2CZbX5ENrY9tHBbJfq9TPZB3+v5EMWg9nxtJdb3INmHuLPyWpeRDfkA7E32y+ldsqGKtcAPdmV7Jbb/EvA/yH6JvgkcSjb8VC3nk419rwIeBK6PiF9W+N7vAd/O9/NVVazJOpkvLLKPkTSD7MPIb3fgNq4DDoiIr5TtbGYV88Ue1qkk7UM2HPDVWtdilhoPuVinyS92WQk8GhG/qXU9ZqnxkIuZWSJ8hG5mlggHuplZImr2oejAgQOjsbGxVps3M9sjLVy4cE1E1JVqq1mgNzY28txzz9Vq82ZmeyRJrd6ew0MuZmaJcKCbmSXCgW5mlghfKWpmrdq8eTPNzc1s3LixfGerqt69ezN48GB69uxZ8Xsc6GbWqubmZvr27UtjYyOtP4jJqi0iWLt2Lc3NzQwdOrTi95Udcslvxzlf0vP5swhvKNFHkm7Ln2W4VNLodtZvZruhjRs3MmDAAId5J5PEgAED2v2XUSVH6JuAEyJiQ36/6KclPRoRzxb0mUD2XMfhZM9YvD3/18z2cA7z2tiZ/V72CD0yG/LZnvmr+AYwpwMz877PAv3LPPzWzGy399577/HjH/94+/yqVas466yzalhR2yoaQ5fUHVhI9nDgH0XEvKIu9WR30WvRnC9bXbSeSWRPf6ehofBJZ2Zta7z64VqX0KGabvrLWpdQkWp/H2r1dW/dupXu3buX7dcS6JdeeikAgwYN4oEHHujo8nZaRactRsTWiBgJDAbGSip+dFapvw12uI1jRNwZEWMiYkxdXckrV83MPua73/0uBx54ICeddBLnn38+P/hB9jCp8ePHb7/afM2aNbTcSmTr1q1MnjyZI488ksMOO4w77rgDgLlz53L88cfzpS99iUMPPZTvfOc73Hrrrdu3c+2113Lbbbd9bNtXX301r732GiNHjmTy5Mk0NTUxYkQWfzNmzOALX/gCEydOZOjQoUybNo1bbrmFUaNGcfTRR/POO+8A8Nprr3HqqadyxBFHcOyxx/Lyyy8DMHv2bEaMGMHhhx/OcccdV5V91a6zXCLiPUlzgVPJHuHVohnYv2B+MNmjsMzMdtrChQuZNWsWixcvZsuWLYwePZojjjiizffcdddd9OvXjwULFrBp0ybGjRvHySefDMD8+fNZtmwZQ4cOpampiTPPPJMrrriCbdu2MWvWLObPn/+xdd10000sW7aMJUuWANDU1PSx9mXLlrF48WI2btzIsGHDuPnmm1m8eDHf+MY3mDlzJldeeSWTJk1i+vTpDB8+nHnz5nHppZfy5JNPMnXqVB5//HHq6+t57733qrK/yga6pDpgcx7mnwROIns2Y6E5wOWSZpF9GLouIlZjZrYLnnrqKc444ww+9alPAfD5z3++7HueeOIJli5dun1oZN26dbz66qv06tWLsWPHbj8NsLGxkQEDBrB48WLefPNNRo0axYABA9pV3/HHH0/fvn3p27cv/fr1Y+LEiQAceuihLF26lA0bNvDMM89w9tlnb3/Ppk2bABg3bhwXXngh55xzDmeeeWa7ttuaSo7Q9wN+ko+jdwP+JSJ+IenrABExHXgEOI3sgcEfAhdVpToz6/JaO9ujR48ebNu2DeBjp/dFBD/84Q855ZRTPtZ/7ty57LXXXh9bdvHFFzNjxgzeeOMNvva1r7W7tk984hPbp7t167Z9vlu3bmzZsoVt27bRv3//7Uf4haZPn868efN4+OGHGTlyJEuWLGn3L5RilZzlsjQiRkXEYRExIiKm5sun52HecibMZRHx6Yg4NCJ8G0Uz22XHHXccDz74IB999BHr16/n5z//+fa2xsZGFi5cCPCxDypPOeUUbr/9djZv3gzAK6+8wgcffFBy/WeccQaPPfYYCxYs2OEXAEDfvn1Zv379Tte/9957M3ToUGbPng1kv2yef
/55IBtbP+qoo5g6dSoDBw5k5cqVba2qIr5S1Mx2W6NHj+bcc89l5MiRDBkyhGOPPXZ721VXXcU555zDPffcwwknnLB9+cUXX0xTUxOjR48mIqirq+Ohhx4quf5evXpx/PHH079//5JnvQwYMIBx48YxYsQIJkyYwGWXXdbur+G+++7jkksu4cYbb2Tz5s2cd955HH744UyePJlXX32ViODEE0/k8MMPb/e6i9XsmaJjxowJ3w/dKuXTFmtj+fLlHHTQQbUuY7spU6bQp08frrrqqqqsb9u2bYwePZrZs2czfPjwqqyzmkrtf0kLI2JMqf6+26KZdUkvvfQSw4YN48QTT9wtw3xneMjFzPYYU6ZMqdq6Dj74YF5//fWqrW934CN0M7NEONDNrE21+pytq9uZ/e5AN7NW9e7dm7Vr1zrUO1nL/dB79+7drvd5DN3MWjV48GCam5t5++23a11Kl9PyxKL2cKCbWat69uzZrifmWG15yMXMLBEOdDOzRDjQzcwS4UA3M0uEA93MLBEOdDOzRDjQzcwS4UA3M0uEA93MLBEOdDOzRDjQzcwS4UA3M0uEA93MLBEOdDOzRDjQzcwS4UA3M0tE2UCXtL+kf5W0XNKLkq4o0We8pHWSluSv6zqmXDMza00lTyzaAvxtRCyS1BdYKOmXEfFSUb+nIuJz1S/RzMwqUfYIPSJWR8SifHo9sByo7+jCzMysfdo1hi6pERgFzCvRfIyk5yU9KumQKtRmZmbtUPFDoiX1AX4KXBkR7xc1LwKGRMQGSacBDwHDS6xjEjAJoKGhYaeLNjOzHVV0hC6pJ1mY3xcRPytuj4j3I2JDPv0I0FPSwBL97oyIMRExpq6ubhdLNzOzQpWc5SLgLmB5RNzSSp99835IGpuvd201CzUzs7ZVMuQyDvgq8IKkJfmya4AGgIiYDpwFXCJpC/ARcF5ERAfUa2ZmrSgb6BHxNKAyfaYB06pVlJmZtZ+vFDUzS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEWUDXdL+kv5V0nJJL0q6okQfSbpN0gpJSyWN7phyzcysNT0q6LMF+NuIWCSpL7BQ0i8j4qWCPhOA4fnrKOD2/F8zM+skZY/QI2J1RCzKp9cDy4H6om6nAzMj8yzQX9J+Va/WzMxaVckR+naSGoFRwLyipnpgZcF8c75sddH7JwGTABoaGtpX6S5qvPrhTt1eZ2u66S9rXYJZSf7Z6zwVfygqqQ/wU+DKiHi/uLnEW2KHBRF3RsSYiBhTV1fXvkrNzKxNFQW6pJ5kYX5fRPysRJdmYP+C+cHAql0vz8zMKlXJWS4C7gKWR8QtrXSbA1yQn+1yNLAuIla30tfMzDpAJWPo44CvAi9IWpIvuwZoAIiI6cAjwGnACuBD4KLql2pmZm0pG+gR8TSlx8gL+wRwWbWKMjOz9vOVomZmiXCgm5klwoFuZpYIB7qZWSIc6GZmiXCgm5klwoFuZpYIB7qZWSIc6GZmiXCgm5klwoFuZpYIB7qZWSIc6GZmiXCgm5klwoFuZpYIB7qZWSIc6GZmiXCgm5klwoFuZpYIB7qZWSIc6GZmiXCgm5klwoFuZpYIB7qZWSIc6GZmiSgb6JLulvSWpGWttI+XtE7Skvx1XfXLNDOzcnpU0GcGMA2Y2UafpyLic1WpyMzMdkrZI/SI+A3wTifUYmZmu6BaY+jHSHpe0qOSDqnSOs3MrB0qGXIpZxEwJCI2SDoNeAgYXqqjpEnAJICGhoYqbNrMzFrs8hF6RLwfERvy6UeAnpIGttL3zogYExFj6urqdnXTZmZWYJcDXdK+kpRPj83XuXZX12tmZu1TdshF0v3AeGCgpGbge
qAnQERMB84CLpG0BfgIOC8iosMqNjOzksoGekScX6Z9GtlpjWZmVkO+UtTMLBEOdDOzRDjQzcwS4UA3M0uEA93MLBEOdDOzRDjQzcwS4UA3M0uEA93MLBEOdDOzRDjQzcwS4UA3M0uEA93MLBEOdDOzRDjQzcwS4UA3M0uEA93MLBEOdDOzRDjQzcwS4UA3M0uEA93MLBEOdDOzRDjQzcwS4UA3M0uEA93MLBEOdDOzRJQNdEl3S3pL0rJW2iXpNkkrJC2VNLr6ZZqZWTmVHKHPAE5to30CMDx/TQJu3/WyzMysvcoGekT8BninjS6nAzMj8yzQX9J+1SrQzMwqU40x9HpgZcF8c75sB5ImSXpO0nNvv/12FTZtZmYtqhHoKrEsSnWMiDsjYkxEjKmrq6vCps3MrEU1Ar0Z2L9gfjCwqgrrNTOzdqhGoM8BLsjPdjkaWBcRq6uwXjMza4ce5TpIuh8YDwyU1AxcD/QEiIjpwCPAacAK4EPgoo4q1szMWlc20CPi/DLtAVxWtYrMzGyn+EpRM7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEVBTokk6V9FtJKyRdXaJ9vKR1kpbkr+uqX6qZmbWlR7kOkroDPwL+C9AMLJA0JyJeKur6VER8rgNqNDOzClRyhD4WWBERr0fEH4FZwOkdW5aZmbVXJYFeD6wsmG/OlxU7RtLzkh6VdEhVqjMzs4qVHXIBVGJZFM0vAoZExAZJpwEPAcN3WJE0CZgE0NDQ0M5SzcysLZUcoTcD+xfMDwZWFXaIiPcjYkM+/QjQU9LA4hVFxJ0RMSYixtTV1e1C2WZmVqySQF8ADJc0VFIv4DxgTmEHSftKUj49Nl/v2moXa2ZmrSs75BIRWyRdDjwOdAfujogXJX09b58OnAVcImkL8BFwXkQUD8uYmVkHqmQMvWUY5ZGiZdMLpqcB06pbmpmZtYevFDUzS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsEQ50M7NEONDNzBLhQDczS4QD3cwsERUFuqRTJf1W0gpJV5dol6Tb8valkkZXv1QzM2tL2UCX1B34ETABOBg4X9LBRd0mAMPz1yTg9irXaWZmZVRyhD4WWBERr0fEH4FZwOlFfU4HZkbmWaC/pP2qXKuZmbWhRwV96oGVBfPNwFEV9KkHVhd2kjSJ7AgeYIOk37ar2j3LQGBNZ21MN3fWlroMf//2XKl/74a01lBJoKvEstiJPkTEncCdFWxzjyfpuYgYU+s6bOf4+7fn6srfu0qGXJqB/QvmBwOrdqKPmZl1oEoCfQEwXNJQSb2A84A5RX3mABfkZ7scDayLiNXFKzIzs45TdsglIrZIuhx4HOgO3B0RL0r6et4+HXgEOA1YAXwIXNRxJe8xusTQUsL8/dtzddnvnSJ2GOo2M7M9kK8UNTNLhAPdzCwRDnQzs0RUch66VUDSZ8iumK0nOwd/FTAnIpbXtDCzxOU/e/XAvIjYULD81Ih4rHaVdT4foVeBpG+S3RJBwHyyUz0F3F/qZma255DkM7Z2Y5L+Bvi/wH8HlkkqvC3J39emqtrxWS5VIOkV4JCI2Fy0vBfwYkQMr01ltqsk/SEiGmpdh5Um6QXgmIjYIKkReAC4JyJulbQ4IkbVtMBO5iGX6tgGDAJ+X7R8v7zNdmOSlrbWBPxZZ9Zi7da9ZZglIpokjQcekDSE0rckSZoDvTquBH4t6VX+8yZlDcAw4PKaVWWV+jPgFODdo
uUCnun8cqwd3pA0MiKWAORH6p8D7gYOrW1pnc+BXgUR8ZikA8huNVxPFgTNwIKI2FrT4qwSvwD6tIRCIUlzO78ca4cLgC2FCyJiC9mtSO6oTUm14zF0M7NE+CwXM7NEONDNzBLhQLcuRVJ/SZcWzI+X9IsK3jdV0knt3FaTpIE7U6fZznCgW1fTH7i0bK8iEXFdRPyqA+oxqxoHuu2WJDVKelnSP0laJuk+SSdJ+jdJr0oam/fbR9JDkpZKelbSYfnyKZLuljRX0uv5FYUANwGflrRE0j/ky/pIeiDf3n2Sdjh/WdIMSWfl002SbpC0SNIL+aXnSBog6QlJi/MzLFTw/q9Imp9v9w5J3SUdmdfdW9Jekl6UNKLj9qqlzoFuu7NhwK3AYcBngC8BnwWuAq7J+9wALI6Iw/JlMwve/xmy88vHAtdL6glcDbwWESMjYnLebxTZtQQHA38OjKugtjURMRq4Pa8H4Hrg6fzqxDlk1yIg6SDgXGBcRIwEtgJfjogFeb8bge8D90bEsgr3jdkOfB667c5+FxEvAEh6Efh1RER+uXdj3uezwBcBIuLJ/Ci5X972cERsAjZJeovWr/qcHxHN+XaW5Ot+ukxtP8v/XQicmU8f1zIdEQ9LarlQ6UTgCGBBfvD/SeCtvG0q2b1/NgItf0WY7RQHuu3ONhVMbyuY38Z//t8tdXl3y8UVhe/fSuv/3yvtV+o9xf1LXdgh4CcR8a0SbfsAfYCeQG/ggwq2bVaSh1xsT/cb4MuQnbFCNhTyfhv91wN9O6GWCcCf5Mt/DZwl6U/ztn3ye41A9vzL7wD3ATd3UF3WRfgI3fZ0U4B/zm+w9SHwV211joi1+Qery4BHgYerWMsNZLdMXgT8P+AP+TZfkvRt4AlJ3YDNwGWS/gLYEhH/W1J34BlJJ0TEk1WsyboQX/pvZpYID7mYmSXCgW5mlggHuplZIhzoZmaJcKCbmSXCgW5mlggHuplZIhzoZmaJ+P/gy4GhD8MdzgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ - "df.plot(kind = 'bar')" + "for i in range (len(query_month)):\n", + " if query_month[i] is not NaN:\n", + " query_month_df = pd.DataFrame(data = {'query times':query_month[i]})\n", + " query_month_df.plot(xlabel='month index',kind = 'bar',title='query times in a month')" ] } ], diff --git a/tour_model_eval/v-measurel_bins_all_user.ipynb b/tour_model_eval/v-measurel_bins_all_user.ipynb index a4af2bd..c74fe3c 100644 --- a/tour_model_eval/v-measurel_bins_all_user.ipynb +++ b/tour_model_eval/v-measurel_bins_all_user.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "hungry-polish", + "id": "spoken-acrobat", "metadata": { "scrolled": true }, @@ -28,7 +28,7 @@ { "cell_type": "code", "execution_count": null, - "id": "coupled-transportation", + "id": "solid-decimal", "metadata": {}, "outputs": [], "source": [ @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "interstate-queue", + "id": "acute-departure", "metadata": {}, "outputs": [], "source": [ @@ -49,7 +49,7 @@ { "cell_type": "code", "execution_count": null, - "id": "norwegian-victoria", + "id": "practical-flour", "metadata": {}, "outputs": [], "source": [ @@ -59,7 +59,7 @@ }, { "cell_type": "markdown", - "id": "loved-estonia", + "id": "indirect-retrieval", "metadata": {}, "source": [ "## Bins above cutoff" @@ -67,7 +67,7 @@ }, { "cell_type": "markdown", - "id": "laughing-macintosh", + "id": "deadly-microwave", "metadata": {}, "source": [ "### Original user input" @@ -76,7 +76,7 @@ { "cell_type": "code", "execution_count": null, - "id": "amber-photograph", + "id": "strange-dining", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +86,7 @@ { "cell_type": "code", "execution_count": null, - "id": "collected-throat", + "id": "black-mobility", "metadata": {}, "outputs": [], "source": [ @@ -95,7 +95,7 @@ }, { "cell_type": "markdown", - "id": 
"headed-dating", + "id": "composite-possibility", "metadata": {}, "source": [ "### After changing language" @@ -104,7 +104,7 @@ { "cell_type": "code", "execution_count": null, - "id": "blank-arnold", + "id": "wanted-mustang", "metadata": { "scrolled": true }, @@ -116,7 +116,7 @@ { "cell_type": "code", "execution_count": null, - "id": "maritime-twins", + "id": "productive-palestine", "metadata": {}, "outputs": [], "source": [ @@ -125,7 +125,7 @@ }, { "cell_type": "markdown", - "id": "consistent-kuwait", + "id": "heard-florist", "metadata": {}, "source": [ "### After converting purposes and replaced mode" @@ -134,7 +134,7 @@ { "cell_type": "code", "execution_count": null, - "id": "confused-sensitivity", + "id": "smaller-creek", "metadata": {}, "outputs": [], "source": [ @@ -144,7 +144,7 @@ { "cell_type": "code", "execution_count": null, - "id": "advance-passing", + "id": "square-importance", "metadata": {}, "outputs": [], "source": [ @@ -153,7 +153,7 @@ }, { "cell_type": "markdown", - "id": "bronze-operations", + "id": "approximate-groove", "metadata": {}, "source": [ "### DataFrame" @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "pacific-reflection", + "id": "studied-saint", "metadata": {}, "outputs": [], "source": [ @@ -176,7 +176,100 @@ }, { "cell_type": "markdown", - "id": "documented-vacation", + "id": "wrapped-rebate", + "metadata": {}, + "source": [ + "#### homogeneity_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "suburban-insertion", + "metadata": {}, + "outputs": [], + "source": [ + "homo_df = pd.DataFrame(data={'original user input':homo_score_ori,'after translation':homo_score_sp2en,\n", + " 'after converting purposes and replaced mode':homo_score_cvt}, \n", + " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", + " 'user9','user10','user11','user12','user13'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "everyday-conditioning", + "metadata": 
{}, + "outputs": [], + "source": [ + "homo_df.plot(kind = 'bar',title='homogeneity_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + ] + }, + { + "cell_type": "markdown", + "id": "engaged-injury", + "metadata": {}, + "source": [ + "#### completeness_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "mighty-consolidation", + "metadata": {}, + "outputs": [], + "source": [ + "comp_df = pd.DataFrame(data={'original user input':comp_score_ori,'after translation':comp_score_sp2en,\n", + " 'after converting purposes and replaced mode':comp_score_cvt},\n", + " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", + " 'user9','user10','user11','user12','user13'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "christian-revelation", + "metadata": {}, + "outputs": [], + "source": [ + "comp_df.plot(kind = 'bar',title='completeness_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + ] + }, + { + "cell_type": "markdown", + "id": "lovely-particle", + "metadata": {}, + "source": [ + "#### v_measure_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "documented-perfume", + "metadata": {}, + "outputs": [], + "source": [ + "v_df = pd.DataFrame(data={'original user input':v_score_ori,'after translation':v_score_sp2en,\n", + " 'after converting purposes and replaced mode':v_score_cvt},\n", + " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", + " 'user9','user10','user11','user12','user13'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "published-kruger", + "metadata": {}, + "outputs": [], + "source": [ + "v_df.plot(kind = 'bar',title='v_measure_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + ] + }, + { + "cell_type": "markdown", + "id": "large-cement", "metadata": {}, "source": [ "## All bins" @@ -184,7 +277,7 @@ }, { "cell_type": "markdown", - "id": "selected-compiler", + "id": "wrapped-channels", 
"metadata": {}, "source": [ "### Original user input" @@ -193,7 +286,7 @@ { "cell_type": "code", "execution_count": null, - "id": "tight-contract", + "id": "assigned-pearl", "metadata": {}, "outputs": [], "source": [ @@ -203,7 +296,7 @@ { "cell_type": "code", "execution_count": null, - "id": "regular-shadow", + "id": "rotary-macro", "metadata": {}, "outputs": [], "source": [ @@ -212,7 +305,7 @@ }, { "cell_type": "markdown", - "id": "brutal-trinidad", + "id": "communist-harvard", "metadata": {}, "source": [ "### After changing language" @@ -221,7 +314,7 @@ { "cell_type": "code", "execution_count": null, - "id": "decent-machinery", + "id": "median-speed", "metadata": {}, "outputs": [], "source": [ @@ -231,7 +324,7 @@ { "cell_type": "code", "execution_count": null, - "id": "lasting-synthetic", + "id": "present-installation", "metadata": {}, "outputs": [], "source": [ @@ -240,7 +333,7 @@ }, { "cell_type": "markdown", - "id": "sexual-opportunity", + "id": "armed-former", "metadata": {}, "source": [ "### After converting purposes and replaced mode" @@ -249,7 +342,7 @@ { "cell_type": "code", "execution_count": null, - "id": "periodic-freedom", + "id": "august-maintenance", "metadata": {}, "outputs": [], "source": [ @@ -259,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "stylish-thailand", + "id": "shaped-strip", "metadata": {}, "outputs": [], "source": [ @@ -268,7 +361,7 @@ }, { "cell_type": "markdown", - "id": "floppy-theory", + "id": "gentle-plenty", "metadata": {}, "source": [ "### DataFrame" @@ -277,7 +370,7 @@ { "cell_type": "code", "execution_count": null, - "id": "british-working", + "id": "architectural-perspective", "metadata": {}, "outputs": [], "source": [ @@ -288,6 +381,99 @@ " index = ['original user input','after translation','after converting purposes and replaced mode'])\n", "all_df" ] + }, + { + "cell_type": "markdown", + "id": "periodic-password", + "metadata": {}, + "source": [ + "#### homogeneity_score" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "theoretical-oliver", + "metadata": {}, + "outputs": [], + "source": [ + "ab_homo_df=pd.DataFrame(data={'original user input':ab_homo_score_ori,'after translation':ab_homo_score_sp2en,\n", + " 'after converting purposes and replaced mode':ab_homo_score_cvt},\n", + " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", + " 'user9','user10','user11','user12','user13'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "seasonal-coalition", + "metadata": {}, + "outputs": [], + "source": [ + "ab_homo_df.plot(kind = 'bar',title='homogeneity_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + ] + }, + { + "cell_type": "markdown", + "id": "outdoor-pavilion", + "metadata": {}, + "source": [ + "#### completeness_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "activated-schedule", + "metadata": {}, + "outputs": [], + "source": [ + "ab_comp_df = pd.DataFrame(data={'original user input':ab_comp_score_ori,'after translation':ab_comp_score_sp2en,\n", + " 'after converting purposes and replaced mode':ab_comp_score_cvt},\n", + " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", + " 'user9','user10','user11','user12','user13'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "graphic-revision", + "metadata": {}, + "outputs": [], + "source": [ + "ab_comp_df.plot(kind = 'bar',title='completeness_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + ] + }, + { + "cell_type": "markdown", + "id": "golden-philadelphia", + "metadata": {}, + "source": [ + "#### v_measure_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "relevant-nebraska", + "metadata": {}, + "outputs": [], + "source": [ + "ab_v_df = pd.DataFrame(data={'original user input':ab_v_score_ori,'after translation':ab_v_score_sp2en,\n", + " 'after converting purposes and replaced mode':ab_v_score_cvt},\n", + " 
index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", + " 'user9','user10','user11','user12','user13'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "assisted-franchise", + "metadata": {}, + "outputs": [], + "source": [ + "ab_v_df.plot(kind = 'bar',title='v_measure_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + ] } ], "metadata": { diff --git a/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb b/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb index 67a0f56..f44bfdc 100644 --- a/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb +++ b/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "adverse-recipient", + "id": "secondary-armor", "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ { "cell_type": "code", "execution_count": null, - "id": "pleased-fence", + "id": "smooth-reviewer", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "global-leave", + "id": "accessible-jaguar", "metadata": {}, "outputs": [], "source": [ @@ -47,7 +47,7 @@ { "cell_type": "code", "execution_count": null, - "id": "hundred-surge", + "id": "endangered-device", "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ }, { "cell_type": "markdown", - "id": "funky-darkness", + "id": "prescription-disease", "metadata": {}, "source": [ "## Evaluate clusters above cutoff based on silhouette_score" @@ -65,7 +65,7 @@ }, { "cell_type": "markdown", - "id": "critical-prime", + "id": "fatal-castle", "metadata": {}, "source": [ "### Original user input" @@ -74,7 +74,7 @@ { "cell_type": "code", "execution_count": null, - "id": "copyrighted-vehicle", + "id": "federal-convertible", "metadata": {}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ { "cell_type": "code", "execution_count": null, - "id": "stainless-exposure", + "id": 
"polyphonic-astronomy", "metadata": {}, "outputs": [], "source": [ @@ -93,7 +93,7 @@ }, { "cell_type": "markdown", - "id": "returning-skiing", + "id": "later-share", "metadata": {}, "source": [ "### After changing language" @@ -102,7 +102,7 @@ { "cell_type": "code", "execution_count": null, - "id": "authorized-complement", + "id": "lesbian-realtor", "metadata": {}, "outputs": [], "source": [ @@ -112,7 +112,7 @@ { "cell_type": "code", "execution_count": null, - "id": "hispanic-drunk", + "id": "defensive-numbers", "metadata": {}, "outputs": [], "source": [ @@ -121,7 +121,7 @@ }, { "cell_type": "markdown", - "id": "united-grade", + "id": "amended-girlfriend", "metadata": {}, "source": [ "### After converting purposes and replaced mode" @@ -130,7 +130,7 @@ { "cell_type": "code", "execution_count": null, - "id": "outer-usage", + "id": "synthetic-debate", "metadata": {}, "outputs": [], "source": [ @@ -140,7 +140,7 @@ { "cell_type": "code", "execution_count": null, - "id": "positive-grass", + "id": "finite-ireland", "metadata": {}, "outputs": [], "source": [ @@ -149,7 +149,7 @@ }, { "cell_type": "markdown", - "id": "assured-oriental", + "id": "inclusive-champagne", "metadata": {}, "source": [ "### DataFrame" @@ -158,7 +158,7 @@ { "cell_type": "code", "execution_count": null, - "id": "authentic-compression", + "id": "protective-appraisal", "metadata": {}, "outputs": [], "source": [ @@ -169,6 +169,101 @@ " index = ['original user input','after translation','after converting purposes and replaced mode'])\n", "df" ] + }, + { + "cell_type": "markdown", + "id": "intended-campaign", + "metadata": {}, + "source": [ + "#### homogeneity_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "robust-matthew", + "metadata": {}, + "outputs": [], + "source": [ + "homo_df = pd.DataFrame(data={'original user input':homo_score_ori,'after translation':homo_score_sp2en,\n", + " 'after converting purposes and replaced mode':homo_score_cvt}, \n", + " 
index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", + " 'user9','user10','user11','user12','user13'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "surrounded-karma", + "metadata": {}, + "outputs": [], + "source": [ + "homo_df.plot(kind = 'bar',title='homogeneity_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + ] + }, + { + "cell_type": "markdown", + "id": "furnished-valve", + "metadata": {}, + "source": [ + "#### completeness_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "regulation-storage", + "metadata": {}, + "outputs": [], + "source": [ + "comp_df = pd.DataFrame(data={'original user input':comp_score_ori,'after translation':comp_score_sp2en,\n", + " 'after converting purposes and replaced mode':comp_score_cvt},\n", + " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", + " 'user9','user10','user11','user12','user13'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "liked-palestine", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "comp_df.plot(kind = 'bar',title='completeness_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + ] + }, + { + "cell_type": "markdown", + "id": "gorgeous-denver", + "metadata": {}, + "source": [ + "#### v_measure_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "quiet-capital", + "metadata": {}, + "outputs": [], + "source": [ + "v_df = pd.DataFrame(data={'original user input':v_score_ori,'after translation':v_score_sp2en,\n", + " 'after converting purposes and replaced mode':v_score_cvt},\n", + " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", + " 'user9','user10','user11','user12','user13'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "behavioral-embassy", + "metadata": {}, + "outputs": [], + "source": [ + "v_df.plot(kind = 
'bar',title='v_measure_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + ] } ], "metadata": { From 11f018beca808057d4bf40d124919d0b48af3fe5 Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Tue, 9 Mar 2021 04:15:27 -0800 Subject: [PATCH 10/16] modified evaluation code, got subplots for query times --- .../confirmed_trips_eval_bins_clusters.py | 51 +- tour_model_eval/query_times_all_users.ipynb | 1018 ++++------------- .../v-measurel_all_bins_single_user.ipynb | 108 +- .../v-measurel_bins_all_user.ipynb | 120 +- ...urel_clusters_above_cutoff_all_users.ipynb | 69 +- .../v-measurel_cutoff_bins_single_user.ipynb | 6 +- ...measurel_cutoff_clusters_single_user.ipynb | 10 +- 7 files changed, 413 insertions(+), 969 deletions(-) diff --git a/tour_model_eval/confirmed_trips_eval_bins_clusters.py b/tour_model_eval/confirmed_trips_eval_bins_clusters.py index 4f9652d..a538910 100644 --- a/tour_model_eval/confirmed_trips_eval_bins_clusters.py +++ b/tour_model_eval/confirmed_trips_eval_bins_clusters.py @@ -22,15 +22,31 @@ map_pur_dict = {'course':'school','work_- lunch break':'lunch_break','on_the way home':'home', 'insurance_payment':'insurance'} +def get_user_ls(all_users): + user_ls = [] + for i in range(len(all_users)): + curr_user = 'user' + str(i + 1) + user_ls.append(curr_user) + return user_ls + def filter_data(user,radius): trips = pipeline.read_data(uuid=user, key=esda.CONFIRMED_TRIP_KEY) non_empty_trips = [t for t in trips if t["data"]["user_input"] != {}] - valid_trips = [t for t in non_empty_trips if 'mode_confirm' in t["data"]["user_input"] and - 'purpose_confirm' in t["data"]["user_input"] and 'replaced_mode' in t["data"]["user_input"]] + non_empty_trips_df = pd.DataFrame(t["data"]["user_input"] for t in non_empty_trips) + valid_trips_df = non_empty_trips_df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False) + valid_trips_idx_ls = valid_trips_df.index.tolist() + valid_trips = [non_empty_trips[i]for i in valid_trips_idx_ls] + sim = 
similarity.similarity(valid_trips, radius) filter_trips = sim.data - return filter_trips,sim + return filter_trips,sim,trips + +def valid_user(filter_trips,trips): + valid = False + if len(filter_trips) >= 10 and len(filter_trips) / len(trips) >= 0.5: + valid = True + return valid # v_measure_bins takes 5 parameters @@ -45,10 +61,11 @@ def v_measure_bins(all_users,radius,sp2en=None,cvt_pur_mo=None,cutoff=None): v_score = [] for i in range(len(all_users)): user = all_users[i] - filter_trips,sim = filter_data(user,radius) + filter_trips,sim,trips = filter_data(user,radius) - # filter out users that haven't enough trips (at least 10) to analyze - if len(filter_trips) < 10: + # filter out users that haven't enough trips (at least 10 valid trips + # and 50% of total trips are valid) to analyze + if not valid_user(filter_trips,trips): homo_score.append(NaN) comp_score.append(NaN) v_score.append(NaN) @@ -67,11 +84,6 @@ def v_measure_bins(all_users,radius,sp2en=None,cvt_pur_mo=None,cutoff=None): bin_trips = sim.newdata bins = sim.bins - if len(bin_trips) < 10: - homo_score.append(NaN) - comp_score.append(NaN) - v_score.append(NaN) - continue bin_trips_df = pd.DataFrame(data=[trip["data"]["user_input"] for trip in bin_trips]) if sp2en: @@ -111,7 +123,7 @@ def v_measure_bins(all_users,radius,sp2en=None,cvt_pur_mo=None,cutoff=None): for index in bin: bin_ls.append(index) bins_ts = pd.DataFrame(data=[filter_trips[i]["data"]["start_ts"] for i in bin_ls]) - # compare two data frames, return nothing if two data frames are the same + # compare two data frames, the program will continue to score calculation if two data frames are the same assert_frame_equal(bins_ts, bin_trips_ts) homo = metrics.homogeneity_score(labels_true, labels_pred) homo_score.append(float('%.3f' % homo)) @@ -134,10 +146,10 @@ def v_measure_clusters(all_users,radius,sp2en=None,cvt_pur_mo=None): v_score = [] for i in range(len(all_users)): user = all_users[i] - filter_trips,sim = filter_data(user,radius) + 
filter_trips,sim,trips = filter_data(user,radius) - # filter out users that haven't enough trips (at least 10) to analyze - if len(filter_trips) < 10: + # filter out users that haven't enough trips to analyze + if not valid_user(filter_trips,trips): homo_score.append(NaN) comp_score.append(NaN) v_score.append(NaN) @@ -147,12 +159,6 @@ def v_measure_clusters(all_users,radius,sp2en=None,cvt_pur_mo=None): bin_trips = sim.newdata bins = sim.bins - if len(bin_trips) < 10: - homo_score.append(NaN) - comp_score.append(NaN) - v_score.append(NaN) - continue - # clustering the data only based on sil score (min_cluster = 0) instead of bins number (len(bins)) feat = featurization.featurization(bin_trips) min = 0 @@ -186,7 +192,8 @@ def v_measure_clusters(all_users,radius,sp2en=None,cvt_pur_mo=None): labels_true.append(no_dup_list.index(trip)) labels_pred = feat.labels - # compare the points in cluster_trips and those in feat.points, return nothing if two data frames are the same + # compare the points in cluster_trips and those in feat.points, the program will continue to score calculation + # if the frames are the same cluster_ps = [] for trip in cluster_trips: cluster_ps.append([trip["data"]["start_loc"]["coordinates"][0], diff --git a/tour_model_eval/query_times_all_users.ipynb b/tour_model_eval/query_times_all_users.ipynb index 20164ba..a15e626 100644 --- a/tour_model_eval/query_times_all_users.ipynb +++ b/tour_model_eval/query_times_all_users.ipynb @@ -2,21 +2,12 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, - "id": "pending-avatar", + "execution_count": null, + "id": "inner-sacramento", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "storage not configured, falling back to sample, default configuration\n", - "Connecting to database URL localhost\n" - ] - } - ], + "outputs": [], "source": [ "import logging\n", "\n", @@ -33,13 +24,15 @@ "from sklearn import metrics\n", "from pandas.testing 
import assert_frame_equal\n", "import matplotlib.pyplot as plt\n", - "from matplotlib.ticker import MaxNLocator" + "from matplotlib.ticker import MaxNLocator\n", + "import numpy as np\n", + "# import matplotlib.ticker as ticker" ] }, { "cell_type": "code", - "execution_count": 2, - "id": "handmade-burning", + "execution_count": null, + "id": "subjective-cruise", "metadata": {}, "outputs": [], "source": [ @@ -49,8 +42,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "outstanding-representation", + "execution_count": null, + "id": "posted-broadcasting", "metadata": {}, "outputs": [], "source": [ @@ -59,8 +52,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "needed-backing", + "execution_count": null, + "id": "lesbian-employee", "metadata": {}, "outputs": [], "source": [ @@ -69,8 +62,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "pacific-ranking", + "execution_count": null, + "id": "wound-thomson", "metadata": {}, "outputs": [], "source": [ @@ -79,8 +72,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "minute-anniversary", + "execution_count": null, + "id": "destroyed-attention", "metadata": {}, "outputs": [], "source": [ @@ -94,8 +87,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "chief-renewal", + "execution_count": null, + "id": "breeding-cream", "metadata": {}, "outputs": [], "source": [ @@ -109,43 +102,34 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "iraqi-festival", + "execution_count": null, + "id": "formed-drive", "metadata": {}, "outputs": [], "source": [ - "def plot_day(query_day_ls): \n", - " query_day_ls_df = pd.DataFrame(data = query_day_ls)\n", - " query_day_df=query_day_ls_df.value_counts(sort = False).rename_axis('query times').to_frame('frequecy').reset_index()\n", - " query_day_df.set_index(['query times'], inplace=True)\n", - " query_day=query_day_df.plot(kind = 'bar',title='query times in a day')\n", - " 
query_day.yaxis.set_major_locator(MaxNLocator(integer=True))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "stopped-tractor", - "metadata": {}, - "outputs": [], - "source": [ - "for i in range(len(all_users)):\n", - " user = all_users[i]\n", - " filter_trips,sim = evaluation.filter_data(user,radius)\n", + "# build a base dataframe for query times in January \n", + "date_df = pd.DataFrame(data = {'date':np.arange(1,32),'drop_col':np.arange(1,32)})\n", + "date_df.set_index(['date'], inplace=True)\n", + "\n", + "\n", + "for a in range(len(all_users)):\n", + " user = all_users[a]\n", + " filter_trips,sim,trips = evaluation.filter_data(user,radius)\n", " logging.debug(\"len(filter_trips)is %s \"% len(filter_trips))\n", "\n", - " # filter out users that don't have valid labeled trips\n", - " if len(filter_trips) == 0:\n", + " # filter out users that don't have enough valid labeled trips\n", + " if not evaluation.valid_user(filter_trips,trips):\n", " query_day.append(NaN)\n", - " query_month.append(NaN)\n", + " query_month.append(NaN) \n", " continue\n", " \n", " sim.bin_data()\n", " sim.delete_bins()\n", " bins = sim.bins\n", " \n", - " # collect query trips indices above cutoff\n", + " # collect query trips and common trips(no need to query) indices above cutoff\n", " ab_trip_ls = []\n", + " no_query_trip_ls = []\n", " for bin in bins:\n", " early_trip = filter_trips[bin[0]]\n", " trip_index = 0\n", @@ -161,6 +145,10 @@ " early_trip = compare_trip\n", " trip_index = i\n", " ab_trip_ls.append(bin[trip_index])\n", + " \n", + " for k in range(len(bin)):\n", + " if k != trip_index:\n", + " no_query_trip_ls.append(bin[k])\n", "\n", "\n", " \n", @@ -174,7 +162,8 @@ " bl_trip_ls.append(index)\n", " \n", " # whole list of query trips indices\n", - " query_trips_ls=ab_trip_ls+bl_trip_ls \n", + " query_trips_ls=ab_trip_ls+bl_trip_ls\n", + " \n", " \n", " # collect query times in a day\n", " bin_day = []\n", @@ -190,11 +179,23 @@ " query_day_ls = []\n", " 
for bin in bin_day:\n", " query_day_ls.append(len(bin))\n", + " \n", + " # collect 0 query days \n", + " for trip_index in no_query_trip_ls:\n", + " trip = filter_trips[trip_index]\n", + " match = False\n", + " for bin in bin_day:\n", + " if match_day(trip,bin):\n", + " match = True\n", + " break\n", + " if not match:\n", + " query_day_ls.append(0)\n", + " \n", "\n", " # collect query times in a day for every user\n", " query_day.append(query_day_ls)\n", " \n", - " \n", + " \n", " # collect query times in a month\n", " bin_month = []\n", " for trip_index in query_trips_ls:\n", @@ -211,652 +212,120 @@ " query_month_ls.append(len(bin))\n", "\n", " # collect query times in a month for every user\n", - " query_month.append(query_month_ls)\n" + " query_month.append(query_month_ls)\n", + " \n", + " # select the trips that are in Jan 2021\n", + " jan_trips = []\n", + " for trip_index in query_trips_ls:\n", + " if filter_trips[trip_index]['data']['start_local_dt']['year']==2021 and filter_trips[trip_index]['data']['start_local_dt']['month']==1:\n", + " jan_trips.append(trip_index)\n", + " \n", + " # create the data frame for query times in Jan 2021\n", + " date = []\n", + " for trip_index in jan_trips:\n", + " trip_date = filter_trips[trip_index]['data']['start_local_dt']['day']\n", + " date.append(trip_date)\n", + " new_date_df = pd.DataFrame(data = date)\n", + " new_date_df=new_date_df.value_counts(sort = False).rename_axis('date').to_frame('user'+str(a+1)).reset_index()\n", + " new_date_df.set_index(['date'], inplace=True)\n", + " date_df = date_df.join(new_date_df,how='outer')\n", + "date_df=date_df.drop(columns=['drop_col'])" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "victorian-rating", + "execution_count": null, + "id": "functional-proposal", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[5,\n", - " 6,\n", - " 6,\n", - " 9,\n", - " 8,\n", - " 1,\n", - " 1,\n", - " 1,\n", - " 4,\n", - " 1,\n", - " 1,\n", - " 2,\n", - " 2,\n", 
- " 1,\n", - " 3,\n", - " 2,\n", - " 2,\n", - " 2,\n", - " 2,\n", - " 1,\n", - " 2,\n", - " 2,\n", - " 2,\n", - " 2,\n", - " 3,\n", - " 2,\n", - " 1,\n", - " 2,\n", - " 2,\n", - " 2,\n", - " 1,\n", - " 1,\n", - " 1,\n", - " 3,\n", - " 2,\n", - " 2,\n", - " 2,\n", - " 1,\n", - " 1,\n", - " 5,\n", - " 4,\n", - " 3,\n", - " 2,\n", - " 4,\n", - " 2,\n", - " 4,\n", - " 2,\n", - " 3,\n", - " 1],\n", - " nan,\n", - " [2,\n", - " 4,\n", - " 3,\n", - " 4,\n", - " 2,\n", - " 4,\n", - " 7,\n", - " 2,\n", - " 5,\n", - " 6,\n", - " 3,\n", - " 3,\n", - " 9,\n", - " 1,\n", - " 2,\n", - " 7,\n", - " 2,\n", - " 5,\n", - " 2,\n", - " 6,\n", - " 1,\n", - " 1,\n", - " 2,\n", - " 1,\n", - " 1,\n", - " 6,\n", - " 2,\n", - " 3,\n", - " 6,\n", - " 2,\n", - " 1,\n", - " 2,\n", - " 5,\n", - " 1,\n", - " 3,\n", - " 3,\n", - " 9,\n", - " 4,\n", - " 6,\n", - " 4,\n", - " 5,\n", - " 2,\n", - " 3,\n", - " 4,\n", - " 1,\n", - " 1,\n", - " 3,\n", - " 1,\n", - " 1],\n", - " [3,\n", - " 2,\n", - " 1,\n", - " 1,\n", - " 3,\n", - " 3,\n", - " 3,\n", - " 3,\n", - " 1,\n", - " 4,\n", - " 2,\n", - " 4,\n", - " 2,\n", - " 2,\n", - " 4,\n", - " 1,\n", - " 1,\n", - " 2,\n", - " 2,\n", - " 4,\n", - " 2,\n", - " 2,\n", - " 3,\n", - " 2,\n", - " 1,\n", - " 2,\n", - " 4,\n", - " 4,\n", - " 1,\n", - " 1,\n", - " 1,\n", - " 2,\n", - " 1,\n", - " 2,\n", - " 2,\n", - " 2,\n", - " 2,\n", - " 1,\n", - " 1,\n", - " 2,\n", - " 1,\n", - " 1,\n", - " 2,\n", - " 3,\n", - " 2],\n", - " [2, 1, 2, 2],\n", - " [2,\n", - " 3,\n", - " 3,\n", - " 1,\n", - " 3,\n", - " 3,\n", - " 2,\n", - " 2,\n", - " 2,\n", - " 3,\n", - " 2,\n", - " 2,\n", - " 2,\n", - " 3,\n", - " 2,\n", - " 1,\n", - " 3,\n", - " 1,\n", - " 5,\n", - " 2,\n", - " 1,\n", - " 2,\n", - " 2,\n", - " 3,\n", - " 2,\n", - " 1,\n", - " 5,\n", - " 3,\n", - " 4,\n", - " 1,\n", - " 2,\n", - " 2,\n", - " 5,\n", - " 2,\n", - " 4,\n", - " 2,\n", - " 2,\n", - " 3,\n", - " 1,\n", - " 3,\n", - " 2,\n", - " 1,\n", - " 1,\n", - " 1],\n", - " [2, 3, 3, 1, 2, 2, 4, 2, 2, 4, 2, 2, 2, 
1, 2, 1, 1, 2, 2, 3, 1, 3, 2, 1],\n", - " [3,\n", - " 8,\n", - " 8,\n", - " 9,\n", - " 1,\n", - " 7,\n", - " 2,\n", - " 8,\n", - " 12,\n", - " 8,\n", - " 11,\n", - " 4,\n", - " 6,\n", - " 9,\n", - " 4,\n", - " 6,\n", - " 4,\n", - " 5,\n", - " 4,\n", - " 5,\n", - " 5,\n", - " 8,\n", - " 11,\n", - " 4,\n", - " 9,\n", - " 9,\n", - " 6,\n", - " 3,\n", - " 1,\n", - " 3,\n", - " 1,\n", - " 12,\n", - " 4,\n", - " 2,\n", - " 9,\n", - " 6,\n", - " 3,\n", - " 15,\n", - " 1,\n", - " 2,\n", - " 4,\n", - " 6,\n", - " 3,\n", - " 6,\n", - " 1,\n", - " 2,\n", - " 1,\n", - " 3,\n", - " 4,\n", - " 2,\n", - " 4,\n", - " 5,\n", - " 2,\n", - " 1,\n", - " 1],\n", - " [2,\n", - " 1,\n", - " 5,\n", - " 2,\n", - " 2,\n", - " 4,\n", - " 2,\n", - " 5,\n", - " 3,\n", - " 5,\n", - " 8,\n", - " 6,\n", - " 5,\n", - " 7,\n", - " 1,\n", - " 2,\n", - " 4,\n", - " 2,\n", - " 1,\n", - " 1,\n", - " 1,\n", - " 4,\n", - " 3,\n", - " 4,\n", - " 4,\n", - " 8,\n", - " 5,\n", - " 8,\n", - " 3,\n", - " 2,\n", - " 4,\n", - " 2,\n", - " 4,\n", - " 3,\n", - " 3,\n", - " 6,\n", - " 4,\n", - " 8,\n", - " 1,\n", - " 1,\n", - " 1,\n", - " 1,\n", - " 1,\n", - " 1,\n", - " 3,\n", - " 3,\n", - " 3,\n", - " 4,\n", - " 16,\n", - " 5,\n", - " 5,\n", - " 4,\n", - " 5],\n", - " [10,\n", - " 4,\n", - " 3,\n", - " 14,\n", - " 3,\n", - " 1,\n", - " 8,\n", - " 5,\n", - " 2,\n", - " 5,\n", - " 5,\n", - " 4,\n", - " 3,\n", - " 4,\n", - " 4,\n", - " 4,\n", - " 1,\n", - " 4,\n", - " 8,\n", - " 2,\n", - " 2,\n", - " 2,\n", - " 2,\n", - " 2,\n", - " 2,\n", - " 3,\n", - " 5,\n", - " 2,\n", - " 7,\n", - " 6,\n", - " 2,\n", - " 4,\n", - " 4,\n", - " 4,\n", - " 1,\n", - " 2,\n", - " 1],\n", - " [3, 2, 3, 1, 2, 1, 6],\n", - " nan,\n", - " [10,\n", - " 5,\n", - " 8,\n", - " 6,\n", - " 3,\n", - " 5,\n", - " 4,\n", - " 9,\n", - " 7,\n", - " 6,\n", - " 7,\n", - " 2,\n", - " 6,\n", - " 5,\n", - " 7,\n", - " 3,\n", - " 3,\n", - " 3,\n", - " 4,\n", - " 10,\n", - " 7,\n", - " 2,\n", - " 4,\n", - " 4,\n", - " 8,\n", - " 10,\n", - " 7,\n", - " 
6,\n", - " 7,\n", - " 9,\n", - " 6,\n", - " 3,\n", - " 5,\n", - " 7,\n", - " 3,\n", - " 5,\n", - " 6,\n", - " 13,\n", - " 4,\n", - " 5,\n", - " 6,\n", - " 6,\n", - " 8,\n", - " 7,\n", - " 4,\n", - " 4,\n", - " 5,\n", - " 2,\n", - " 6,\n", - " 3,\n", - " 8,\n", - " 15,\n", - " 10,\n", - " 7,\n", - " 4,\n", - " 6,\n", - " 5,\n", - " 3,\n", - " 5,\n", - " 7,\n", - " 5,\n", - " 8,\n", - " 2,\n", - " 10,\n", - " 5,\n", - " 8,\n", - " 4,\n", - " 4,\n", - " 4,\n", - " 1,\n", - " 2,\n", - " 2,\n", - " 5,\n", - " 4,\n", - " 9,\n", - " 5,\n", - " 5,\n", - " 11,\n", - " 4,\n", - " 5,\n", - " 3,\n", - " 5,\n", - " 4,\n", - " 5,\n", - " 8,\n", - " 3,\n", - " 1]]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "query_day" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "victorian-venue", + "execution_count": null, + "id": "logical-adolescent", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[26, 61, 18, 7, 12],\n", - " nan,\n", - " [36, 52, 75],\n", - " [42, 28, 25],\n", - " [2, 3, 2],\n", - " [14, 38, 50],\n", - " [18, 7, 24, 1],\n", - " [131, 148, 4],\n", - " [71, 91, 28, 8],\n", - " [48, 50, 47],\n", - " [16, 2],\n", - " nan,\n", - " [153, 182, 152]]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "query_month" ] }, { "cell_type": "markdown", - "id": "seven-istanbul", + "id": "ruled-dictionary", "metadata": {}, "source": [ "### Plot query times in a day" ] }, { - "cell_type": "markdown", - "id": "supreme-search", + "cell_type": "code", + "execution_count": null, + "id": "biological-middle", "metadata": {}, + "outputs": [], "source": [ - "Note: frequecy represents the number of days have specific query times. 
Each graph represents query times for a user" + "# show query times in a data frame\n", + "base = 0\n", + "for i in range (len(query_day)):\n", + " if query_day[i] is not NaN:\n", + " query_day_ls_df = pd.DataFrame(data = query_day[i])\n", + " query_day_df=query_day_ls_df.value_counts(sort = False).rename_axis('query times').to_frame('user'+str(i+1)).reset_index()\n", + " query_day_df.set_index(['query times'], inplace=True)\n", + " base = i\n", + " break\n", + " \n", + "for i in range (base+1,len(query_day)):\n", + " if query_day[i] is not NaN:\n", + " new_day_df = pd.DataFrame(data = query_day[i])\n", + " new_day_df = new_day_df.value_counts(sort = False).rename_axis('query times').to_frame('user'+str(i+1)).reset_index()\n", + " new_day_df.set_index(['query times'], inplace=True)\n", + " query_day_df=query_day_df.join(new_day_df,how='outer',sort='query times')\n", + " \n", + "query_day_df" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "corresponding-graphics", + "execution_count": null, + "id": "revised-venice", + "metadata": {}, + "outputs": [], + "source": [ + "# plot a graph for all valid users\n", + "yticks_max = query_day_df.max().max()\n", + "graph_day = query_day_df.plot(kind='bar',figsize=(14,16),title='query times in a day',fontsize=18,yticks=np.arange(0, yticks_max+4, step=4))\n", + "graph_day.title.set_size(20)\n", + "plt.xlabel('query times',fontsize=16)\n", + "plt.ylabel('days', fontsize=16)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "requested-canyon", "metadata": { "scrolled": false }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAETCAYAAAAf9UzqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAATkElEQVR4nO3df5TVdZ3H8dcLGBtWdLCBNEAYjj9SQHH4IdtuKkkHKI2WDNuO5Zqwo+spzf2Vm5s/tnZPrewWZEZuulhhJGpRdlDXFQ8qVjD8UH5oKbA6qQgTBligOO/94/ud6TrOOHdw7twPM8/HOffw/TWf7/t+mXl9v/fz/XEdEQIApKtPuQsAALw1ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQ4Jtr9g+ztlrmGZ7b8qw3on227o7vUiHeY6aqTG9mRJ34+IYeWuJQVsD3BEjZKz3a/cNQCHMoK6F7Nda3uN7T22f2h7se0v5/Musv1Iq+XD9vH58Dtsz7X9rO3tthfY7p/Pm2y7wfbnbb8o6b9tb7D94YK2KmzvtH1aq3UcLmmZpCG29+avIbavs/39fJmavJZP237O9i7bl9qeaPtx2y/bvrFVuxfb3pwve5/tEfl02/6a7Zds/y7/+THtbK+HbM8p3D75Nthle6vtD77Ftr7K9jP5tt5ke+ZbLNvf9sK83U2SJhbTVv5/8lvbpxQs+y7bf7A9uL31IX0EdS9l+zBJP5b0PUnvlLRE0nmdaOKrkk6UdJqk4yUNlXRNwfxj8nZHSKqT9F1JnyyY/yFJL0TEusJGI+IVSR+U9HxEDMhfz7dTwyRJJ0j6uKSvS7pa0gckjZZ0vu2z8vf6F5K+IOmjkgZLeljSD/I2pko6M38vA/O2GovcBpMkPSVpkKR/l3SLbbez7DOSzpBUJel6Sd+3/e52lr1W0nH5a5qk1v3ibbYVEfslLdYbt/MnJD0QETuKfE9IUUTw6oUvZeH0vPLzFPm0lZK+nA9fJOmRVj8TykLZkl6RdFzBvPdK2poPT5b0qqTKgvlDJO2RdGQ+fqekf2yntsmSGlpNu05ZP60k1eS1DC2Y3yjp4wXjd0n6XD68TNLsgnl9JP1e2U7kbEm/kvSnkvp0sM0ekjSnYPs8XTDvT/Kajily+6+T9JF25m2RNL1gvK719mivLWU7j+ea34uk1ZLOL/fvG6+39+KIuvcaIuk3kf815/6vyJ8drCyY6vNuhpcl3ZtPb7YjIvY1j0R2VPyopPNsD1R21Lzo7bwBSdsLhv/QxviAfHiEpHkFtf5W2c5maEQ8KOlGSd+UtN32zbaPLHL9LzYPRMTv88EBbS1o+0Lb6wpqGKPsSLwtQ5SFbbM3/L+8VVsR8QtlO9GzbJ+kbMf6kyLfDxJFUPdeL0ga2uqj+vCC4VeUhbEkyfYxBfN2KgvC0RExMH9VRURhSLV1OdFtyj6Wz5L0WET8pp3auvpSpOckXVJQ68CI6B8RKyUpIuZHxHhlXSYnSvqHrlx53h/+X5I+I6k6IgZK2qBsZ9GWFyQdWzDe8v9SZFvN2/lTku4s3GHi0ERQ916PSTog6XLb/Wx/VNLpBfPXSxpt+zTblcq6HiRJEdGkLCy+ZvtdkmR7qO1pHazzx5LGSbpCWZ91e7ZLqrZd1cn31J4Fkv7J9mhJsl1le1Y+PNH2JNsVynZO+yS93kXrbXa4sp3Pjnydn1Z2FNyeO/J6j7I9TNJnO9nW9yTNVBbWb7WdcYggqHupiHhV2cm1iyTtUnYS7e6C+b+S9C+SHpD0a0mPtGri85KelvRz27vz5d7TwTr/oKzveGThutpY7kllJ/u25B/vh3TmvbXR3o+Unfxcn
Ne6QVnXiyQdqWyns0tZF0OjpLlvZ31trH+TpP9QtnPcLukUZd1A7bk+r2WrpPuVBW/RbUVEg6Q1ygL94a56HygfbnhBC9sLlZ20+ucSruMaSSdGxCc7XBgHzfatyq6cKdn/JboPNyKg29h+p6TZyvpOUSK2a5R9WqotbyXoKnR9oFvY/mtlJ/WWRcSKctfTU9n+krKunRsiYmu560HXoOsDABLHETUAJI6gBoDEleRk4qBBg6KmpqYUTQNAj1RfX78zItp8eFZJgrqmpkarV68uRdMA0CPZbvcRDnR9AEDiCGoASBxBDQCJ67Y7E1977TU1NDRo3z4e5NUZlZWVGjZsmCoqKspdCoAy6bagbmho0BFHHKGamhq1/yUYKBQRamxsVENDg0aOHFnucgCUSYddH7Yrbf/S9nrbG21ffzAr2rdvn6qrqwnpTrCt6upqPoUAvVwxR9T7JZ0dEXvzZ/Y+YntZRPy8sysjpDuPbQagwyPqyOzNRyvy1yH5gJD58+fr5JNP1gUXXFDuUgCgaEX1UdvuK6le2fevfTP/XrbWy9Qp+xJODR8+vPXsN6m56medKrQj275yTofL3HTTTVq2bNkb+nsPHDigfv142ivS0tV/H71dMfmQsqIuz4uI1yPiNEnDJJ1u+01fIxQRN0fEhIiYMHhwm3dBltWll16qLVu2aMaMGaqqqlJdXZ2mTp2qCy+8UDt27NB5552niRMnauLEiXr00ewLMxobGzV16lTV1tbqkksu0YgRI7Rz505t27ZNY8b8cRPMnTtX1113nSTpmWee0fTp0zV+/HidccYZevLJJyVJ27dv18yZMzV27FiNHTtWK1eu1Be/+EXNmzevpZ2rr75a8+fP776NAuCQ0KnrqCPiZUkPSZpekmpKaMGCBRoyZIiWL1+uK6+8UvX19Vq6dKluv/12XXHFFbryyiu1atUq3XXXXZozZ44k6frrr9f73vc+rV27VjNmzNCzzz7b4Xrq6ur0jW98Q/X19Zo7d64uu+wySdLll1+us846S+vXr9eaNWs0evRozZ49W7fddpskqampSYsXL6ZbBsCbdPiZ3/ZgSa9FxMu2+0v6gLLvnzukzZgxQ/3795ckPfDAA9q0aVPLvN27d2vPnj1asWKF7r47+2q/c845R0cdddRbtrl3716tXLlSs2bNapm2f/9+SdKDDz6o7343+57Rvn37qqqqSlVVVaqurtbatWu1fft21dbWqrq6ukvfJ4BDXzGds++WdFveT91H0h0RcU9pyyq9ww8/vGW4qalJjz32WEtwF2rrqot+/fqpqampZbz58rmmpiYNHDhQ69atK7qOOXPmaOHChXrxxRd18cUXd+YtAOglirnq4/GIqI2IUyNiTET8S3cU1p2mTp2qG2+8sWW8OWjPPPNMLVq0SJK0bNky7dq1S5J09NFH66WXXlJjY6P279+ve+7J9ltHHnmkRo4cqSVLlkjKblhZv369JGnKlCn61re+JUl6/fXXtXv3bknSzJkzde+992rVqlWaNm1aN7xbAIcanvWh7LK91atX69RTT9WoUaO0YMECSdK1116rFStWaNy4cbr//vtbrmapqKjQNddco0mTJuncc8/VSSed1NLWokWLdMstt2js2LEaPXq0li5dKkmaN2+eli9frlNOOUXjx4/Xxo0bJUmHHXaY3v/+9+v8889X3759u/mdAzgUlOQ7EydMmBCtn0e9efNmnXzyyV2+ru7U/JztQYMGdVmbTU1NGjdunJYsWaITTjihzWV6wrZD53B5Xtc6FC7Ps10fERPamscRdRlt2rRJxx9/vKZMmdJuSAMAd3p0wrZt27q0vVGjRmnLli1d2iaAnocjagBIXLcGdSn6w3s6thmAbgvqyspKNTY2Ejyd0Pw86srKynKXAqCMuq2PetiwYWpoaNCOHTu6a5U9QvM3vADovbotqCsqKviWEgA4CJxMBIDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjq
AEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgAS12FQ2z7W9nLbm21vtH1FdxQGAMj0K2KZA5L+LiLW2D5CUr3t/4mITSWuDQCgIo6oI+KFiFiTD++RtFnS0FIXBgDIdKqP2naNpFpJv2hjXp3t1bZX79ixo2uqAwAUH9S2B0i6S9LnImJ36/kRcXNETIiICYMHD+7KGgGgVysqqG1XKAvpRRFxd2lLAgAUKuaqD0u6RdLmiPjP0pcEAChUzBH1n0v6lKSzba/LXx8qcV0AgFyHl+dFxCOS3A21AADawJ2JAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEhch0Ft+1bbL9ne0B0FAQDeqJgj6oWSppe4DgBAOzoM6ohYIem33VALAKAN/bqqIdt1kuokafjw4V3VbMnUXPWzcpfQo2z7yjnlLgHosbrsZGJE3BwREyJiwuDBg7uqWQDo9bjqAwASR1ADQOKKuTzvB5Iek/Qe2w22Z5e+LABAsw5PJkbEJ7qjEABA2+j6AIDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0Diigpq29NtP2X7adtXlbooAMAfdRjUtvtK+qakD0oaJekTtkeVujAAQKaYI+rTJT0dEVsi4lVJiyV9pLRlAQCa9StimaGSnisYb5A0qfVCtusk1eWje20/9fbLg6RBknaWu4iO+KvlrgBlwu9n1xnR3oxigtptTIs3TYi4WdLNnSgKRbC9OiImlLsOoC38fnaPYro+GiQdWzA+TNLzpSkHANBaMUG9StIJtkfaPkzSX0r6SWnLAgA067DrIyIO2P6MpPsk9ZV0a0RsLHllaEZ3ElLG72c3cMSbupsBAAnhzkQASBxBDQCJI6gBIHEENYCi2D7J9hTbA1pNn16umnoLgvoQYfvT5a4BvZftyyUtlfRZSRtsFz5G4t/KU1XvwVUfhwjbz0bE8HLXgd7J9hOS3hsRe23XSLpT0vciYp7ttRFRW9YCe7hibiFHN7H9eHuzJB3dnbUArfSNiL2SFBHbbE+WdKftEWr7MRPoQgR1Wo6WNE3SrlbTLWll95cDtHjR9mkRsU6S8iPrcyXdKumU8pbW8xHUa
blH0oDmP4ZCth/q/nKAFhdKOlA4ISIOSLrQ9rfLU1LvQR81ACSOqz4AIHEENQAkjqBGr2R7oO3LCsaH2L6znDUB7aGPGj2K7b4R8XoRy9VIuicixpS8KOBt4ogaZWP7attP2X7A9g9s/30+/SHbE/LhQba35cN9bd9ge5Xtx21fkk+fbHu57dslPWH7S7avKFjPv+Z31hX6iqTjbK/L26yxvSFf/iLbP7b9U9tbbX/G9t/aXmv757bfmS93nO17bdfbftj2Sfn0WbY32F5ve0VptyJ6Ay7PQ1nYHq/s24Jqlf0erpFU38GPzZb0u4iYaPsdkh61fX8+73RJYyJia360fLekebb75Os5vVVbV+XLn5bXU9Nq/pi8tkpJT0v6fETU2v6askvVvq7sofmXRsSvbU+SdJOksyVdI2laRPzG9sAiNwnQLoIa5XKGpB9FxO8lyXYxX+82VdKptj+Wj1dJOkHSq5J+GRFbpZY75xpt1yq7iWhtRDR2sr7lEbFH0h7bv5P003z6E3kNAyT9maQldsuNee/I/31U0kLbdyjbYQBvC0GNcmrvBMkB/bFbrrJguiV9NiLuK1w4v535lVZtfEfSRZKOUXb3XGftLxhuKhhvUvZ300fSy81H5IUi4tL8CPscSevyO/o6u6MAWtBHjXJZIWmm7f62j5D04YJ52ySNz4c/VjD9Pkl/Y7tCkmyfaPvwdtr/kaTpkibmP9faHklHHGzxEbFb0lbbs/JabHtsPnxcRPwiIq6RtFPSsQe7HkAiqFEmEbFG0g8lrZN0l6SHC2bPVRbIKyUNKpj+HUmbJK3JT/x9W+18KoyIVyUtl3RHW1eB5Ee4j+Yn/W44yLdxgaTZttdL2iip+dGfN9h+Iq9xhaT1B9k+IInL85AI29dJ2hsRc7uovT7KTlDOiohfd0WbQLlwRI0ex/YoZVdq/C8hjZ6AI2oASBxH1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBx/w+0fWgfrPx15wAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAETCAYAAAAf9UzqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAUC0lEQVR4nO3de5RV5X3G8ecRUIjAYIBoEGGoYuSi3KVpvJDQBSQaUzSYRhPrhaJ1GW16i40JapK2SaVNYBGlNCpNQkKC0Ri1oLWQhVyCAkJE0ESB6kRBmJAAKijMr3/sPXiczDC3M5x3Zr6ftc5yn733effvvIPP2ec9++KIEAAgXceUugAAwJER1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOo0SrY/qLt75S4hkW2/6IE2x1nu+JobxfpMMdRIzW2x0n6fkT0LXUtKaA/wB41WpztjqWuAWjNCOp2zPYI2+ts77X9I9sLbH8tX3al7eU11g/bp+XTx9meYfsl2ztsz7HdJV82znaF7S/Y3i7pXtsbbX+8oK1OtnfZHl5jG8dLWiSpj+19+aOP7dtsfz9fpzyv5SrbL9vebfs622Ns/9L272zPrtHu1bY35+s+art/Pt+2v2n7Ndu/z18/tI7++rntqYX9k/fBbttbbX/0CH19s+0X877eZHvyEdbtYnte3u4mSWMa0lb+N/mt7TML1n2f7Tdt965re0gfQd1O2T5W0k8lfU/SeyUtlHRJI5r4hqTTJQ2XdJqkkyVNL1h+Ut5uf0nTJH1X0mcKln9M0qsRsb6w0Yh4XdJHJb0SEV3zxyt11DBW0kBJn5L0LUm3SPpTSUMkXWr7/Py9/pmkL0q6WFJvSU9I+mHexgRJ5+XvpUfeVmUD+2CspOcl9ZL0r5Lutu061n1R0rmSyiTdLun7tt9fx7q3Sjo1f0yUVHNcvNa2IuKApAV6dz9/WtLjEbGzge8JKYoIHu3woSycXlH+O0U+b6Wkr+XTV0paXuM1oSyULel1SacWLPugpK359DhJb0nqXLC8j6S9krrnz++T9A911DZOUkWNebcpG6eVpPK8lpMLlldK+lTB859I+ut8epGkawqWHSPpDWUfIh+R9CtJfyzpmHr67OeSphb0zwsFy96T13RSA/t/vaRP1LFsi6RJBc+n1eyPutpS9uHxcvV7kbRG0qWl/vfGo3kP9qjbrz6SfhP5/825/2vga3srC6a1+TDD7yQtzudX2xkR+6ufRLZXvELSJbZ7KNtrnt+cNyBpR8H0m7U875pP95c0s6DW3yr7sDk5IpZImi3p25J22J5ru3sDt7+9eiIi3sgnu9a2ou0rbK8vqGGosj3x2vRRFrbV3vV3OVJbEbFa2Yfo+bbPUPbB+rMGvh8kiqBuv16VdHKNr+r9CqZfVxbGkiTbJxUs26UsCIdERI/8URYRhSFV2+FE/6Xsa/kUSasi4jd11FbsQ5FelnRtQa09IqJLRKyUpIiYFRGjlA2ZnC7p74u58Xw8/D8l3SCpZ0T0kLRR2YdFbV6VdErB88N/lwa2Vd3Pn5V0X+EHJlongrr9WiXpoKQbbXe0fbGkswuWb5A0xPZw252VDT1IkiKiSllYfNP2+yTJ9sm2J9azzZ9KGinpJmVj1nXZIamn7bJGvqe6zJH0j7aHSJLtMttT8ukxtsfa7qTsw2m/pENF2m6145V9+OzMt3mVsr3guvw4r/cE230lfa6RbX1P0mRlYX2kfkYrQVC3UxHxlrIf166UtFvZj2j3Fyz/laSvSHpc0q8lLa/RxBckvSDpF7b35Ot9oJ5tvqls7HhA4bZqWe85ZT/2bcm/3vdpzHurpb0HlP34uSCvdaOyo
RdJ6q7sQ2e3siGGSkkzmrO9Wra/SdK/Kftw3CHpTGXDQHW5Pa9lq6THlAVvg9uKiApJ65QF+hPFeh8oHU54wWG25yn70epLLbiN6ZJOj4jP1Lsymsz2PcqOnGmxvyWOHk5EwFFj+72SrlE2dooWYrtc2belEaWtBMXC0AeOCtt/qexHvUURsazU9bRVtr+qbGjnjojYWup6UBwMfQBA4tijBoDEEdQAkLgW+TGxV69eUV5e3hJNA0CbtHbt2l0RUevFs1okqMvLy7VmzZqWaBoA2iTbdV7CgaEPAEgcQQ0AiSOoASBxnJkIoEnefvttVVRUaP9+Ls7XGJ07d1bfvn3VqVOnBr+GoAbQJBUVFerWrZvKy8tV941tUCgiVFlZqYqKCg0YMKDBr6t36MP2KbaX5vebe9b2Tc2qFECbsH//fvXs2ZOQbgTb6tmzZ6O/hTRkj/qgpL+NiHW2uym7q8f/5JdbBNCOEdKN15Q+q3ePOiJejYh1+fReSZuV3cgUAEpq1qxZGjRokC6//PJSl9KiGjVGnV8+cYSk1bUsm6bsJpzq169fzcXNVn7zI0Vvs9i2ff2CUpfQIK2hLyX6s9hauj+L3Q8NqffOO+/UokWL3jXee/DgQXXs2LZ+fmvw4Xm2u+qdOzvvqbk8IuZGxOiIGN27d61nQQJA0Vx33XXasmWLLrroIpWVlWnatGmaMGGCrrjiCu3cuVOXXHKJxowZozFjxmjFiuwmOJWVlZowYYJGjBiha6+9Vv3799euXbu0bds2DR36zh3NZsyYodtuu02S9OKLL2rSpEkaNWqUzj33XD333HOSpB07dmjy5MkaNmyYhg0bppUrV+rLX/6yZs6cebidW265RbNmzWr2e23Qx05+P7mfSJofEXXeQgkAjpY5c+Zo8eLFWrp0qWbPnq2HHnpIy5cvV5cuXXTZZZfp85//vM455xy99NJLmjhxojZv3qzbb79d55xzjqZPn65HHnlEc+fOrXc706ZN05w5czRw4ECtXr1a119/vZYsWaIbb7xR559/vh544AEdOnRI+/btU58+fXTxxRfrpptuUlVVlRYsWKAnn3yy2e+13qDO71J9t6TNEfHvzd4iALSAiy66SF26dJEkPf7449q06Z3jHfbs2aO9e/dq2bJluv/+bF/zggsu0AknnHDENvft26eVK1dqypQph+cdOHBAkrRkyRJ997vZvYM7dOigsrIylZWVqWfPnnr66ae1Y8cOjRgxQj179mz2e2vIHvWHlN066Rnb6/N5X4yI/2721gGgSI4//vjD01VVVVq1atXh4C5U21EXHTt2VFVV1eHn1YfPVVVVqUePHlq/fv0fvKYuU6dO1bx587R9+3ZdffXVjXkLdWrIUR/LI8IRcVZEDM8fhDSAZE2YMEGzZ88+/Lw6aM877zzNnz9fkrRo0SLt3r1bknTiiSfqtddeU2VlpQ4cOKCHH35YktS9e3cNGDBACxculJSdsLJhwwZJ0vjx43XXXXdJkg4dOqQ9e7Kf7iZPnqzFixfrqaee0sSJE4vyfrjWB4A2Z9asWVqzZo3OOussDR48WHPmzJEk3XrrrVq2bJlGjhypxx577PARap06ddL06dM1duxYXXjhhTrjjDMOtzV//nzdfffdGjZsmIYMGaIHH3xQkjRz5kwtXbpUZ555pkaNGqVnn31WknTsscfqwx/+sC699FJ16NChKO+nRe6ZOHr06Cj29ahbwyFQHE5WXPRncRW7Pzdv3qxBgwYVtc2jrfra+b169Spam1VVVRo5cqQWLlyogQMH1rpObX1ne21EjK5tffaoAaBINm3apNNOO03jx4+vM6Sbom0dFQ4AjbBt27aitjd48GBt2bKlqG1K7FEDQPIIagBN1hK/cbV1TekzghpAk3Tu3FmVlZWEdSNUX4+6c+fOjXodY9QAmqRv376qqKjQzp07S11Kq1J9h5fGIKgBNEmnTp0adZcSNB1DHwCQOIIaABJHUANA4ghqA
EgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAImrN6ht32P7Ndsbj0ZBAIB3a8ge9TxJk1q4DgBAHeoN6ohYJum3R6EWAEAtGKMGgMR1LFZDtqdJmiZJ/fr1K1azANqZ8psfKXUJ9dr29QuO6vaKtkcdEXMjYnREjO7du3exmgWAdo+hDwBIXEMOz/uhpFWSPmC7wvY1LV8WAKBavWPUEfHpo1EIAKB2DH0AQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkDiCGgASR1ADQOIIagBIHEENAIkjqAEgcQQ1ACSOoAaAxBHUAJA4ghoAEkdQA0DiCGoASBxBDQCJI6gBIHEENQAkjqAGgMQR1ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBxBDUAJI6gBoDEEdQAkLgGBbXtSbaft/2C7ZtbuigAwDvqDWrbHSR9W9JHJQ2W9Gnbg1u6MABApiF71GdLeiEitkTEW5IWSPpEy5YFAKjmiDjyCvYnJU2KiKn5889KGhsRN9RYb5qkafnTD0h6vvjlFlUvSbtKXUQbQn8WF/1ZXK2hP/tHRO/aFnRswItdy7w/SPeImCtpbiMLKxnbayJidKnraCvoz+KiP4urtfdnQ4Y+KiSdUvC8r6RXWqYcAEBNDQnqpyQNtD3A9rGS/lzSz1q2LABAtXqHPiLioO0bJD0qqYOkeyLi2RavrOW1mmGaVoL+LC76s7hadX/W+2MiAKC0ODMRABJHUANA4ghqAEgcQY0msX2G7fG2u9aYP6lUNbVmts+2PSafHmz7b2x/rNR1tTa2x9runk93sX277Ydsf8N2WanrayqCWpLtq0pdQ2ti+0ZJD0r6nKSNtgsvKfDPpamq9bJ9q6RZku6y/S+SZkvqKulm27eUtLjW5x5Jb+TTMyWVSfpGPu/eUhXVXBz1Icn2SxHRr9R1tBa2n5H0wYjYZ7tc0n2SvhcRM20/HREjSlpgK5P353BJx0naLqlvROyx3UXS6og4q6QFtiK2N0fEoHx6XUSMLFi2PiKGl666pmvIKeRtgu1f1rVI0olHs5Y2oENE7JOkiNhme5yk+2z3V+2XHMCRHYyIQ5LesP1iROyRpIh403ZViWtrbTbavioi7pW0wfboiFhj+
3RJb5e6uKZqN0GtLIwnStpdY74lrTz65bRq220Pj4j1kpTvWV+o7GvnmaUtrVV6y/Z7IuINSaOqZ+ZjqgR140yVNNP2l5RdhGmV7ZclvZwva5XazdCH7bsl3RsRy2tZ9oOIuKwEZbVKtvsq2wvcXsuyD0XEihKU1WrZPi4iDtQyv5ek90fEMyUoq1Wz3U3SHynbGa2IiB0lLqlZ2k1QA0BrxVEfAJA4ghoAEkdQo12y3cP29QXP+9i+r5Q1AXVhjBptiu0O+aFu9a1XLunhiBja4kUBzcQeNUrG9i22n7f9uO0f2v67fP7PbY/Op3vZ3pZPd7B9h+2nbP/S9rX5/HG2l9r+gaRnbH/V9k0F2/mn/GzKQl+XdKrt9Xmb5bY35utfafun+anHW23fkJ/S/bTtX9h+b77eqbYX215r+wnbZ+Tzp9jeaHuD7WUt24toD9rTcdRIiO1Ryu4WNELZv8N1ktbW87JrJP0+IsbYPk7SCtuP5cvOljQ0Irbme8v3Kzue9ph8O2fXaOvmfP3heT3lNZYPzWvrLOkFSV+IiBG2vynpCknfUnYx+usi4te2x0q6U9JHJE2XNDEifmO7RwO7BKgTQY1SOVfSA/lJHrLdkNu7TZB0lu1P5s/LJA2U9JakJyNiq3T4bMlK2yOUnej0dERUNrK+pRGxV9Je27+X9FA+/5m8hq6S/kTSQvvwyZjH5f9dIWme7R8r+8AAmoWgRinV9QPJQb0zLNe5YL4lfS4iHi1cOT+F/fUabXxH0pWSTlJ2xmRjFZ6AUlXwvErZ/zfHSPpdbdeOiIjr8j3sCyStz8/ibOwHBXAYY9QolWWSJueXouwm6eMFy7bpnVOpP1kw/1FJf2W7kyTZPt328XW0/4CkSZLG5K+raa+kbk0tPr8ex1bbU/JabHtYPn1qRKyOiOnKTmM+panbASSCGiUSEesk/UjSekk/kfREweIZygJ5paReBfO/I2mTpHX5D3//oTq+FUbEW5KWSvpxbUeB5Hu4K/If/e5o4tu4XNI1tjdIelZS9eVe77D9TF7jMkkbmtg+IInD85AI27dJ2hcRM4rU3jHKfqCcEhG/LkabQKmwR402x/ZgZUdq/C8hjbaAPWoASBx71ACQOIIaABJHUANA4ghqAEgcQQ0AiSOoASBx/w9Umq0adTwQDwAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "for i in range (len(query_day)):\n", - " if query_day[i] is not NaN:\n", - " plot_day(query_day[i])" + "# subplots\n", + "day_ax_arr = query_day_df.plot(subplots=True,layout=(2,4),kind='bar',figsize=(16,8),fontsize=15,sharey=True)\n", + "\n", + "for ax in day_ax_arr[-1]:\n", + " ax.set_xlabel('query times', fontsize=16)\n", + " \n", + "for ax_arr in day_ax_arr:\n", + " ax_arr[0].yaxis.set_major_locator(MaxNLocator(integer=True))\n", + " ax_arr[0].set_ylabel('days', fontsize=16)\n", + "plt.tight_layout()" ] }, { "cell_type": "markdown", - "id": "worst-musical", + "id": "confident-capacity", "metadata": {}, "source": [ "### Plot query times in a month" @@ -864,156 +333,107 @@ }, { "cell_type": "markdown", - "id": "adverse-bruce", + "id": "younger-destiny", "metadata": {}, "source": [ - "Note: not all users have data on the same number of months. Each graph represents query times for a user in a month." + "Note: not all users have data on the same number of months. 
" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "removed-sheffield", + "execution_count": null, + "id": "quiet-minister", "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAETCAYAAAAveV3LAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAZnklEQVR4nO3de5gV9Z3n8feHi6KCINgaLsEmIxoRpMH2MiEyEI2SVVRUNOaGUZfsY7KjeVYSormgT7LBbDYZEydeVhMwOhJxYiRxvQWXSRwz3AQRxYiYjvSA3NSIF1Dgu39UNRyabvp09+k+/YPP63n6OVW/+lXVt4vm09W/U3VKEYGZmaWnU7kLMDOzlnGAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuHZqk6yTdWeYaHpE0qZw1tDVJlZJCUpdy12LFk68Dt45C0hjgnogYUO5a9nWSaoArI+L3+Xwl8Bega0RsK19l1hw+A7c247M5s7blAN8PSRoh6RlJmyX9StIsSd/Nl10m6al6/UPS0fn0gZJ+KOlVSesk3SbpoHzZGEm1kr4u6TXgF5KWSxpfsK2ukjZKqqq3j0OAR4B+kt7Ov/pJmibpnrxP3Z/5X5S0WtIbkv6bpJMkLZP0pqRb6m33ckkr8r6PSToqb5ekH0taL+lv+fpDGzle8yRdWXh88mPwhqS/SPrUXo71VEmr8mP9gqQJe+k7TdJsSffk/Z+TdIykb+R1rpZ0ZkH/fpLmSHpd0suS/mu9bd0v6e58W89Lqs6X/RIYCPw2P85fKyjjs/m/7UZJ1zdWq3UMDvD9jKQDgN8AvwR6A7OBC5uxiZuAY4Aq4GigP/DtguUfyrd7FDAZuBv4XMHy/wKsjYilhRuNiHeATwFrIqJ7/rWmkRpOAQYDlwD/BFwPnAEcD1ws6R/y7/V84DrgAqAC+CNwX76NM4HR+ffSK9/WpiKPwSnAn4HDgR8Ad0lSI31XAacBPYEbgHsk9d3LtseT/dscBiwBHiP7f9ofuBG4vaDvfUAt0A+4CPifkk4vWH4uMCv//uYAtwBExOeBV4Hx+XH+QcE6HweOBU4Hvi3puL3UauUWEf7aj77IQmsN+fsfedvTwHfz6cuAp+qtE2RhLeAd4O8Klv098Jd8egzwPtCtYHk/YDNwaD7/APC1RmobA9TWa5tGNi4OUJnX0r9g+SbgkoL5fwWuyacfAa4oWNYJeJfsl8sngJeAU4FOTRyzeWTjxXXH5+WCZQfnNX2oyOO/FDivkWXTgCcK5scDbwOd8/ke+b56AR8GtgM9Cvp/H5hRsK3fFywbArxXMF8DnFEwX3dsBxS0LQA+Xe6fWX81/uUz8P1PP+A/I/8fmvtrketWkAXW4ny44k3g0by9zoaI2FI3E9lZ9L8DF0rqRXaWfW9rvgFgXcH0ew3Md8+njwJuLqj1dbJfQv0j4kmyM9J/BtZJukPSoUXu/7W6iYh4N5/s3lBHSV+QtLSghqFkZ+7Ffm8bI2J7wXzdvvoBr0fE5oL+fyU7U9+jTrJfXN2KeF+i/joNfl/WMTjA9z9rgf71/uQfWDD9DllIAyDpQwXLNpKFyPER0Sv/6hkRhf/JG7qsaSbZMMpE4E8R8Z+N1FbqS6JWA18qqLVXRBwUEU8DRMRPIuJEsqGXY4Appdx5Pt7+f4CvAH0iohewnOyXSGutAXpL6lHQNhBo7NjW58vP9gEO8P3Pn4BtwD9K6iLpAuDkguXPAsdLqpLUjexPcQAiYgdZIP1Y0hEAkvpLOquJff4GGAlc
TTYm3ph1QB9JPZv5PTXmNuAbko4HkNRT0sR8+iRJp0jqSvZLawvZkEQpHUIWlBvyfX6R7Ay81SJiNdnQ1/cldZN0AnAFxf91sw74SClqsfJxgO9nIuJ9sjf1LgPeIHvz7tcFy18ie7Ps98BK4Kl6m/g68DLwH5Leyvsd28Q+3yMbmx5UuK8G+r1I9sbcK/mQQ7/mfG8NbO9BsjddZ+W1LicbwgE4lOyX0RtkQw+bgB+2Zn8N7P8F4H+T/dJcBwwjG04qlUvJxq7XAA8C34mIJ4pc9/vAN/PjfG0Ja7J25Bt5DEkzyN48/GYb7uPbwDER8bkmO5tZUXyjhbU5Sb3J/rz/fLlrMduXeAjF2lR+c8lq4JGI+EO56zHbl3gIxcwsUT4DNzNLlAPczCxR7fom5uGHHx6VlZXtuUszs+QtXrx4Y0RU1G9v1wCvrKxk0aJF7blLM7PkSWrw4y48hGJmligHuJlZohzgZmaJ8p2YZrabDz74gNraWrZs2dJ0Zyupbt26MWDAALp27VpUfwe4me2mtraWHj16UFlZSeMPGrJSiwg2bdpEbW0tgwYNKmodD6GY2W62bNlCnz59HN7tTBJ9+vRp1l8+DnAz24PDuzyae9wd4GZmBd58801+9rOf7Zxfs2YNF110URkralxRY+D5swzvJHuaSACXkz2V+1dkHyhfA1wcEW+0SZXWoMqpD5e7BGqmn13uEqyNlfrnrFw/M9u3b6dz585N9qsL8KuuugqAfv368cADD7R1eS1S7Bn4zcCjEfFRYDiwApgKzI2IwcDcfN7MrNW+973vceyxx3LGGWdw6aWX8sMfZg9LGjNmzM67uTdu3EjdR3Ns376dKVOmcNJJJ3HCCSdw++23AzBv3jzGjh3LZz7zGYYNG8a3vvUtbr755p37uf766/nJT36y276nTp3KqlWrqKqqYsqUKdTU1DB0aPYkvBkzZnD++eczfvx4Bg0axC233MKPfvQjRowYwamnnsrrr78OwKpVqxg3bhwnnngip512Gi+++CIAs2fPZujQoQwfPpzRo0e3+jg1eQaeP6l7NNkjuOoeyfW+pPOAMXm3mcA8ssdtmZm12OLFi5k1axZLlixh27ZtjBw5khNPPHGv69x111307NmThQsXsnXrVkaNGsWZZ54JwIIFC1i+fDmDBg2ipqaGCy64gKuvvpodO3Ywa9YsFixYsNu2pk+fzvLly1m6dCkANTU1uy1fvnw5S5YsYcuWLRx99NHcdNNNLFmyhK9+9avcfffdXHPNNUyePJnbbruNwYMHM3/+fK666iqefPJJbrzxRh577DH69+/Pm2++2epjVcwQykfIHsr6C0nDgcVkD6c9MiLWAkTE2rqH3JqZtcYf//hHJkyYwMEHHwzAueee2+Q6jz/+OMuWLds51PG3v/2NlStXcsABB3DyySfvvCyvsrKSPn36sGTJEtatW8eIESPo06dPs+obO3YsPXr0oEePHvTs2ZPx48cDMGzYMJYtW8bbb7/N008/zcSJE3eus3XrVgBGjRrFZZddxsUXX8wFF1zQrP02pJgA70L2RPH/HhHzJd1MM4ZLJE0GJgMMHDiwRUWa2f6lsasxunTpwo4dOwB2u9wuIvjpT3/KWWedtVv/efPmccghh+zWduWVVzJjxgxee+01Lr/88mbXduCBB+6c7tSp0875Tp06sW3bNnbs2EGvXr12nsEXuu2225g/fz4PP/wwVVVVLF26tNm/QAoVMwZeS/bA2/n5/ANkgb5OUl+A/HV9QytHxB0RUR0R1RUVe3waopnZbkaPHs2DDz7Ie++9x+bNm/ntb3+7c1llZSWLFy8G2O2NxbPOOotbb72VDz74AICXXnqJd955p8HtT5gwgUcffZSFCxfuEfgAPXr0YPPmzS2u/9BDD2XQoEHMnj0byH65PPvss0A2Nn7KKadw4403cvjhh7N69eoW7weKCPCIeA1YLenYvOl04AVgDjApb5sEPNSqSszMgJEj
R3LJJZdQVVXFhRdeyGmnnbZz2bXXXsutt97Kxz72MTZu3Liz/corr2TIkCGMHDmSoUOH8qUvfYlt27Y1uP0DDjiAsWPHcvHFFzd4VUqfPn0YNWoUQ4cOZcqUKS36Hu69917uuusuhg8fzvHHH89DD2XxOGXKFIYNG8bQoUMZPXo0w4cPb9H26xT1TExJVWSXER4AvAJ8kSz87wcGAq8CEyPi9b1tp7q6Ovx54KXjywitLaxYsYLjjjuu3GXsNG3aNLp37861115bku3t2LGDkSNHMnv2bAYPHlySbZZSQ8df0uKIqK7ft6jrwCNiKbDHymRn42ZmSXjhhRc455xzmDBhQocM7+byh1mZWYc2bdq0km1ryJAhvPLKKyXbXrn5Vnozs0Q5wM1sD8W8N2al19zj7gA3s91069aNTZs2OcTbWd3ngXfr1q3odTwGbma7GTBgALW1tWzYsKHcpex36p7IUywHuJntpmvXrkU/EcbKy0MoZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mlqiinokpqQbYDGwHtkVEtaTewK+ASqAGuDgi3mibMs3MrL7mnIGPjYiqiKjO56cCcyNiMDA3nzczs3bSmiGU84CZ+fRM4PzWl2NmZsUqNsADeFzSYkmT87YjI2ItQP56RFsUaGZmDStqDBwYFRFrJB0BPCHpxWJ3kAf+ZICBAwe2oEQzM2tIUWfgEbEmf10PPAicDKyT1Bcgf13fyLp3RER1RFRXVFSUpmozM2s6wCUdIqlH3TRwJrAcmANMyrtNAh5qqyLNzGxPxQyhHAk8KKmu/79ExKOSFgL3S7oCeBWY2HZlmplZfU0GeES8AgxvoH0TcHpbFGVmZk3znZhmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZokqOsAldZa0RNLv8vnekp6QtDJ/PaztyjQzs/qacwZ+NbCiYH4qMDciBgNz83kzM2snRQW4pAHA2cCdBc3nATPz6ZnA+aUtzczM9qbYM/B/Ar4G7ChoOzIi1gLkr0eUuDYzM9uLJgNc0jnA+ohY3JIdSJosaZGkRRs2bGjJJszMrAHFnIGPAs6VVAPMAj4h6R5gnaS+APnr+oZWjog7IqI6IqorKipKVLaZmTUZ4BHxjYgYEBGVwKeBJyPic8AcYFLebRLwUJtVaWZme2jNdeDTgU9KWgl8Mp83M7N20qU5nSNiHjAvn94EnF76kszMrBi+E9PMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFFNBrikbpIWSHpW0vOSbsjbe0t6QtLK/PWwti/XzMzqFHMGvhX4REQMB6qAcZJOBaYCcyNiMDA3nzczs3bSZIBH5u18tmv+FcB5wMy8fSZwfptUaGZmDSpqDFxSZ0lLgfXAExExHzgyItYC5K9HtF2ZZmZWX1EBHhHbI6IKGACcLGlosTuQNFnSIkmLNmzY0NI6zcysnmZdhRIRbwLzgHHAOkl9AfLX9Y2sc0dEVEdEdUVFRSvLNTOzOsVchVIhqVc+fRBwBvAiMAeYlHebBDzUVkWamdmeuhTRpy8wU1JnssC/PyJ+J+lPwP2SrgBeBSa2YZ1mZlZPkwEeEcuAEQ20bwJOb4uizMysab4T08wsUQ5wM7NE
OcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS1Qxd2J2KJVTHy53CdRMP7vcJZiZ+QzczCxVDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLVJMBLunDkv6fpBWSnpd0dd7eW9ITklbmr4e1fblmZlanmDPwbcD/iIjjgFOBL0saAkwF5kbEYGBuPm9mZu2kyQCPiLUR8Uw+vRlYAfQHzgNm5t1mAue3VZFmZranZo2BS6oERgDzgSMjYi1kIQ8cUerizMyscUUHuKTuwL8C10TEW81Yb7KkRZIWbdiwoSU1mplZA4oKcEldycL73oj4dd68TlLffHlfYH1D60bEHRFRHRHVFRUVpajZzMwo7ioUAXcBKyLiRwWL5gCT8ulJwEOlL8/MzBrTpYg+o4DPA89JWpq3XQdMB+6XdAXwKjCxbUo0M7OGNBngEfEUoEYWn17acszMrFi+E9PMLFHFDKGYdXiVUx8udwnUTD+73CXYfsZn4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaL8UGMz22ft6w+79hm4mVmiHOBmZolqMsAl/VzSeknLC9p6S3pC0sr89bC2LdPMzOor5gx8BjCuXttUYG5EDAbm5vNmZtaOmgzwiPgD8Hq95vOAmfn0TOD8EtdlZmZNaOkY+JERsRYgfz2isY6SJktaJGnRhg0bWrg7MzOrr83fxIyIOyKiOiKqKyoq2np3Zmb7jZYG+DpJfQHy1/WlK8nMzIrR0gCfA0zKpycBD5WmHDMzK1YxlxHeB/wJOFZSraQrgOnAJyWtBD6Zz5uZWTtq8lb6iLi0kUWnl7gWMzNrBt+JaWaWKAe4mVmiHOBmZolygJuZJcoBbmaWKD/QwWwfs68/xMB28Rm4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJapVAS5pnKQ/S3pZ0tRSFWVmZk1rcYBL6gz8M/ApYAhwqaQhpSrMzMz2rjVn4CcDL0fEKxHxPjALOK80ZZmZWVMUES1bUboIGBcRV+bznwdOiYiv1Os3GZiczx4L/Lnl5ZbE4cDGMtfQUfhY7OJjsYuPxS4d5VgcFREV9Ru7tGKDaqBtj98GEXEHcEcr9lNSkhZFRHW56+gIfCx28bHYxcdil45+LFozhFILfLhgfgCwpnXlmJlZsVoT4AuBwZIGSToA+DQwpzRlmZlZU1o8hBIR2yR9BXgM6Az8PCKeL1llbafDDOd0AD4Wu/hY7OJjsUuHPhYtfhPTzMzKy3dimpklygFuZpYoB7iZWaJacx14EiR9lOwO0f5k16mvAeZExIqyFmZllf9c9AfmR8TbBe3jIuLR8lXW/iSdDERELMw/DmMc8GJE/N8yl1Z2ku6OiC+Uu47G7NNvYkr6OnAp2W3+tXnzALJLHmdFxPRy1dbRSPpiRPyi3HW0B0n/CHwZWAFUAVdHxEP5smciYmQ562tPkr5D9nlGXYAngFOAecAZwGMR8b3yVde+JNW/DFrAWOBJgIg4t92LasK+HuAvAcdHxAf12g8Ano+IweWprOOR9GpEDCx3He1B0nPA30fE25IqgQeAX0bEzZKWRMSIshbYjvJjUQUcCLwGDIiItyQdRPbXyQllLbAdSXoGeAG4k+yvdQH3kZ3wERH/Vr7qGravD6HsAPoBf63X3jdftl+RtKyxRcCR7VlLmXWuGzaJiBpJY4AHJB1Fwx8RsS/bFhHbgXclrYqItwAi4j1J+9v/
kWrgauB6YEpELJX0XkcM7jr7eoBfA8yVtBJYnbcNBI4GvtLoWvuuI4GzgDfqtQt4uv3LKZvXJFVFxFKA/Ez8HODnwLDyltbu3pd0cES8C5xY1yipJ/vZSU5E7AB+LGl2/rqODp6RHbq41oqIRyUdQ/bRt/3JgqoWWJifdexvfgd0rwuuQpLmtX85ZfMFYFthQ0RsA74g6fbylFQ2oyNiK+wMsDpdgUnlKam8IqIWmCjpbOCtctezN/v0GLiZ2b7M14GbmSXKAW5mligHuO3zJPWSdFXB/BhJvytivRslndHMfdVIOrwldZo1lwPc9ge9gKua7FVPRHw7In7fBvWYlYQD3DoMSZWSXpR0p6Tlku6VdIakf5e0Mr/lG0m9Jf1G0jJJ/yHphLx9mqSfS5on6ZX8jkuA6cDfSVoq6X/lbd0lPZDv715Je1z/LWlG/uzXujPrGyQ9I+m5/FZ8JPWR9LikJfkVLCpY/3OSFuT7vV1SZ0kn5XV3k3SIpOclDW27o2r7Mge4dTRHAzcDJwAfBT4DfBy4Frgu73MDsCS/S/A64O6C9T9Kdq37ycB3JHUFpgKrIqIqIqbk/UaQ3ScwBPgIMKqI2jbmt9nfmtcD8B3gqfzuzTlk9xkg6TjgEmBURFQB24HPRsTCvN93gR8A90TE8iKPjdlu9unrwC1Jf4mI5wAkPQ/MjYjIb/muzPt8HLgQICKezM+Ce+bLHs6va94qaT2N32G6IL/eF0lL820/1URtv85fFwMX5NOj66Yj4mFJdTdJnU52Y8zC/OT+IGB9vuxGskcSbgHq/kowazYHuHU0WwumdxTM72DXz2tDt7vX3dBQuP52Gv8ZL7ZfQ+vU79/QzRQCZkbENxpY1hvoTnazTDfgnSL2bbYHD6FYiv4AfBayK0rIhjb2dsfcZqBHO9TyKeCwvH0ucJGkI/JlvfPPWoHsOYvfAu4Fbmqjumw/4DNwS9E04Bf5h3O9SxO3fEfEpvyN0OXAI8DDJazlBuC+/JPs/g14Nd/nC5K+CTwuqRPwAfBlSf9A9gFS/yKpM/C0pE9ExJMlrMn2E76V3swsUR5CMTNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEvX/AT2be6qWKlrcAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAETCAYAAAAveV3LAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAX6ElEQVR4nO3deZRV5Z3u8e/DFBJBkKI0DNGiGzQqSoE49CLaIMbhplEkjpkw0cu90dzWrCsJiRmQZTqYzk23iZ0oHdPg0KHFjkrCcgpebmKbZgqIKEbEVKQaZFISMEIYfvePvYsciirqVNWpOvVSz2ets84e3r337+yqes6u9+x9tiICMzNLT5dyF2BmZi3jADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3Do0SV+W9MMy1/CEpMnlrKGtSaqSFJK6lbsWK558Hrh1FJLGAg9GxOBy13Kkk1QD3BARP8/Hq4DfAt0jYm/5KrPm8BG4tRkfzZm1LQd4JyRppKRfS9oh6d8kzZV0Rz7vOknP1Wsfkobmw++R9G1Jb0jaJOkeSe/N542VVCvpi5LeBP5F0mpJEwrW1V3SVknV9bZxFPAEMFDSzvwxUNJ0SQ/mber+zf+0pPWS3pb0PyWdKWmVpO2S7q633s9IWpO3fUrSCfl0SfoHSZsl/T5ffngj+2uRpBsK90++D96W9FtJlxxmX0+TtC7f1y9LuvwwbadLmifpwbz9i5JOlPSlvM71ki4saD9Q0nxJb0l6TdJ/r7euhyXdn6/rJUmj83kPAMcDP8338xcKyvh4/rPdKum2xmq1jsEB3slI6gE8BjwA9APmAR9txiruBE4EqoGhwCDgawXz35+v9wRgCnA/8ImC+f8N2BgRKwtXGhHvAJcAGyKiV/7Y0EgNZwPDgKuBfwRuAy4ATgWukvTX+WudCHwZmARUAr8Efpyv40LgvPy19M3Xta3IfXA28BugP/At4D5JaqTtOuBcoA9wO/CgpAGHWfcEsp/NMcAK4Cmyv9NBwAzg3oK2PwZqgYHAFcDfSRpfMP9SYG7++uYDdwNExCeBN4AJ+X7+VsEyHwJOAsYDX5N08mFqtXKLCD860YMstDaQf/6RT3seuCMfvg54rt4yQRbWAt4B/rJg3l8Bv82HxwJ/AnoWzB8I7ACOzscfAb7QSG1jgdp606aT9YsDVOW1DCqYvw24umD834Fb8uEngOsL5nUB/kj25nI+8CpwDtCliX22iKy/uG7/vFYw7315Te8vcv+vBC5rZN504JmC8QnATqBrPt4731Zf4APAPqB3QftvArML1vXzgnmnAO8WjNcAFxSM1+3bwQXTlgDXlPt31o/GHz4C73wGAv8V+V9o7ndFLltJFljL8+6K7cCT+fQ6WyJiV91IZEfR/wF8VFJfsqPsh1rzAoBNBcPvNjDeKx8+AbiroNa3yN6EBkXEs2RHpP8EbJI0S9LRRW7/zbqBiPhjPtiroYaSPiVpZUENw8mO3It9bVsjYl/BeN22BgJvRcSOgva/IztSP6ROsjeunkV8LlF/mQZfl3UMDvDOZyMwqN6//McXDL9DFtIASHp/wbytZCFyakT0zR99IqLwj7yh05rmkHWjXAn8KiL+q5HaSn1K1HrgfxTU2jci3hsRzwNExHcj4gyyrpcTgaml3Hje3/7PwOeAiojoC6wmexNprQ1AP0m9C6YdDzS2b+vz6WdHAAd45/MrYC/wt5K6SZoEnFUw/wXgVEnVknqS/SsOQETsJwukf5B0LICkQZIuamKbjwGjgJvJ+sQbswmokNSnma+pMfcAX5J0KoCkPpKuzIfPlHS2pO5kb1q7yLokSukosqDckm/z02RH4
K0WEevJur6+KamnpNOB6yn+v5tNwF+UohYrHwd4JxMRfyL7UO864G2yD+9+UjD/VbIPy34OrAWeq7eKLwKvAf8p6Q95u5Oa2Oa7ZH3TQwq31UC7V8g+mHs973IY2JzX1sD6HiX70HVuXutqsi4cgKPJ3ozeJut62AZ8uzXba2D7LwP/h+xNcxNwGll3UqlcS9Z3vQF4FPh6RDxT5LLfBL6S7+dbS1iTtSNfyGNImk324eFX2nAbXwNOjIhPNNnYzIriCy2szUnqR/bv/SfLXYvZkcRdKNam8otL1gNPRMQvyl2P2ZHEXShmZonyEbiZWaIc4GZmiWrXDzH79+8fVVVV7blJM7PkLV++fGtEVNaf3q4BXlVVxbJly9pzk2ZmyZPU4NdduAvFzCxRDnAzs0Q5wM3MElX2KzH37NlDbW0tu3btarqxlVTPnj0ZPHgw3bt3L3cpZtYCZQ/w2tpaevfuTVVVFY3f1MRKLSLYtm0btbW1DBkypNzlmFkLFNWFIqkmvz/fSknL8mn9JD0jaW3+fExLCti1axcVFRUO73YmiYqKCv/nY5aw5vSBj4uI6ogYnY9PAxZGxDBgYT7eIg7v8vB+N0tbaz7EvIzsTivkzxNbX07ns337dr7//e8fGN+wYQNXXHFFGSsys1QU2wcewNOSArg3ImYBx0XERoCI2Fh3h5b6JE0huzs5xx9/fENNDlI1bUGRJRWnZuZHSrq+Yu3bt4+uXbs22a4uwG+88UYABg4cyCOPPNLW5Zm1qVL/HXc05cqV+oo9Ah8TEaPI7mZyk6Tzit1ARMyKiNERMbqy8pArQTuEb3zjG5x00klccMEFXHvttXz729mNWcaOHXvgytGtW7dS9zUA+/btY+rUqZx55pmcfvrp3HvvvQAsWrSIcePG8bGPfYzTTjuNr371q9x1110HtnPbbbfx3e9+96BtT5s2jXXr1lFdXc3UqVOpqalh+PDsrluzZ89m4sSJTJgwgSFDhnD33Xfzne98h5EjR3LOOefw1ltvAbBu3TouvvhizjjjDM4991xeeeUVAObNm8fw4cMZMWIE551X9I/MzBJR1BF4fmdxImKzpEfJ7qG4SdKA/Oh7ALC5DetsM8uXL2fu3LmsWLGCvXv3MmrUKM4444zDLnPffffRp08fli5dyu7duxkzZgwXXnghAEuWLGH16tUMGTKEmpoaJk2axM0338z+/fuZO3cuS5YsOWhdM2fOZPXq1axcuRKAmpqag+avXr2aFStWsGvXLoYOHcqdd97JihUr+PznP8/999/PLbfcwpQpU7jnnnsYNmwYixcv5sYbb+TZZ59lxowZPPXUUwwaNIjt27eXbqeZWYfQZIBLOgroEhE78uELye6ZOB+YDMzMnx9vy0Lbyi9/+Usuv/xy3ve+7Ebsl156aZPLPP3006xatepAV8fvf/971q5dS48ePTjrrLMOnJZXVVVFRUUFK1asYNOmTYwcOZKKiopm1Tdu3Dh69+5N79696dOnDxMmTADgtNNOY9WqVezcuZPnn3+eK6+88sAyu3fvBmDMmDFcd911XHXVVUyaNKlZ2zWzjq+YI/DjgEfzMxa6Af8aEU9KWgo8LOl64A3gysOso0Nr7GyMbt26sX//foCDTreLCL73ve9x0UUH34x90aJFHHXUUQdNu+GGG5g9ezZvvvkmn/nMZ5pd23ve854Dw126dDkw3qVLF/bu3cv+/fvp27fvgSP4Qvfccw+LFy9mwYIFVFdXs3Llyma/gZhZx9VkH3hEvB4RI/LHqRHxjXz6togYHxHD8ue32r7c0jvvvPN49NFHeffdd9mxYwc//elPD8yrqqpi+fLlAAd9sHjRRRfxgx/8gD179gDw6quv8s477zS4/ssvv5wnn3ySpUuXHhL4AL1792bHjh0trv/oo49myJAhzJs3D8jeXF544QUg6xs/++yzmTFjBv3792f9+vUt3o6ZdTxlvxKz3EaNGsXVV19NdXU1J5xwAueee
+6BebfeeitXXXUVDzzwAOeff/6B6TfccAM1NTWMGjWKiKCyspLHHnuswfX36NGDcePG0bdv3wbPSqmoqGDMmDEMHz6cSy65hJtuuqnZr+Ghhx7is5/9LHfccQd79uzhmmuuYcSIEUydOpW1a9cSEYwfP54RI0Y0e91m1nG16z0xR48eHfW/D3zNmjWcfPLJ7VZDU6ZPn06vXr249dZbS7K+/fv3M2rUKObNm8ewYcNKss5S6mj7344MPo2wtCQtL7iI8gB/G2Ebevnllxk6dCjjx4/vkOFtZmnr9F0o9U2fPr1k6zrllFN4/fXXS7Y+M7NCPgI3M0tUhwjw9uyHtz/zfjdLW9kDvGfPnmzbts1h0s7qvg+8Z8+e5S7FzFqo7H3ggwcPpra2li1btpS7lE6n7o48Zpamsgd49+7dfUcYM7MWKHsXipmZtYwD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEFR3gkrpKWiHpZ/l4P0nPSFqbPx/TdmWamVl9zTkCvxlYUzA+DVgYEcOAhfm4mZm1k6ICXNJg4CPADwsmXwbMyYfnABNLW5qZmR1OsUfg/wh8AdhfMO24iNgIkD8fW+LazMzsMJoMcEl/A2yOiOUt2YCkKZKWSVq2ZcuWlqzCzMwaUMwR+BjgUkk1wFzgfEkPApskDQDInzc3tHBEzIqI0RExurKyskRlm5lZkwEeEV+KiMERUQVcAzwbEZ8A5gOT82aTgcfbrEozMztEa84Dnwl8WNJa4MP5uJmZtZNuzWkcEYuARfnwNmB86UsyM7Ni+EpMM7NEOcDNzBLlADczS1Sz+sDN2kvVtAXlLqFN1cz8SLlLsCOAj8DNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NENRngknpKWiLpBUkvSbo9n95P0jOS1ubPx7R9uWZmVqeYI/DdwPkRMQKoBi6WdA4wDVgYEcOAhfm4mZm1kyYDPDI789Hu+SOAy4A5+fQ5wMQ2qdDMzBpUVB+4pK6SVgKbgWciYjFwXERsBMifj227Ms3MrL6iAjwi9kVENTAYOEvS8GI3IGmKpGWSlm3ZsqWldZqZWT3NOgslIrYDi4CLgU2SBgDkz5sbWWZWRIyOiNGVlZWtLNfMzOoUcxZKpaS++fB7gQuAV4D5wOS82WTg8bYq0szMDtWtiDYDgDmSupIF/sMR8TNJvwIelnQ98AZwZRvWaWZm9TQZ4BGxChjZwPRtwPi2KMrMzJrmKzHNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLVZIBL+oCk/ytpjaSXJN2cT+8n6RlJa/PnY9q+XDMzq1PMEfhe4H9HxMnAOcBNkk4BpgELI2IYsDAfNzOzdtJkgEfExoj4dT68A1gDDAIuA+bkzeYAE9uqSDMzO1Sz+sAlVQEjgcXAcRGxEbKQB44tdXFmZta4ogNcUi/g34FbIuIPzVhuiqRlkpZt2bKlJTWamVkDigpwSd3JwvuhiPhJPnmTpAH5/AHA5oaWjYhZETE6IkZXVlaWomYzM
6O4s1AE3AesiYjvFMyaD0zOhycDj5e+PDMza0y3ItqMAT4JvChpZT7ty8BM4GFJ1wNvAFe2TYlmZtaQJgM8Ip4D1Mjs8aUtx8zMiuUrMc3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRxdzUOFlV0xaUu4Q2VTPzI+UuwczKyEfgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZopoMcEk/krRZ0uqCaf0kPSNpbf58TNuWaWZm9RVzBD4buLjetGnAwogYBizMx83MrB01GeAR8QvgrXqTLwPm5MNzgIklrsvMzJrQ0j7w4yJiI0D+fGzpSjIzs2K0+YeYkqZIWiZp2ZYtW9p6c2ZmnUZLA3yTpAEA+fPmxhpGxKyIGB0RoysrK1u4OTMzq6+lAT4fmJwPTwYeL005ZmZWrGJOI/wx8CvgJEm1kq4HZgIflrQW+HA+bmZm7ajJmxpHxLWNzBpf4lrMzKwZfCWmmVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiWhXgki6W9BtJr0maVqqizMysaS0OcEldgX8CLgFOAa6VdEqpCjMzs8NrzRH4WcBrEfF6RPwJmAtcVpqyzMysKd1asewgYH3BeC1wdv1GkqYAU/LRnZJ+04ptdnT9ga3ttTHd2V5b6hT8s0vbkf7zO6Ghia0JcDUwLQ6ZEDELmNWK7SRD0rKIGF3uOqz5/LNLW2f9+bWmC6UW+EDB+GBgQ+vKMTOzYrUmwJcCwyQNkdQDuAaYX5qyzMysKS3uQomIvZI+BzwFdAV+FBEvlayyNHWKrqIjlH92aeuUPz9FHNJtbWZmCfCVmGZmiXKAm5klygFuZpao1pwH3qlJ+iDZlaeDyM5/3wDMj4g1ZS3MrBPI//4GAYsjYmfB9Isj4snyVda+fATeApK+SPbVAQKWkJ1SKeDH/lKvtEn6dLlrsMOT9LfA48D/AlZLKvwKj78rT1Xl4bNQWkDSq8CpEbGn3vQewEsRMaw8lVlrSXojIo4vdx3WOEkvAn8VETslVQGPAA9ExF2SVkTEyLIW2I7chdIy+4GBwO/qTR+Qz7MOTNKqxmYBx7VnLdYiXeu6TSKiRtJY4BFJJ9DwV3wcsRzgLXMLsFDSWv78hV7HA0OBz5WtKivWccBFwNv1pgt4vv3LsWZ6U1J1RKwEyI/E/wb4EXBaeUtrXw7wFoiIJyWdSPaVuoPI/vBrgaURsa+sxVkxfgb0qguAQpIWtX851kyfAvYWToiIvcCnJN1bnpLKw33gZmaJ8lkoZmaJcoCbmSXKAW5HPEl9Jd1YMD5W0s+KWG6GpAuaua0aSf1bUqdZcznArTPoC9zYZKt6IuJrEfHzNqjHrCQc4NZhSKqS9IqkH0paLekhSRdI+g9JayWdlbfrJ+kxSask/aek0/Pp0yX9SNIiSa/nV+wBzAT+UtJKSX+fT+sl6ZF8ew9JOuT8YUmzJV2RD9dIul3SryW9mF/KjaQKSU9LWpGfAaGC5T8haUm+3XsldZV0Zl53T0lHSXpJ0vC226t2JHOAW0czFLgLOB34IPAx4EPArcCX8za3Aysi4vR82v0Fy3+Q7Bzvs4CvS+oOTAPWRUR1REzN240kO5//FOAvgDFF1LY1IkYBP8jrAfg68Fx+9d98susBkHQycDUwJiKqgX3AxyNia
d7uDuBbwIMRsbrIfWN2EJ8Hbh3NbyPiRQBJLwELIyLyy6er8jYfAj4KEBHP5kfBffJ5CyJiN7Bb0mYav7JySUTU5ttZma/7uSZq+0n+vByYlA+fVzccEQsk1V0cNB44A1iaH9y/F9icz5tB9v05u4C6/xLMms0Bbh3N7oLh/QXj+/nz72tDl0vXXdBQuPw+Gv8dL7ZdQ8vUb9/QxRQC5kTElxqY1w/oBXQHegLvFLFts0O4C8VS9Avg45CdUULWtfGHw7TfAfRuh1ouAY7Jpy8ErpB0bD6vX/5dHZDdv/GrwEPAnW1Ul3UCPgK3FE0H/iX/Uqo/ApMP1zgituUfhK4GngAWlLCW28m+RvjXwP8D3si3+bKkrwBPS+oC7AFukvTXwN6I+FdJXYHnJZ0fEc+WsCbrJHwpvZlZotyFYmaWKAe4mVmiHOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygJuZJer/A9xpvw0Did7zAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAETCAYAAADNpUayAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAaRklEQVR4nO3de5RU5Z3u8e/DRYmCINAaAWOTgEZEudiiOUQHxYhOgrd4wcQEox7ORDPRrCMJxkSNSyea8ZgxcbxwogGVkREmKonLKw7HOCbcBJGLCiqBDggNXoJGCJff+aN2M2VbTVd3VXfTL89nrV6197vfvfevdsNTu9/au0oRgZmZpaVdaxdgZmbl53A3M0uQw93MLEEOdzOzBDnczcwS5HA3M0uQw93aJEk/lPSrVq7hCUljW7OG5iapUlJI6tDatVjjyNe52+5O0gjgwYjo09q1pE7SSuDSiHg2m68E3gI6RsS21qvMGstn7tbifBZo1vwc7raTpCGSXpK0SdK/S5oq6cZs2UWSXqjTPyT1y6b3lnSrpFWS1km6W9KnsmUjJFVL+oGkt4FfS1osaXTetjpK2iBpcJ197As8AfSS9EH200vS9ZIezPrUDh18S9JqSe9K+gdJx0haJOk9SXfU2e7FkpZlfZ+SdEjWLkk/l7Re0vvZ+gPrOV6zJF2af3yyY/CupLcknbaLYz1B0hvZsV4q6axd9L1e0jRJD2b9X5F0qKSrszpXSzolr38vSTMkvSNphaT/WWdbD0u6P9vWEklV2bIHgM8Av82O8/fzyvh69rvdIOma+mq13YfD3QCQtBfwKPAA0B2YBny1EZu4BTgUGAz0A3oD1+Yt/3S23UOAccD9wIV5y/8eWBsRC/M3GhEfAqcBayKic/azpp4ajgX6A+cD/wJcA5wMHAGcJ+nvsud6JvBD4GygAvg98FC2jVOAE7Ln0i3b1sYij8GxwGtAT+BnwL2SVE/fN4Djga7AT4AHJR20i22PJve72R9YADxF7v9vb+AG4J68vg8B1UAv4BzgnySNzFt+OjA1e34zgDsAIuIbwCpgdHacf5a3zheBw4CRwLWSDt9FrbY7iAj/+AdygbaG7H2YrO1F4MZs+iLghTrrBLkgF/Ah8Lm8ZV8A3sqmRwB/AzrlLe8FbAL2y+anA9+vp7YRQHWdtuvJjcMDVGa19M5bvhE4P2/+P4Ars+kngEvylrUD/kruheck4HXgOKBdA8dsFrnx6drjsyJv2T5ZTZ8u8vgvBM6oZ9n1wDN586OBD4D22XyXbF/dgIOB7UCXvP4/BSblbevZvGUDgI/y5lcCJ+fN1x7bPnltc4Axrf1v1j+7/vGZu9XqBfw5sv+9mT8VuW4FuTCbnw2BvAc8mbXXqomIzbUzkTv7/i/gq5K6kTs7n1LKEwDW5U1/VGC+czZ9CHB7Xq3vkHuB6h0Rz5E7k/1XYJ2kiZL2K3L/b9dORMRfs8nOhTpK+qakhXk1DCR3xl/sc9sQEdvz5mv31Qt4JyI25fX/E7kz/E/USe5FrVMR74PUXafg87Ldh8Pdaq0FetcZRvhM3vSH5AIcAEmfzlu2gVzAHBER3bKfrhGRHwCFLsuaTG5o5lzgDxHx53pqK/clXauB/5VXa7eI+FREvAgQEb+IiKPJDeccCowv586z8f3/C3wH6BER3YDF5F5gSrUG6C6pS17bZ4D6jm1dvnwuEQ53q/UHYBvwXUkdJJ0NDMtb/jJwhKTBkjqR+/MegIjYQS6sfi7pAABJvSWNamCfjwJDgSvIjcHXZx3QQ1LXRj6n+twNXC3pCABJXSWdm00fI+lYSR3JvaBtJjfMUU77kgvRmmyf3yJ35l6yi
FhNbjjtp5I6SToKuITi/ypaB3y2HLVY63K4GwAR8TdybzBeBLxL7o3E3+Qtf53cG3fPAsuBF+ps4gfACuCPkv6S9TusgX1+RG4svG/+vgr0e5Xcm4RvZsMYvRrz3Aps7xFybwBPzWpdTG5YCGA/ci9U75IbztgI3FrK/grsfynwf8i9oK4DjiQ3RFUuF5AbK18DPAJcFxHPFLnuT4EfZcf5qjLWZC3MNzFZvSRNIvdG5o+acR/XAodGxIUNdjazovlmEms1krqTGzL4RmvXYpYaD8tYq8hurFkNPBERz7d2PWap8bCMmVmCfOZuZpYgh7uZWYJ2izdUe/bsGZWVla1dhplZmzJ//vwNEVFRaNluEe6VlZXMmzevtcswM2tTJNX7ESEeljEzS5DD3cwsQQ53M7ME7RZj7ma2+9u6dSvV1dVs3ry54c5WVp06daJPnz507Nix6HUc7mZWlOrqarp06UJlZSX1f8GUlVtEsHHjRqqrq+nbt2/R63lYxsyKsnnzZnr06OFgb2GS6NGjR6P/YnK4m1nRHOytoynH3eFuZlaE9957jzvvvHPn/Jo1azjnnHNasaJd85i7tTmVEx5v7RKa1cqbv9zaJRSl3L+H1nre27dvp3379g32qw33yy67DIBevXoxffr05i6vyXzmbmZtxk033cRhhx3GySefzAUXXMCtt+a+JGvEiBE773LfsGEDtR9nsn37dsaPH88xxxzDUUcdxT333APArFmzOPHEE/na177GkUceyY9//GNuv/32nfu55ppr+MUvfvGxfU+YMIE33niDwYMHM378eFauXMnAgblvR5w0aRJnnnkmo0ePpm/fvtxxxx3cdtttDBkyhOOOO4533nkHgDfeeINTTz2Vo48+muOPP55XX30VgGnTpjFw4EAGDRrECSecUJZj5TN3M2sT5s+fz9SpU1mwYAHbtm1j6NChHH300btc595776Vr167MnTuXLVu2MHz4cE455RQA5syZw+LFi+nbty8rV67k7LPP5oorrmDHjh1MnTqVOXPmfGxbN998M4sXL2bhwoUArFy58mPLFy9ezIIFC9i8eTP9+vXjlltuYcGCBXzve9/j/vvv58orr2TcuHHcfffd9O/fn9mzZ3PZZZfx3HPPccMNN/DUU0/Ru3dv3nvvvbIcL4e7mbUJv//97znrrLPYZ599ADj99NMbXOfpp59m0aJFO4dP3n//fZYvX85ee+3FsGHDdl5aWFlZSY8ePViwYAHr1q1jyJAh9OjRo1H1nXjiiXTp0oUuXbrQtWtXRo8eDcCRRx7JokWL+OCDD3jxxRc599xzd66zZcsWAIYPH85FF13Eeeedx9lnn92o/dbH4W5mbUZ9V4106NCBHTt2AHzsksGI4Je//CWjRo36WP9Zs2ax7777fqzt0ksvZdKkSbz99ttcfPHFja5t77333jndrl27nfPt2rVj27Zt7Nixg27duu0888939913M3v2bB5//HEGDx7MwoULG/3iUpfH3M2sTTjhhBN45JFH+Oijj9i0aRO//e1vdy6rrKxk/vz5AB97k3PUqFHcddddbN26FYDXX3+dDz/8sOD2zzrrLJ588knmzp37iRcDgC5durBp06Ym17/ffvvRt29fpk2bBuReeF5++WUgNxZ/7LHHcsMNN9CzZ09Wr17d5P3U8pm7mbUJQ4cO5fzzz2fw4MEccsghHH/88TuXXXXVVZx33nk88MADnHTSSTvbL730UlauXMnQoUOJCCoqKnj00UcLbn+vvfbixBNPpFu3bgWvnunRowfDhw9n4MCBnHbaaVx++eWNfg5Tpkzh29/+NjfeeCNbt25lzJgxDBo0iPHjx7N8+XIigpEjRzJo0KBGb7uu3eI7VKuqqsKf527F8qWQrWPZsmUcfvjhrV3GTtdffz2dO3fmqquuKsv2duzYwdChQ5k2bRr9+/cvyzbLqdDxlzQ/IqoK9W9wWEbSfZLWS1pcYNlVkkJSz7y2qyWtk
PSapE/+bWNmtptZunQp/fr1Y+TIkbtlsDdFMcMyk4A7gPvzGyUdDHwJWJXXNgAYAxwB9AKelXRoRGwvV8FmZpA7cy+XAQMG8Oabb5Zte7uDBs/cI+J54J0Ci34OfB/IH9c5A5gaEVsi4i1gBTCsHIWamVnxmnS1jKTTgT9HxMt1FvUG8t/mrc7azCwBu8N7dHuiphz3Roe7pH2Aa4BrCy0u0FawKknjJM2TNK+mpqaxZZhZC+vUqRMbN250wLew2s9z79SpU6PWa8qlkJ8D+gIvZzcU9AFekjSM3Jn6wXl9+wBr6il4IjARclfLNKEOM2tBffr0obq6Gp+Mtbzab2JqjEaHe0S8AhxQOy9pJVAVERskzQD+TdJt5N5Q7Q/MKbghM2tTOnbs2KhvArLWVcylkA8BfwAOk1Qt6ZL6+kbEEuBhYCnwJHC5r5QxM2t5DZ65R8QFDSyvrDN/E3BTaWWZmVkp/NkyZmYJcribmSXI4W5mliCHu5lZghzuZmYJcribmSXI4W5mlqA98puY/GUPZpY6n7mbmSXI4W5mliCHu5lZghzuZmYJcribmSXI4W5mliCHu5lZghzuZmYJcribmSXI4W5mliCHu5lZgor5guz7JK2XtDiv7Z8lvSppkaRHJHXLW3a1pBWSXpM0qrkKNzOz+hVz5j4JOLVO2zPAwIg4CngduBpA0gBgDHBEts6dktqXrVozMytKg+EeEc8D79RpezoitmWzfwT6ZNNnAFMjYktEvAWsAIaVsV4zMytCOcbcLwaeyKZ7A6vzllVnbWZm1oJKCndJ1wDbgCm1TQW6RT3rjpM0T9K8mpqaUsowM7M6mhzuksYCXwG+HhG1AV4NHJzXrQ+wptD6ETExIqoioqqioqKpZZiZWQFNCndJpwI/AE6PiL/mLZoBjJG0t6S+QH9gTullmplZYzT4NXuSHgJGAD0lVQPXkbs6Zm/gGUkAf4yIf4iIJZIeBpaSG665PCK2N1fxZmZWWIPhHhEXFGi+dxf9bwJuKqUoMzMrje9QNTNLkMPdzCxBDnczswQ53M3MEuRwNzNLkMPdzCxBDnczswQ53M3MEuRwNzNLkMPdzCxBDnczswQ53M3MEuRwNzNLkMPdzCxBDnczswQ53M3MEuRwNzNLkMPdzCxBDnczswQ1GO6S7pO0XtLivLbukp6RtDx73D9v2dWSVkh6TdKo5irczMzqV8yZ+yTg1DptE4CZEdEfmJnNI2kAMAY4IlvnTknty1atmZkVpcFwj4jngXfqNJ8BTM6mJwNn5rVPjYgtEfEWsAIYVqZazcysSE0dcz8wItYCZI8HZO29gdV5/aqzNjMza0HlfkNVBdqiYEdpnKR5kubV1NSUuQwzsz1bU8N9naSDALLH9Vl7NXBwXr8+wJpCG4iIiRFRFRFVFRUVTSzDzMwKaWq4zwDGZtNjgcfy2sdI2ltSX6A/MKe0Es3MrLE6NNRB0kPACKCnpGrgOuBm4GFJlwCrgHMBImKJpIeBpcA24PKI2N5MtZuZWT0aDPeIuKCeRSPr6X8TcFMpRZmZWWl8h6qZWYIc7mZmCXK4m5klyOFuZpYgh7uZWYIc7mZmCXK4m5klyOFuZpYgh7uZWYIc7mZmCXK4m5klyOFuZpYgh7uZWYIc7mZmCXK4m5klyOFuZpYgh7uZWYIc7mZmCXK4m5klqKRwl/Q9SUskLZb0kKROkrpLekbS8uxx/3IVa2ZmxWlyuEvqDXwXqIqIgUB7YAwwAZgZEf2Bmdm8mZm1oFKHZToAn5LUAdgHWAOcAUzOlk8GzixxH2Zm1khNDveI+DNwK7AKWAu8HxFPAwdGxNqsz1rggELrSxonaZ6keTU1NU0tw8zMCihlWGZ/cmfpfYFewL6SLix2/YiYGBFVEVFVUVHR1DLMzKyAUoZlTgbeioiaiNgK/Ab4H8A6SQcBZI/rSy/TzMwao5RwXwUcJ2kfSQJGAsuAGcDYrM9Y4LHSSjQzs8bq0NQVI2K2p
OnAS8A2YAEwEegMPCzpEnIvAOeWo1AzMytek8MdICKuA66r07yF3Fm8mZm1Et+hamaWIIe7mVmCHO5mZglyuJuZJcjhbmaWIIe7mVmCHO5mZglyuJuZJcjhbmaWIIe7mVmCHO5mZglyuJuZJcjhbmaWIIe7mVmCHO5mZglyuJuZJcjhbmaWIIe7mVmCHO5mZgkqKdwldZM0XdKrkpZJ+oKk7pKekbQ8e9y/XMWamVlxSj1zvx14MiI+DwwClgETgJkR0R+Ymc2bmVkLanK4S9oPOAG4FyAi/hYR7wFnAJOzbpOBM0st0szMGqeUM/fPAjXAryUtkPQrSfsCB0bEWoDs8YBCK0saJ2mepHk1NTUllGFmZnWVEu4dgKHAXRExBPiQRgzBRMTEiKiKiKqKiooSyjAzs7pKCfdqoDoiZmfz08mF/TpJBwFkj+tLK9HMzBqryeEeEW8DqyUdljWNBJYCM4CxWdtY4LGSKjQzs0brUOL6/whMkbQX8CbwLXIvGA9LugRYBZxb4j7MzKyRSgr3iFgIVBVYNLKU7ZqZWWl8h6qZWYIc7mZmCXK4m5klyOFuZpYgh7uZWYIc7mZmCXK4m5klyOFuZpYgh7uZWYIc7mZmCXK4m5klyOFuZpYgh7uZWYIc7mZmCXK4m5klyOFuZpYgh7uZWYIc7mZmCXK4m5klqORwl9Re0gJJv8vmu0t6RtLy7HH/0ss0M7PGKMeZ+xXAsrz5CcDMiOgPzMzmzcysBZUU7pL6AF8GfpXXfAYwOZueDJxZyj7MzKzxSj1z/xfg+8COvLYDI2ItQPZ4QKEVJY2TNE/SvJqamhLLMDOzfE0Od0lfAdZHxPymrB8REyOiKiKqKioqmlqGmZkV0KGEdYcDp0v6e6ATsJ+kB4F1kg6KiLWSDgLWl6NQMzMrXpPP3CPi6ojoExGVwBjguYi4EJgBjM26jQUeK7lKMzNrlOa4zv1m4EuSlgNfyubNzKwFlTIss1NEzAJmZdMbgZHl2K6ZmTWN71A1M0uQw93MLEEOdzOzBDnczcwS5HA3M0uQw93MLEEOdzOzBDnczcwS5HA3M0uQw93MLEEOdzOzBDnczcwS5HA3M0uQw93MLEEOdzOzBDnczcwS5HA3M0uQw93MLEEOdzOzBDU53CUdLOk/JS2TtETSFVl7d0nPSFqePe5fvnLNzKwYpZy5bwP+d0QcDhwHXC5pADABmBkR/YGZ2byZmbWgJod7RKyNiJey6U3AMqA3cAYwOes2GTiz1CLNzKxxyjLmLqkSGALMBg6MiLWQewEADqhnnXGS5kmaV1NTU44yzMwsU3K4S+oM/AdwZUT8pdj1ImJiRFRFRFVFRUWpZZiZWZ6Swl1SR3LBPiUifpM1r5N0ULb8IGB9aSWamVljlXK1jIB7gWURcVveohnA2Gx6LPBY08szM7Om6FDCusOBbwCvSFqYtf0QuBl4WNIlwCrg3NJKNDOzxmpyuEfEC4DqWTyyqds1M7PS+Q5VM7MEOdzNzBLkcDczS5DD3cwsQQ53M7MEOdzNzBLkcDczS5DD3cwsQQ53M7MEOdzNzBLkcDczS5DD3cwsQQ53M7MEOdzNzBLkcDczS5DD3cwsQQ53M7MElfI1e2ZmjVI54fHWLqFZrbz5y61dwk4+czczS1CzhbukUyW9JmmFpAnNtR8zM/ukZgl3Se2BfwVOAwYAF0ga0Bz7MjOzT2quM/dhwIqIeDMi/gZMBc5opn2ZmVkdzfWGam9gdd58NXBsfgdJ44Bx2ewHkl5rplp2Bz2BDS21M93SUnvaY/j313al/rs7pL4FzRXuKtAWH5uJmAhMbKb971YkzYuIqtauw5rGv7+2a0/+3TXXsEw1cHDefB9gTTPty8zM6miucJ8L9JfUV9JewBhgRjPty8zM6miWYZmI2CbpO8BTQHvgvohY0hz7aiP2iOGnhPn313btsb87RUTDvczMrE3xHapmZglyuJuZJcjhbmaWIH8qZDOQ9Hlyd
+T2Jnd9/xpgRkQsa9XCzBKX/d/rDcyOiA/y2k+NiCdbr7KW5zP3MpP0A3IftyBgDrnLQgU85A9Qa9skfau1a7D6Sfou8Bjwj8BiSfkfefJPrVNV6/HVMmUm6XXgiIjYWqd9L2BJRPRvncqsVJJWRcRnWrsOK0zSK8AXIuIDSZXAdOCBiLhd0oKIGNKqBbYwD8uU3w6gF/CnOu0HZctsNyZpUX2LgANbshZrtPa1QzERsVLSCGC6pEMo/JEoSXO4l9+VwExJy/nvD0/7DNAP+E6rVWXFOhAYBbxbp13Aiy1fjjXC25IGR8RCgOwM/ivAfcCRrVtay3O4l1lEPCnpUHIfe9ybXChUA3MjYnurFmfF+B3QuTYg8kma1fLlWCN8E9iW3xAR24BvSrqndUpqPR5zNzNLkK+WMTNLkMPdzCxBDnfbY0nqJumyvPkRkn5XxHo3SDq5kftaKalnU+o0awqHu+3JugGXNdirjoi4NiKebYZ6zMrG4W67PUmVkl6V9CtJiyVNkXSypP+StFzSsKxfd0mPSlok6Y+Sjsrar5d0n6RZkt7M7mQEuBn4nKSFkv45a+ssaXq2vymSPnF9tKRJks7JpldK+omklyS9kt3+jqQekp6WtCC7UkN5618oaU6233sktZd0TFZ3J0n7SloiaWDzHVVLncPd2op+wO3AUcDnga8BXwSuAn6Y9fkJsCAijsra7s9b//Pkrl8fBlwnqSMwAXgjIgZHxPis3xBy9yoMAD4LDC+itg0RMRS4K6sH4DrgheyuyBnk7nVA0uHA+cDwiBgMbAe+HhFzs343Aj8DHoyIxUUeG7NP8HXu1la8FRGvAEhaAsyMiMhuOa/M+nwR+CpARDyXnT13zZY9HhFbgC2S1lP/3aZzIqI628/CbNsvNFDbb7LH+cDZ2fQJtdMR8bik2puiRgJHA3OzPwo+BazPlt1A7rOINgO1f12YNYnD3dqKLXnTO/Lmd/Df/44L3WJeeyNH/vrbqf/ffrH9Cq1Tt3+hm0gETI6Iqwss6w50BjoCnYAPi9i3WUEelrGUPA98HXJXvpAbLvnLLvpvArq0QC2nAftn7TOBcyQdkC3rnn32CeS+7/PHwBTglmaqy/YQPnO3lFwP/Dr78K+/AmN31TkiNmZvyi4GngAeL2MtPyH3Mc8vAf8PWJXtc6mkHwFPS2oHbAUul/R3wLaI+DdJ7YEXJZ0UEc+VsSbbg/jjB8zMEuRhGTOzBDnczcwS5HA3M0uQw93MLEEOdzOzBDnczcwS5HA3M0uQw93MLEH/H3CvlMzJg9ejAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAETCAYAAAAveV3LAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAXz0lEQVR4nO3de5QV5Z3u8e/DLRhBkKY1XKLNBDQqSoN4mUV0QIyXM4Mi8ZobJno4J5ozmnUkITEXZJkJZnIyY2ISZWIGvEwYcaKSuLwFh0kcM9wCYitGxHSkB4QGJQEjBOjf+aOqyabppnd37+7mpZ/PWnvtqrfeqvrtani6+t1VeysiMDOz9HTr7ALMzKx1HOBmZolygJuZJcoBbmaWKAe4mVmiHOBmZolygNshTdKXJP2wk2t4QtLUzqyhvUmqkBSSenR2LVY8+TpwO1RIGg88EBFDO7uWw52kauD6iPh5Pl8B/BboGRF7Oq8yawmfgVu78dmcWftygHdBkkZL+rWk7ZL+VdJ8Sbfny66V9FyD/iFpeD79HknfkvSGpE2S7pZ0RL5svKQaSV+Q9Cbwz5KqJE0q2FZPSVskVTbYx5HAE8BgSTvyx2BJMyU9kPep/zP/U5LWS3pb0v+WdIak1ZK2SbqrwXY/LWlN3vcpScfn7ZL0D5I2S/p9vv7IJo7XYknXFx6f/Bi8Lem3ki4+yLGeIWldfqxflnTZQfrOlLRA0gN5/xclnSDpi3md6yVdUNB/sKSFkt6S9Jqk/9lgWw9Jui/f1kuSxubL7geOA36aH+fPF5Txsfxnu0XSrU3VaocGB3gXI6kX8ChwPzAAWAB8pAWbuAM4AagEhgNDgK8WLH9fvt3jgWnAfcDHC5b/D2BjRKwq3GhEvANcDGyIiD75Y0MTNZwFjACuAv4RuBU4HzgFuFLSX+WvdTLwJWAKUA78Evhxvo0LgHPz19I/39bWIo/BWcBvgIHAN4F7JamJvuuAc4B+wG3AA5IGHWTbk8h+NkcDK4GnyP6fDgFmAfcU9P0xUAMMBi4H/k7SxILllwDz89e3ELgLICI+AbwBTMqP8zcL1vkQcCIwEfiqpJMOUqt1tojwows9yEJrA/n7H3nb88Dt+fS1wHMN1gmysBbwDvCBgmV/Cfw2nx4P/AnoXbB8MLAdOCqffxj4fBO1jQdqGrTNJBsXB6jIaxlSsHwrcFXB/L8BN+fTTwDXFSzrBvyR7JfLecCrwNlAt2aO2WKy8eL64/NawbL35jW9r8jjvwq4tIllM4FnCuYnATuA7vl833xf/YH3A3uBvgX9vwHMLdjWzwuWnQy8WzBfDZxfMF9/bIcWtC0Fru7sf7N+NP3wGXjXMxj478j/h+Z+V+S65WSBtSIfrtgGPJm316uNiJ31M5GdRf8n8BFJ/cnOsh9sywsANhVMv9vIfJ98+njgzoJa3yL7JTQkIp4lOyP9HrBJ0hxJRxW5/zfrJyLij/lkn8Y6SvqkpFUFNYwkO3Mv9rVtiYi9BfP1+xoMvBUR2wv6/47sTP2AOsl+cfUu4n2Jhus0+rrs0OAA73o2AkMa/Ml/XMH0O2QhDYCk9xUs20IWIqdERP/80S8iCv+TN3ZZ0zyyYZQrgF9FxH83UVupL4laD/yvglr7R8QREfE8QER8JyJOJxt6OQGYXsqd5+Pt/wR8FiiLiP5AFdkvkbbaAAyQ1Leg7TigqWPbkC8/Oww4wLueXwF7gL+V1EPSFODMguUvAKdIqpTUm+xPcQAioo4skP5B0jEAkoZIurCZfT4KjAFuIhsTb8omoExSvxa+pqbcDXxR0ikAkvpJuiKfPkPSWZJ6kv3S2kk2JFFKR5IFZW2+z0+Rn
YG3WUSsJxv6+oak3pJOA66j+L9uNgF/UYparPM4wLuYiPgT2Zt61wJvk71595OC5a+SvVn2c2At8FyDTXwBeA34L0l/yPud2Mw+3yUbmx5WuK9G+r1C9sbc6/mQw+CWvLZGtvcI2Zuu8/Naq8iGcACOIvtl9DbZ0MNW4Ftt2V8j+38Z+H9kvzQ3AaeSDSeVyjVkY9cbgEeAr0XEM0Wu+w3gy/lxvqWENVkH8o08hqS5ZG8efrkd9/FV4ISI+Hiznc2sKL7RwtqdpAFkf95/orNrMTuceAjF2lV+c8l64ImI+EVn12N2OPEQiplZonwGbmaWKAe4mVmiOvRNzIEDB0ZFRUVH7tLMLHkrVqzYEhHlDds7NMArKipYvnx5R+7SzCx5khr9uAsPoZiZJcoBbmaWKAe4mVmifCemme1n9+7d1NTUsHPnzuY7W0n17t2boUOH0rNnz6L6O8DNbD81NTX07duXiooKmv6iISu1iGDr1q3U1NQwbNiwotYpaghFUnX+/XyrJC3P2wZIekbS2vz56DbUbmaHiJ07d1JWVubw7mCSKCsra9FfPi0ZA58QEZURMTafnwEsiogRwKJ83swOAw7vztHS496WNzEvJfumFfLnyW3YlpnZIWHbtm18//vf3ze/YcMGLr/88k6sqGnFjoEH8LSkAO6JiDnAsRGxESAiNtZ/Q0tDkqaRfTs5xx13XGNdzA5QMePxzi6hXVXP/uvOLqFopf5ZdNZr37t3L927d2+2X32A33DDDQAMHjyYhx9+uL3La5Viz8DHRcQYsm8zuVHSucXuICLmRMTYiBhbXn7AnaBmZgf4+te/zoknnsj555/PNddcw7e+lX1Z0vjx4/fdzb1lyxbqP5pj7969TJ8+nTPOOIPTTjuNe+65B4DFixczYcIEPvrRj3Lqqafyla98hTvvvHPffm699Va+853v7LfvGTNmsG7dOiorK5k+fTrV1dWMHJl9E97cuXOZPHkykyZNYtiwYdx11118+9vfZvTo0Zx99tm89dZbAKxbt46LLrqI008/nXPOOYdXXnkFgAULFjBy5EhGjRrFuecWHaNNKuoMPP9mcSJis6RHyL5DcZOkQfnZ9yBgc5urMbMub8WKFcyfP5+VK1eyZ88exowZw+mnn37Qde6991769evHsmXL2LVrF+PGjeOCCy4AYOnSpVRVVTFs2DCqq6uZMmUKN910E3V1dcyfP5+lS5fut63Zs2dTVVXFqlWrAKiurt5veVVVFStXrmTnzp0MHz6cO+64g5UrV/K5z32O++67j5tvvplp06Zx9913M2LECJYsWcINN9zAs88+y6xZs3jqqacYMmQI27Zta/OxajbAJR0JdIuI7fn0BWTfmbgQmArMzp8fa3M1Ztbl/fKXv+Syyy7jve99LwCXXHJJs+s8/fTTrF69et9Qx+9//3vWrl1Lr169OPPMM/ddlldRUUFZWRkrV65k06ZNjB49mrKyshbVN2HCBPr27Uvfvn3p168fkyZNAuDUU09l9erV7Nixg+eff54rrrhi3zq7du0CYNy4cVx77bVceeWVTJkypUX7bUwxZ+DHAo/k7472AP4lIp6UtAx4SNJ1wBvAFQfZhplZ0Zq6GqNHjx7U1dUB7He5XUTw3e9+lwsvvHC//osXL+bII4/cr+36669n7ty5vPnmm3z6059ucW3vec979k1369Zt33y3bt3Ys2cPdXV19O/ff98ZfKG7776bJUuW8Pjjj1NZWcmqVata/AukULNj4BHxekSMyh+nRMTX8/atETExIkbkz2+1ugozs9y5557LI488wrvvvsv27dv56U9/um9ZRUUFK1asANjvjcULL7yQH/zgB+zevRuAV199lXfeeafR7V922WU8+eSTLFu27IDAB+jbty/bt29vdf1HHXUUw4YNY8GCBUD2y+WFF14AsrHxs846i1mzZjFw4EDWr1/f6v2A78Q0s0PMmDFjuOqqq6isrOT444/nnHPO2bfsl
ltu4corr+T+++/nvPPO29d+/fXXU11dzZgxY4gIysvLefTRRxvdfq9evZgwYQL9+/dv9KqUsrIyxo0bx8iRI7n44ou58cYbW/waHnzwQT7zmc9w++23s3v3bq6++mpGjRrF9OnTWbt2LRHBxIkTGTVqVIu3XahDvxNz7Nix4c8Dt2L4MsLOs2bNGk466aTOLmOfmTNn0qdPH2655ZaSbK+uro4xY8awYMECRowYUZJtllJjx1/SioKbKPfxpxGaWZfx8ssvM3z4cCZOnHhIhndLHdZDKD6LM0vfzJkzS7atk08+mddff71k2+tsPgM3M0uUA9zMDtCR743Zn7X0uDvAzWw/vXv3ZuvWrQ7xDlb/eeC9e/cuep3DegzczFpu6NCh1NTUUFtb2+pt1Lz9bgkrOvQMPfqIdtlu/TfyFMsBbmb76dmzZ9HfCNOUi30BQYfwEIqZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klqugAl9Rd0kpJP8vnB0h6RtLa/Pno9ivTzMwaaskZ+E3AmoL5GcCiiBgBLMrnzcysgxQV4JKGAn8N/LCg+VJgXj49D5hc2tLMzOxgij0D/0fg80BdQduxEbERIH8+psS1mZnZQTQb4JL+BtgcEStaswNJ0yQtl7S8tra2NZswM7NGFHMGPg64RFI1MB84T9IDwCZJgwDy582NrRwRcyJibESMLS8vL1HZZmbWbIBHxBcjYmhEVABXA89GxMeBhcDUvNtU4LF2q9LMzA7QluvAZwMflrQW+HA+b2ZmHaRHSzpHxGJgcT69FZhY+pLMzKwYvhPTzCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLlAPczCxRDnAzs0Q5wM3MEuUANzNLVLMBLqm3pKWSXpD0kqTb8vYBkp6RtDZ/Prr9yzUzs3rFnIHvAs6LiFFAJXCRpLOBGcCiiBgBLMrnzcysgzQb4JHZkc/2zB8BXArMy9vnAZPbpUIzM2tUUWPgkrpLWgVsBp6JiCXAsRGxESB/Pqb9yjQzs4aKCvCI2BsRlcBQ4ExJI4vdgaRpkpZLWl5bW9vaOs3MrIEWXYUSEduAxcBFwCZJgwDy581NrDMnIsZGxNjy8vI2lmtmZvWKuQqlXFL/fPoI4HzgFWAhMDXvNhV4rL2KNDOzA/Uoos8gYJ6k7mSB/1BE/EzSr4CHJF0HvAFc0Y51mplZA80GeESsBkY30r4VmNgeRZmZWfN8J6aZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaIc4GZmiXKAm5klygFuZpYoB7iZWaKaDXBJ75f075LWSHpJ0k15+wBJz0hamz8f3f7lmplZvWLOwPcA/zciTgLOBm6UdDIwA1gUESOARfm8mZl1kGYDPCI2RsSv8+ntwBpgCHApMC/vNg+Y3F5FmpnZgVo0Bi6pAhgNLAGOjYiNkIU8cEypizMzs6YVHeCS+gD/BtwcEX9owXrTJC2XtLy2trY1NZqZWSOKCnBJPcnC+8GI+EnevEnSoHz5IGBzY+tGxJyIGBsRY8vLy0tRs5mZUdxVKALuB
dZExLcLFi0EpubTU4HHSl+emZk1pUcRfcYBnwBelLQqb/sSMBt4SNJ1wBvAFe1TopmZNabZAI+I5wA1sXhiacsxM7Ni+U5MM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUQ5wM7NEOcDNzBLlADczS5QD3MwsUc0GuKQfSdosqaqgbYCkZyStzZ+Pbt8yzcysoWLOwOcCFzVomwEsiogRwKJ83szMOlCzAR4RvwDeatB8KTAvn54HTC5xXWZm1ozWjoEfGxEbAfLnY0pXkpmZFaPd38SUNE3ScknLa2tr23t3ZmZdRmsDfJOkQQD58+amOkbEnIgYGxFjy8vLW7k7MzNrqLUBvhCYmk9PBR4rTTlmZlasYi4j/DHwK+BESTWSrgNmAx+WtBb4cD5vZmYdqEdzHSLimiYWTSxxLWZm1gK+E9PMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFEOcDOzRDnAzcwS5QA3M0uUA9zMLFFtCnBJF0n6jaTXJM0oVVFmZta8Vge4pO7A94CLgZOBaySdXKrCzMzs4NpyBn4m8FpEvB4RfwLmA5eWpiwzM2tOjzasOwRYXzBfA5zVsJOkacC0fHaHpN+0YZ+HuoHAlo7ame7oqD11Cf7Zpe1w//kd31hjWwJcjbTFAQ0Rc4A5bdhPMiQtj4ixnV2HtZx/dmnrqj+/tgyh1ADvL5gfCmxoWzlmZlastgT4MmCEpGGSegFXAwtLU5aZmTWn1UMoEbFH0meBp4DuwI8i4qWSVZamLjFUdJjyzy5tXfLnp4gDhq3NzCwBvhPTzCxRDnAzs0Q5wM3MEtWW68C7NEkfJLvzdAjZ9e8bgIURsaZTCzPrAvL/f0OAJRGxo6D9ooh4svMq61g+A28FSV8g++gAAUvJLqkU8GN/qFfaJH2qs2uwg5P0t8BjwP8BqiQVfoTH33VOVZ3DV6G0gqRXgVMiYneD9l7ASxExonMqs7aS9EZEHNfZdVjTJL0I/GVE7JBUATwM3B8Rd0paGRGjO7XADuQhlNapAwYDv2vQPihfZocwSaubWgQc25G1WKt0rx82iYhqSeOBhyUdT+Mf8XHYcoC3zs3AIklr+fMHeh0HDAc+22lVWbGOBS4E3m7QLuD5ji/HWuhNSZURsQogPxP/G+BHwKmdW1rHcoC3QkQ8KekEso/UHUL2H78GWBYRezu1OCvGz4A+9QFQSNLiji/HWuiTwJ7ChojYA3xS0j2dU1Ln8Bi4mVmifBWKmVmiHOBmZolygNthT1J/STcUzI+X9LMi1psl6fwW7qta0sDW1GnWUg5w6wr6Azc026uBiPhqRPy8HeoxKwkHuB0yJFVIekXSDyVVSXpQ0vmS/lPSWkln5v0GSHpU0mpJ/yXptLx9pqQfSVos6fX8jj2A2cAHJK2S9Pd5Wx9JD+f7e1DSAdcPS5or6fJ8ulrSbZJ+LenF/FZuJJVJelrSyvwKCBWs/3FJS/P93iOpu6Qz8rp7SzpS0kuSRrbfUbXDmQPcDjXDgTuB04APAh8FPgTcAnwp73MbsDIiTsvb7itY/4Nk13ifCXxNUk9gBrAuIiojYnrebzTZ9fwnA38BjCuiti0RMQb4QV4PwNeA5/K7/xaS3Q+ApJOAq4BxEVEJ7AU+FhHL8n63A98EHoiIqiKPjdl+fB24HWp+GxEvA
kh6CVgUEZHfPl2R9/kQ8BGAiHg2Pwvuly97PCJ2AbskbabpOyuXRkRNvp9V+bafa6a2n+TPK4Ap+fS59dMR8bik+puDJgKnA8vyk/sjgM35sllkn5+zE6j/K8GsxRzgdqjZVTBdVzBfx5//vTZ2u3T9DQ2F6++l6X/jxfZrbJ2G/Ru7mULAvIj4YiPLBgB9gJ5Ab+CdIvZtdgAPoViKfgF8DLIrSsiGNv5wkP7bgb4dUMvFwNF5+yLgcknH5MsG5J/VAdn3N34FeBC4o53qsi7AZ+CWopnAP+cfSvVHYOrBOkfE1vyN0CrgCeDxEtZyG9nHCP8a+A/gjXyfL0v6MvC0pG7AbuBGSX8F7ImIf5HUHXhe0nkR8WwJa7IuwrfSm5klykMoZmaJcoCbmSXKAW5mligHuJlZohzgZmaJcoCbmSXKAW5mligHuJlZov4/H5fCMrkux94AAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ + "# put all query times in a month for all valid users in a data frame\n", + "start = 0\n", + "\n", "for i in range (len(query_month)):\n", " if query_month[i] is not NaN:\n", - " query_month_df = pd.DataFrame(data = {'query times':query_month[i]})\n", - " query_month_df.plot(xlabel='month index',kind = 'bar',title='query times in a month')" + " query_month_df = pd.DataFrame({'user'+str(i+1):query_month[i]}).rename_axis('month indicies').reset_index()\n", + " query_month_df.set_index(['month indicies'], inplace=True)\n", + " start = i\n", + " break\n", + "query_month_df\n", + "\n", + " \n", + "for t in range(start+1,len(query_month)):\n", + " if query_month[t] is not NaN:\n", + " new_month_df = pd.DataFrame({'user'+str(t+1):query_month[t]}).rename_axis('month indicies').reset_index()\n", + " new_month_df.set_index(['month indicies'], inplace=True)\n", + " query_month_df = query_month_df.join(new_month_df,how='outer')\n", + "\n", + "query_month_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "naked-february", + "metadata": {}, + "outputs": [], + "source": [ + "# plot a graph for all valid users\n", + "graph_month = query_month_df.plot(kind='bar',figsize=(12,8),title='query times in a month',fontsize=18)\n", + "graph_month.title.set_size(20)\n", + "plt.xlabel('month indicies',fontsize=16)\n", + "plt.ylabel('query times', fontsize=16)\n", + "graph_day.yaxis.set_major_locator(MaxNLocator(integer=True))" + ] + }, + { + "cell_type": "markdown", + "id": "satisfied-essay", + "metadata": {}, + "source": [ + "### Get query times for all valid users in January" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "korean-vintage", + "metadata": {}, + "outputs": [], + "source": [ + "date_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "oriented-possibility", + "metadata": {}, 
+ "outputs": [], + "source": [ + "# plot data graph\n", + "graph_date = date_df.plot(kind='bar',figsize=(20,10),title='query times in January',fontsize=18)\n", + "graph_date.title.set_size(20)\n", + "plt.xlabel('date',fontsize=16)\n", + "plt.ylabel('query times', fontsize=16)\n", + "graph_date.yaxis.set_major_locator(MaxNLocator(integer=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "alpha-vermont", + "metadata": {}, + "outputs": [], + "source": [ + "# subplots on January query times\n", + "date_ax_arr = date_df.plot(subplots=True,layout=(2,4),kind='bar',figsize=(16,8),fontsize=15,sharey =True)\n", + "\n", + "for ax in date_ax_arr[-1]:\n", + " ax.set_xlabel(\"date\", fontsize=16)\n", + " \n", + "for ax_arr in date_ax_arr:\n", + " ax_arr[0].yaxis.set_major_locator(MaxNLocator(integer=True))\n", + " ax_arr[0].set_ylabel(\"query times\", fontsize=16)\n", + " \n", + "date_ax_arr[-1][0].set_xticks(list(range(0,len(date_df.index),5)))\n", + "date_ax_arr[-1][0].set_xticklabels(list(range(date_df.index[0],date_df.index[-1]+1,5)))" ] } ], diff --git a/tour_model_eval/v-measurel_all_bins_single_user.ipynb b/tour_model_eval/v-measurel_all_bins_single_user.ipynb index cf2aa3a..0b0e3f0 100644 --- a/tour_model_eval/v-measurel_all_bins_single_user.ipynb +++ b/tour_model_eval/v-measurel_all_bins_single_user.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "republican-pleasure", + "id": "level-offering", "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ { "cell_type": "code", "execution_count": null, - "id": "decent-passion", + "id": "significant-leone", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "preliminary-example", + "id": "graduate-oliver", "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "executive-heather", + "id": "quality-aerospace", "metadata": {}, "outputs": 
[], "source": [ @@ -66,17 +66,17 @@ { "cell_type": "code", "execution_count": null, - "id": "genuine-shipping", + "id": "awful-career", "metadata": {}, "outputs": [], "source": [ - "user = all_users[6]" + "user = all_users[4]" ] }, { "cell_type": "code", "execution_count": null, - "id": "sorted-juvenile", + "id": "designing-fourth", "metadata": {}, "outputs": [], "source": [ @@ -87,7 +87,7 @@ { "cell_type": "code", "execution_count": null, - "id": "primary-friendly", + "id": "interested-carroll", "metadata": {}, "outputs": [], "source": [ @@ -99,31 +99,33 @@ { "cell_type": "code", "execution_count": null, - "id": "medical-spider", + "id": "adverse-female", "metadata": {}, "outputs": [], "source": [ "# filter out trips that are not fully labeled(contain NaN in user_input)\n", - "valid_trips = [t for t in non_empty_trips if 'mode_confirm' in t[\"data\"][\"user_input\"] and \n", - " 'purpose_confirm'in t[\"data\"][\"user_input\"] and 'replaced_mode' in t[\"data\"][\"user_input\"]]\n", + "non_empty_trips_df = pd.DataFrame(t[\"data\"][\"user_input\"]for t in non_empty_trips)\n", + "valid_trips_df = non_empty_trips_df.dropna(axis=0,how='any',thresh=None,subset=None,inplace=False)\n", + "valid_trips_idx_ls = valid_trips_df.index.tolist()\n", + "valid_trips = [non_empty_trips[i]for i in valid_trips_idx_ls]\n", "len(valid_trips),valid_trips" ] }, { "cell_type": "code", "execution_count": null, - "id": "latest-reconstruction", + "id": "proof-ridge", "metadata": {}, "outputs": [], "source": [ "sim = similarity.similarity(valid_trips, radius)\n", - "sim.data" + "len(sim.data)" ] }, { "cell_type": "code", "execution_count": null, - "id": "stone-omega", + "id": "fuzzy-mexico", "metadata": {}, "outputs": [], "source": [ @@ -133,7 +135,7 @@ { "cell_type": "code", "execution_count": null, - "id": "individual-insert", + "id": "fourth-disposition", "metadata": {}, "outputs": [], "source": [ @@ -152,7 +154,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
"continued-borough", + "id": "ranging-composition", "metadata": {}, "outputs": [], "source": [ @@ -162,7 +164,7 @@ { "cell_type": "code", "execution_count": null, - "id": "outside-fairy", + "id": "royal-elephant", "metadata": {}, "outputs": [], "source": [ @@ -175,7 +177,7 @@ }, { "cell_type": "markdown", - "id": "extraordinary-penalty", + "id": "unnecessary-significance", "metadata": {}, "source": [ "### Original output" @@ -184,7 +186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "advised-wiring", + "id": "played-passage", "metadata": {}, "outputs": [], "source": [ @@ -195,7 +197,7 @@ { "cell_type": "code", "execution_count": null, - "id": "careful-sally", + "id": "collected-consensus", "metadata": {}, "outputs": [], "source": [ @@ -207,7 +209,7 @@ { "cell_type": "code", "execution_count": null, - "id": "elect-hardwood", + "id": "reverse-hollow", "metadata": {}, "outputs": [], "source": [ @@ -219,7 +221,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dynamic-grace", + "id": "sealed-georgia", "metadata": {}, "outputs": [], "source": [ @@ -231,7 +233,7 @@ { "cell_type": "code", "execution_count": null, - "id": "romance-chrome", + "id": "occupational-attention", "metadata": {}, "outputs": [], "source": [ @@ -241,7 +243,7 @@ { "cell_type": "code", "execution_count": null, - "id": "further-moscow", + "id": "automatic-rebate", "metadata": {}, "outputs": [], "source": [ @@ -256,7 +258,7 @@ { "cell_type": "code", "execution_count": null, - "id": "welsh-trustee", + "id": "raised-hundred", "metadata": {}, "outputs": [], "source": [ @@ -271,7 +273,7 @@ { "cell_type": "code", "execution_count": null, - "id": "minimal-fever", + "id": "noticed-default", "metadata": {}, "outputs": [], "source": [ @@ -284,7 +286,7 @@ { "cell_type": "code", "execution_count": null, - "id": "corporate-missile", + "id": "municipal-rally", "metadata": {}, "outputs": [], "source": [ @@ -297,7 +299,7 @@ { "cell_type": "code", "execution_count": null, - "id": "radio-meter", 
+ "id": "flying-hazard", "metadata": {}, "outputs": [], "source": [ @@ -308,7 +310,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fancy-mount", + "id": "outstanding-differential", "metadata": {}, "outputs": [], "source": [ @@ -318,7 +320,7 @@ { "cell_type": "code", "execution_count": null, - "id": "brown-traveler", + "id": "danish-glasgow", "metadata": {}, "outputs": [], "source": [ @@ -328,7 +330,7 @@ { "cell_type": "code", "execution_count": null, - "id": "chemical-rider", + "id": "spoken-bangkok", "metadata": {}, "outputs": [], "source": [ @@ -337,7 +339,7 @@ }, { "cell_type": "markdown", - "id": "excessive-spell", + "id": "moved-inflation", "metadata": {}, "source": [ "### After changing language" @@ -346,7 +348,7 @@ { "cell_type": "code", "execution_count": null, - "id": "christian-legislature", + "id": "cellular-bruce", "metadata": {}, "outputs": [], "source": [ @@ -359,7 +361,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ready-delay", + "id": "genetic-stuff", "metadata": {}, "outputs": [], "source": [ @@ -374,7 +376,7 @@ { "cell_type": "code", "execution_count": null, - "id": "blind-router", + "id": "attempted-confidentiality", "metadata": {}, "outputs": [], "source": [ @@ -387,7 +389,7 @@ { "cell_type": "code", "execution_count": null, - "id": "suited-institution", + "id": "north-bobby", "metadata": { "scrolled": true }, @@ -401,7 +403,7 @@ { "cell_type": "code", "execution_count": null, - "id": "incomplete-confusion", + "id": "supported-judge", "metadata": {}, "outputs": [], "source": [ @@ -413,7 +415,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ahead-hawaiian", + "id": "corresponding-officer", "metadata": {}, "outputs": [], "source": [ @@ -428,7 +430,7 @@ { "cell_type": "code", "execution_count": null, - "id": "collectible-australia", + "id": "accepting-waterproof", "metadata": {}, "outputs": [], "source": [ @@ -443,7 +445,7 @@ { "cell_type": "code", "execution_count": null, - "id": "changed-pollution", + 
"id": "german-sugar", "metadata": {}, "outputs": [], "source": [ @@ -453,7 +455,7 @@ { "cell_type": "code", "execution_count": null, - "id": "judicial-printer", + "id": "impossible-rescue", "metadata": {}, "outputs": [], "source": [ @@ -463,7 +465,7 @@ { "cell_type": "code", "execution_count": null, - "id": "female-confidentiality", + "id": "comfortable-playback", "metadata": {}, "outputs": [], "source": [ @@ -472,7 +474,7 @@ }, { "cell_type": "markdown", - "id": "foster-reading", + "id": "israeli-alexandria", "metadata": {}, "source": [ "### After converting purposes and mode" @@ -481,7 +483,7 @@ { "cell_type": "code", "execution_count": null, - "id": "opposite-aurora", + "id": "removed-gallery", "metadata": {}, "outputs": [], "source": [ @@ -492,7 +494,7 @@ { "cell_type": "code", "execution_count": null, - "id": "western-commissioner", + "id": "temporal-campaign", "metadata": {}, "outputs": [], "source": [ @@ -502,7 +504,7 @@ { "cell_type": "code", "execution_count": null, - "id": "vocal-saudi", + "id": "joint-november", "metadata": {}, "outputs": [], "source": [ @@ -522,7 +524,7 @@ { "cell_type": "code", "execution_count": null, - "id": "recreational-label", + "id": "specialized-airport", "metadata": { "scrolled": true }, @@ -536,7 +538,7 @@ { "cell_type": "code", "execution_count": null, - "id": "adequate-notice", + "id": "meaning-planet", "metadata": {}, "outputs": [], "source": [ @@ -548,7 +550,7 @@ { "cell_type": "code", "execution_count": null, - "id": "stuck-survivor", + "id": "patient-monitoring", "metadata": {}, "outputs": [], "source": [ @@ -563,7 +565,7 @@ { "cell_type": "code", "execution_count": null, - "id": "floppy-flight", + "id": "computational-lightning", "metadata": {}, "outputs": [], "source": [ @@ -578,7 +580,7 @@ { "cell_type": "code", "execution_count": null, - "id": "broken-demand", + "id": "boring-cream", "metadata": {}, "outputs": [], "source": [ @@ -588,7 +590,7 @@ { "cell_type": "code", "execution_count": null, - "id": "nasty-potato", 
+ "id": "animated-presentation", "metadata": {}, "outputs": [], "source": [ @@ -598,7 +600,7 @@ { "cell_type": "code", "execution_count": null, - "id": "regular-glance", + "id": "collaborative-essence", "metadata": {}, "outputs": [], "source": [ diff --git a/tour_model_eval/v-measurel_bins_all_user.ipynb b/tour_model_eval/v-measurel_bins_all_user.ipynb index c74fe3c..a5fdab5 100644 --- a/tour_model_eval/v-measurel_bins_all_user.ipynb +++ b/tour_model_eval/v-measurel_bins_all_user.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "spoken-acrobat", + "id": "superior-snowboard", "metadata": { "scrolled": true }, @@ -28,7 +28,7 @@ { "cell_type": "code", "execution_count": null, - "id": "solid-decimal", + "id": "brave-finnish", "metadata": {}, "outputs": [], "source": [ @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "acute-departure", + "id": "maritime-arcade", "metadata": {}, "outputs": [], "source": [ @@ -49,7 +49,17 @@ { "cell_type": "code", "execution_count": null, - "id": "practical-flour", + "id": "final-stability", + "metadata": {}, + "outputs": [], + "source": [ + "user_ls = evaluation.get_user_ls(all_users)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "colored-arctic", "metadata": {}, "outputs": [], "source": [ @@ -59,7 +69,7 @@ }, { "cell_type": "markdown", - "id": "indirect-retrieval", + "id": "complete-excess", "metadata": {}, "source": [ "## Bins above cutoff" @@ -67,7 +77,7 @@ }, { "cell_type": "markdown", - "id": "deadly-microwave", + "id": "higher-charge", "metadata": {}, "source": [ "### Original user input" @@ -76,7 +86,7 @@ { "cell_type": "code", "execution_count": null, - "id": "strange-dining", + "id": "joined-mauritius", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +96,7 @@ { "cell_type": "code", "execution_count": null, - "id": "black-mobility", + "id": "cooked-louis", "metadata": {}, "outputs": [], "source": [ @@ -95,7 +105,7 @@ }, { "cell_type": "markdown", 
- "id": "composite-possibility", + "id": "personalized-stable", "metadata": {}, "source": [ "### After changing language" @@ -104,7 +114,7 @@ { "cell_type": "code", "execution_count": null, - "id": "wanted-mustang", + "id": "august-breed", "metadata": { "scrolled": true }, @@ -116,7 +126,7 @@ { "cell_type": "code", "execution_count": null, - "id": "productive-palestine", + "id": "frequent-niagara", "metadata": {}, "outputs": [], "source": [ @@ -125,7 +135,7 @@ }, { "cell_type": "markdown", - "id": "heard-florist", + "id": "connected-haven", "metadata": {}, "source": [ "### After converting purposes and replaced mode" @@ -134,7 +144,7 @@ { "cell_type": "code", "execution_count": null, - "id": "smaller-creek", + "id": "animated-tumor", "metadata": {}, "outputs": [], "source": [ @@ -144,7 +154,7 @@ { "cell_type": "code", "execution_count": null, - "id": "square-importance", + "id": "primary-tampa", "metadata": {}, "outputs": [], "source": [ @@ -153,7 +163,7 @@ }, { "cell_type": "markdown", - "id": "approximate-groove", + "id": "agricultural-syndicate", "metadata": {}, "source": [ "### DataFrame" @@ -162,7 +172,7 @@ { "cell_type": "code", "execution_count": null, - "id": "studied-saint", + "id": "sorted-honey", "metadata": {}, "outputs": [], "source": [ @@ -176,7 +186,7 @@ }, { "cell_type": "markdown", - "id": "wrapped-rebate", + "id": "conservative-pregnancy", "metadata": {}, "source": [ "#### homogeneity_score" @@ -185,20 +195,19 @@ { "cell_type": "code", "execution_count": null, - "id": "suburban-insertion", + "id": "western-safety", "metadata": {}, "outputs": [], "source": [ "homo_df = pd.DataFrame(data={'original user input':homo_score_ori,'after translation':homo_score_sp2en,\n", " 'after converting purposes and replaced mode':homo_score_cvt}, \n", - " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", - " 'user9','user10','user11','user12','user13'])" + " index=user_ls)" ] }, { "cell_type": "code", "execution_count": null, - "id": 
"everyday-conditioning", + "id": "spectacular-haven", "metadata": {}, "outputs": [], "source": [ @@ -207,7 +216,7 @@ }, { "cell_type": "markdown", - "id": "engaged-injury", + "id": "incorporated-google", "metadata": {}, "source": [ "#### completeness_score" @@ -216,20 +225,19 @@ { "cell_type": "code", "execution_count": null, - "id": "mighty-consolidation", + "id": "adapted-insight", "metadata": {}, "outputs": [], "source": [ "comp_df = pd.DataFrame(data={'original user input':comp_score_ori,'after translation':comp_score_sp2en,\n", " 'after converting purposes and replaced mode':comp_score_cvt},\n", - " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", - " 'user9','user10','user11','user12','user13'])" + " index=user_ls)" ] }, { "cell_type": "code", "execution_count": null, - "id": "christian-revelation", + "id": "chemical-california", "metadata": {}, "outputs": [], "source": [ @@ -238,7 +246,7 @@ }, { "cell_type": "markdown", - "id": "lovely-particle", + "id": "living-archives", "metadata": {}, "source": [ "#### v_measure_score" @@ -247,20 +255,19 @@ { "cell_type": "code", "execution_count": null, - "id": "documented-perfume", + "id": "indonesian-magnet", "metadata": {}, "outputs": [], "source": [ "v_df = pd.DataFrame(data={'original user input':v_score_ori,'after translation':v_score_sp2en,\n", " 'after converting purposes and replaced mode':v_score_cvt},\n", - " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", - " 'user9','user10','user11','user12','user13'])" + " index=user_ls)" ] }, { "cell_type": "code", "execution_count": null, - "id": "published-kruger", + "id": "demonstrated-resident", "metadata": {}, "outputs": [], "source": [ @@ -269,7 +276,7 @@ }, { "cell_type": "markdown", - "id": "large-cement", + "id": "lined-manner", "metadata": {}, "source": [ "## All bins" @@ -277,7 +284,7 @@ }, { "cell_type": "markdown", - "id": "wrapped-channels", + "id": "insured-kingdom", "metadata": {}, "source": [ "### 
Original user input" @@ -286,7 +293,7 @@ { "cell_type": "code", "execution_count": null, - "id": "assigned-pearl", + "id": "latest-finding", "metadata": {}, "outputs": [], "source": [ @@ -296,7 +303,7 @@ { "cell_type": "code", "execution_count": null, - "id": "rotary-macro", + "id": "extended-processing", "metadata": {}, "outputs": [], "source": [ @@ -305,7 +312,7 @@ }, { "cell_type": "markdown", - "id": "communist-harvard", + "id": "european-philadelphia", "metadata": {}, "source": [ "### After changing language" @@ -314,7 +321,7 @@ { "cell_type": "code", "execution_count": null, - "id": "median-speed", + "id": "developing-password", "metadata": {}, "outputs": [], "source": [ @@ -324,7 +331,7 @@ { "cell_type": "code", "execution_count": null, - "id": "present-installation", + "id": "collective-calculator", "metadata": {}, "outputs": [], "source": [ @@ -333,7 +340,7 @@ }, { "cell_type": "markdown", - "id": "armed-former", + "id": "capable-bathroom", "metadata": {}, "source": [ "### After converting purposes and replaced mode" @@ -342,7 +349,7 @@ { "cell_type": "code", "execution_count": null, - "id": "august-maintenance", + "id": "obvious-faculty", "metadata": {}, "outputs": [], "source": [ @@ -352,7 +359,7 @@ { "cell_type": "code", "execution_count": null, - "id": "shaped-strip", + "id": "transparent-theology", "metadata": {}, "outputs": [], "source": [ @@ -361,7 +368,7 @@ }, { "cell_type": "markdown", - "id": "gentle-plenty", + "id": "charged-greeting", "metadata": {}, "source": [ "### DataFrame" @@ -370,7 +377,7 @@ { "cell_type": "code", "execution_count": null, - "id": "architectural-perspective", + "id": "extraordinary-friday", "metadata": {}, "outputs": [], "source": [ @@ -384,7 +391,7 @@ }, { "cell_type": "markdown", - "id": "periodic-password", + "id": "trying-flesh", "metadata": {}, "source": [ "#### homogeneity_score" @@ -393,20 +400,19 @@ { "cell_type": "code", "execution_count": null, - "id": "theoretical-oliver", + "id": "distinguished-morris", 
"metadata": {}, "outputs": [], "source": [ "ab_homo_df=pd.DataFrame(data={'original user input':ab_homo_score_ori,'after translation':ab_homo_score_sp2en,\n", " 'after converting purposes and replaced mode':ab_homo_score_cvt},\n", - " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", - " 'user9','user10','user11','user12','user13'])" + " index=user_ls)" ] }, { "cell_type": "code", "execution_count": null, - "id": "seasonal-coalition", + "id": "blocked-judge", "metadata": {}, "outputs": [], "source": [ @@ -415,7 +421,7 @@ }, { "cell_type": "markdown", - "id": "outdoor-pavilion", + "id": "english-quick", "metadata": {}, "source": [ "#### completeness_score" @@ -424,20 +430,19 @@ { "cell_type": "code", "execution_count": null, - "id": "activated-schedule", + "id": "speaking-watson", "metadata": {}, "outputs": [], "source": [ "ab_comp_df = pd.DataFrame(data={'original user input':ab_comp_score_ori,'after translation':ab_comp_score_sp2en,\n", " 'after converting purposes and replaced mode':ab_comp_score_cvt},\n", - " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", - " 'user9','user10','user11','user12','user13'])" + " index=user_ls)" ] }, { "cell_type": "code", "execution_count": null, - "id": "graphic-revision", + "id": "equivalent-fleece", "metadata": {}, "outputs": [], "source": [ @@ -446,7 +451,7 @@ }, { "cell_type": "markdown", - "id": "golden-philadelphia", + "id": "white-reasoning", "metadata": {}, "source": [ "#### v_measure_score" @@ -455,20 +460,19 @@ { "cell_type": "code", "execution_count": null, - "id": "relevant-nebraska", + "id": "sacred-celebrity", "metadata": {}, "outputs": [], "source": [ "ab_v_df = pd.DataFrame(data={'original user input':ab_v_score_ori,'after translation':ab_v_score_sp2en,\n", " 'after converting purposes and replaced mode':ab_v_score_cvt},\n", - " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", - " 'user9','user10','user11','user12','user13'])" + " 
index=user_ls)" ] }, { "cell_type": "code", "execution_count": null, - "id": "assisted-franchise", + "id": "figured-stack", "metadata": {}, "outputs": [], "source": [ diff --git a/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb b/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb index f44bfdc..a35b9a6 100644 --- a/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb +++ b/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "secondary-armor", + "id": "warming-ghana", "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ { "cell_type": "code", "execution_count": null, - "id": "smooth-reviewer", + "id": "hindu-homeless", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "accessible-jaguar", + "id": "serious-guide", "metadata": {}, "outputs": [], "source": [ @@ -47,7 +47,17 @@ { "cell_type": "code", "execution_count": null, - "id": "endangered-device", + "id": "split-burden", + "metadata": {}, + "outputs": [], + "source": [ + "user_ls = evaluation.get_user_ls(all_users)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abstract-complaint", "metadata": {}, "outputs": [], "source": [ @@ -57,7 +67,7 @@ }, { "cell_type": "markdown", - "id": "prescription-disease", + "id": "colored-surprise", "metadata": {}, "source": [ "## Evaluate clusters above cutoff based on silhouette_score" @@ -65,7 +75,7 @@ }, { "cell_type": "markdown", - "id": "fatal-castle", + "id": "received-isolation", "metadata": {}, "source": [ "### Original user input" @@ -74,7 +84,7 @@ { "cell_type": "code", "execution_count": null, - "id": "federal-convertible", + "id": "original-parade", "metadata": {}, "outputs": [], "source": [ @@ -84,7 +94,7 @@ { "cell_type": "code", "execution_count": null, - "id": "polyphonic-astronomy", + "id": "professional-evans", "metadata": {}, "outputs": 
[], "source": [ @@ -93,7 +103,7 @@ }, { "cell_type": "markdown", - "id": "later-share", + "id": "latin-boulder", "metadata": {}, "source": [ "### After changing language" @@ -102,7 +112,7 @@ { "cell_type": "code", "execution_count": null, - "id": "lesbian-realtor", + "id": "prescription-supplement", "metadata": {}, "outputs": [], "source": [ @@ -112,7 +122,7 @@ { "cell_type": "code", "execution_count": null, - "id": "defensive-numbers", + "id": "lesbian-testing", "metadata": {}, "outputs": [], "source": [ @@ -121,7 +131,7 @@ }, { "cell_type": "markdown", - "id": "amended-girlfriend", + "id": "opened-norway", "metadata": {}, "source": [ "### After converting purposes and replaced mode" @@ -130,7 +140,7 @@ { "cell_type": "code", "execution_count": null, - "id": "synthetic-debate", + "id": "wicked-timeline", "metadata": {}, "outputs": [], "source": [ @@ -140,7 +150,7 @@ { "cell_type": "code", "execution_count": null, - "id": "finite-ireland", + "id": "severe-plant", "metadata": {}, "outputs": [], "source": [ @@ -149,7 +159,7 @@ }, { "cell_type": "markdown", - "id": "inclusive-champagne", + "id": "matched-custody", "metadata": {}, "source": [ "### DataFrame" @@ -158,7 +168,7 @@ { "cell_type": "code", "execution_count": null, - "id": "protective-appraisal", + "id": "bizarre-retailer", "metadata": {}, "outputs": [], "source": [ @@ -172,7 +182,7 @@ }, { "cell_type": "markdown", - "id": "intended-campaign", + "id": "sacred-applicant", "metadata": {}, "source": [ "#### homogeneity_score" @@ -181,20 +191,19 @@ { "cell_type": "code", "execution_count": null, - "id": "robust-matthew", + "id": "alpha-differential", "metadata": {}, "outputs": [], "source": [ "homo_df = pd.DataFrame(data={'original user input':homo_score_ori,'after translation':homo_score_sp2en,\n", " 'after converting purposes and replaced mode':homo_score_cvt}, \n", - " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", - " 'user9','user10','user11','user12','user13'])" + " 
index=user_ls)" ] }, { "cell_type": "code", "execution_count": null, - "id": "surrounded-karma", + "id": "mental-abraham", "metadata": {}, "outputs": [], "source": [ @@ -203,7 +212,7 @@ }, { "cell_type": "markdown", - "id": "furnished-valve", + "id": "premium-force", "metadata": {}, "source": [ "#### completeness_score" @@ -212,20 +221,19 @@ { "cell_type": "code", "execution_count": null, - "id": "regulation-storage", + "id": "former-store", "metadata": {}, "outputs": [], "source": [ "comp_df = pd.DataFrame(data={'original user input':comp_score_ori,'after translation':comp_score_sp2en,\n", " 'after converting purposes and replaced mode':comp_score_cvt},\n", - " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", - " 'user9','user10','user11','user12','user13'])" + " index=user_ls)" ] }, { "cell_type": "code", "execution_count": null, - "id": "liked-palestine", + "id": "brave-kentucky", "metadata": { "scrolled": true }, @@ -236,7 +244,7 @@ }, { "cell_type": "markdown", - "id": "gorgeous-denver", + "id": "affecting-daughter", "metadata": {}, "source": [ "#### v_measure_score" @@ -245,20 +253,19 @@ { "cell_type": "code", "execution_count": null, - "id": "quiet-capital", + "id": "cheap-psychiatry", "metadata": {}, "outputs": [], "source": [ "v_df = pd.DataFrame(data={'original user input':v_score_ori,'after translation':v_score_sp2en,\n", " 'after converting purposes and replaced mode':v_score_cvt},\n", - " index=['user1','user2','user3','user4','user5','user6','user7','user8',\n", - " 'user9','user10','user11','user12','user13'])" + " index=user_ls)" ] }, { "cell_type": "code", "execution_count": null, - "id": "behavioral-embassy", + "id": "violent-consideration", "metadata": {}, "outputs": [], "source": [ diff --git a/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb b/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb index 271ef9f..a88733a 100644 --- a/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb +++ 
b/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb @@ -113,8 +113,10 @@ "outputs": [], "source": [ "# filter out trips that are not fully labeled(contain NaN in user_input)\n", - "valid_trips = [t for t in non_empty_trips if 'mode_confirm' in t[\"data\"][\"user_input\"] and \n", - " 'purpose_confirm'in t[\"data\"][\"user_input\"] and 'replaced_mode' in t[\"data\"][\"user_input\"]]\n", + "non_empty_trips_df = pd.DataFrame(t[\"data\"][\"user_input\"]for t in non_empty_trips)\n", + "valid_trips_df = non_empty_trips_df.dropna(axis=0,how='any',thresh=None,subset=None,inplace=False)\n", + "valid_trips_idx_ls = valid_trips_df.index.tolist()\n", + "valid_trips = [non_empty_trips[i]for i in valid_trips_idx_ls]\n", "len(valid_trips),valid_trips" ] }, diff --git a/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb b/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb index c1c1ea9..e16031c 100644 --- a/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb +++ b/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb @@ -70,7 +70,7 @@ "metadata": {}, "outputs": [], "source": [ - "user = all_users[6]" + "user = all_users[0]" ] }, { @@ -104,9 +104,11 @@ "outputs": [], "source": [ "# filter out trips that are not fully labeled(contain NaN in user_input)\n", - "valid_trips = [t for t in non_empty_trips if 'mode_confirm' in t[\"data\"][\"user_input\"] and \n", - " 'purpose_confirm'in t[\"data\"][\"user_input\"] and 'replaced_mode' in t[\"data\"][\"user_input\"]]\n", - "len(valid_trips)" + "non_empty_trips_df = pd.DataFrame(t[\"data\"][\"user_input\"]for t in non_empty_trips)\n", + "valid_trips_df = non_empty_trips_df.dropna(axis=0,how='any',thresh=None,subset=None,inplace=False)\n", + "valid_trips_idx_ls = valid_trips_df.index.tolist()\n", + "valid_trips = [non_empty_trips[i]for i in valid_trips_idx_ls]\n", + "len(valid_trips),valid_trips" ] }, { From 1be3341b32752acaa35124fbde7afb322545a05c Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: 
Sat, 13 Mar 2021 14:18:39 -0800 Subject: [PATCH 11/16] change'query times' to 'user input request times', add scatter(v-score,user input request proportion median) --- ... user_input_request_times_all_users.ipynb} | 311 ++++++++++++------ 1 file changed, 209 insertions(+), 102 deletions(-) rename tour_model_eval/{query_times_all_users.ipynb => user_input_request_times_all_users.ipynb} (55%) diff --git a/tour_model_eval/query_times_all_users.ipynb b/tour_model_eval/user_input_request_times_all_users.ipynb similarity index 55% rename from tour_model_eval/query_times_all_users.ipynb rename to tour_model_eval/user_input_request_times_all_users.ipynb index a15e626..8d4528d 100644 --- a/tour_model_eval/query_times_all_users.ipynb +++ b/tour_model_eval/user_input_request_times_all_users.ipynb @@ -26,7 +26,7 @@ "import matplotlib.pyplot as plt\n", "from matplotlib.ticker import MaxNLocator\n", "import numpy as np\n", - "# import matplotlib.ticker as ticker" + "from matplotlib import cm" ] }, { @@ -57,7 +57,7 @@ "metadata": {}, "outputs": [], "source": [ - "query_day = []" + "req_day = []" ] }, { @@ -67,7 +67,17 @@ "metadata": {}, "outputs": [], "source": [ - "query_month = []" + "req_month = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "incident-hampton", + "metadata": {}, + "outputs": [], + "source": [ + "req_propor_median = []" ] }, { @@ -103,14 +113,50 @@ { "cell_type": "code", "execution_count": null, - "id": "formed-drive", + "id": "frozen-stanford", "metadata": {}, "outputs": [], "source": [ - "# build a base dataframe for query times in January \n", + "# bin trips according to ['start_local_dt']\n", + "def bin_date(trip_ls,day=None,month=None):\n", + " bin_date = []\n", + " for trip_index in trip_ls:\n", + " added = False\n", + " trip = filter_trips[trip_index]\n", + "\n", + " for bin in bin_date:\n", + " if day:\n", + " if match_day(trip,bin):\n", + " bin.append(trip_index)\n", + " added = True\n", + " break\n", + " if month:\n", + " 
if match_month(trip,bin):\n", + " bin.append(trip_index)\n", + " added = True\n", + " break\n", + "\n", + " if not added:\n", + " bin_date.append([trip_index])\n", + "\n", + " return bin_date " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "formed-drive", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# build a base dataframe for request times in January \n", "date_df = pd.DataFrame(data = {'date':np.arange(1,32),'drop_col':np.arange(1,32)})\n", "date_df.set_index(['date'], inplace=True)\n", "\n", + "# get valid user list\n", + "valid_users = []\n", "\n", "for a in range(len(all_users)):\n", " user = all_users[a]\n", @@ -119,36 +165,39 @@ "\n", " # filter out users that don't have enough valid labeled trips\n", " if not evaluation.valid_user(filter_trips,trips):\n", - " query_day.append(NaN)\n", - " query_month.append(NaN) \n", + " req_day.append(NaN)\n", + " req_month.append(NaN) \n", " continue\n", - " \n", + " val_user = 'user' + str(a + 1)\n", + " valid_users.append(val_user)\n", " sim.bin_data()\n", " sim.delete_bins()\n", " bins = sim.bins\n", " \n", - " # collect query trips and common trips(no need to query) indices above cutoff\n", + " # collect requested trips and common trips(no need to request) indices above cutoff\n", " ab_trip_ls = []\n", - " no_query_trip_ls = []\n", + " no_req_trip_ls = []\n", " for bin in bins:\n", " early_trip = filter_trips[bin[0]]\n", - " trip_index = 0\n", + " index = 0\n", " for i in range(1,len(bin)):\n", " compare_trip = filter_trips[bin[i]]\n", " if early_trip['data']['start_local_dt']['year']>compare_trip['data']['start_local_dt']['year']:\n", " early_trip = compare_trip\n", - " trip_index = i\n", + " index = i\n", " elif early_trip['data']['start_local_dt']['year']==compare_trip['data']['start_local_dt']['year'] and early_trip['data']['start_local_dt']['month']>compare_trip['data']['start_local_dt']['month']:\n", " early_trip = compare_trip\n", - " trip_index = i\n", + 
" index = i\n", " elif early_trip['data']['start_local_dt']['year']==compare_trip['data']['start_local_dt']['year'] and early_trip['data']['start_local_dt']['month']==compare_trip['data']['start_local_dt']['month'] and early_trip['data']['start_local_dt']['day']>compare_trip['data']['start_local_dt']['day']:\n", " early_trip = compare_trip\n", - " trip_index = i\n", - " ab_trip_ls.append(bin[trip_index])\n", + " index = i\n", + " early_trip_index = bin[index]\n", + " ab_trip_ls.append(early_trip_index)\n", " \n", " for k in range(len(bin)):\n", - " if k != trip_index:\n", - " no_query_trip_ls.append(bin[k])\n", + " if k != index:\n", + " no_req_trip_idx = bin[k]\n", + " no_req_trip_ls.append(no_req_trip_idx)\n", "\n", "\n", " \n", @@ -158,30 +207,21 @@ " # collect query trips indices below cutoff\n", " bl_trip_ls = []\n", " for bin in bl_bins:\n", - " for index in bin:\n", - " bl_trip_ls.append(index)\n", + " for trip_index in bin:\n", + " bl_trip_ls.append(trip_index)\n", " \n", - " # whole list of query trips indices\n", - " query_trips_ls=ab_trip_ls+bl_trip_ls\n", + " # whole list of requested trips indices\n", + " req_trips_ls=ab_trip_ls+bl_trip_ls\n", " \n", " \n", - " # collect query times in a day\n", - " bin_day = []\n", - " for trip_index in query_trips_ls:\n", - " added = False\n", - " trip = filter_trips[trip_index]\n", - " for bin in bin_day:\n", - " if match_day(trip,bin):\n", - " bin.append(trip_index)\n", - " added = True\n", - " if not added:\n", - " bin_day.append([trip_index])\n", - " query_day_ls = []\n", + " # collect request times in a day\n", + " bin_day = bin_date(req_trips_ls,day=True)\n", + " req_day_ls = []\n", " for bin in bin_day:\n", - " query_day_ls.append(len(bin))\n", - " \n", - " # collect 0 query days \n", - " for trip_index in no_query_trip_ls:\n", + " req_day_ls.append(len(bin))\n", + " \n", + " # collect 0 request days \n", + " for trip_index in no_req_trip_ls:\n", " trip = filter_trips[trip_index]\n", " match = False\n", " for 
bin in bin_day:\n", @@ -189,43 +229,56 @@ " match = True\n", " break\n", " if not match:\n", - " query_day_ls.append(0)\n", + " req_day_ls.append(0)\n", " \n", - "\n", - " # collect query times in a day for every user\n", - " query_day.append(query_day_ls)\n", + " # collect request times in a day for every user\n", + " req_day.append(req_day_ls)\n", + " \n", + " # collect user input request proportion in a day\n", + " filter_trips_df = pd.DataFrame(filter_trips)\n", + " filter_trips_idx_ls = filter_trips_df.index.values.tolist()\n", + " bin_filter_trips_day = bin_date(filter_trips_idx_ls,day=True)\n", + " propor_single_user = []\n", + " for valid_trips_bin in bin_filter_trips_day:\n", + " match = False\n", + " for req_trips_bin in bin_day:\n", + " req_trip = filter_trips[req_trips_bin[0]]\n", + " if match_day(req_trip,valid_trips_bin):\n", + " proportion = round(len(req_trips_bin)/len(valid_trips_bin), 2)\n", + " propor_single_user.append(proportion)\n", + " match = True\n", + " break\n", + " if not match:\n", + " propor_single_user.append(0) \n", + " \n", + " # get user input request proportion median in a day\n", + " median = np.median(propor_single_user)\n", " \n", + " # collect medians for every user\n", + " req_propor_median.append(median)\n", " \n", - " # collect query times in a month\n", - " bin_month = []\n", - " for trip_index in query_trips_ls:\n", - " added = False\n", - " trip = filter_trips[trip_index]\n", - " for bin in bin_month:\n", - " if match_month(trip,bin):\n", - " bin.append(trip_index)\n", - " added = True\n", - " if not added:\n", - " bin_month.append([trip_index])\n", - " query_month_ls = []\n", + " \n", + " # collect request times in a month\n", + " bin_month = bin_date(req_trips_ls,month=True)\n", + " req_month_ls = []\n", " for bin in bin_month:\n", - " query_month_ls.append(len(bin))\n", + " req_month_ls.append(len(bin))\n", "\n", - " # collect query times in a month for every user\n", - " query_month.append(query_month_ls)\n", + " # 
collect request times in a month for every user\n", + " req_month.append(req_month_ls)\n", " \n", " # select the trips that are in Jan 2021\n", " jan_trips = []\n", - " for trip_index in query_trips_ls:\n", + " for trip_index in req_trips_ls:\n", " if filter_trips[trip_index]['data']['start_local_dt']['year']==2021 and filter_trips[trip_index]['data']['start_local_dt']['month']==1:\n", " jan_trips.append(trip_index)\n", " \n", - " # create the data frame for query times in Jan 2021\n", - " date = []\n", + " # create the data frame for request times in Jan 2021\n", + " jan_date = []\n", " for trip_index in jan_trips:\n", " trip_date = filter_trips[trip_index]['data']['start_local_dt']['day']\n", - " date.append(trip_date)\n", - " new_date_df = pd.DataFrame(data = date)\n", + " jan_date.append(trip_date)\n", + " new_date_df = pd.DataFrame(data = jan_date)\n", " new_date_df=new_date_df.value_counts(sort = False).rename_axis('date').to_frame('user'+str(a+1)).reset_index()\n", " new_date_df.set_index(['date'], inplace=True)\n", " date_df = date_df.join(new_date_df,how='outer')\n", @@ -239,7 +292,7 @@ "metadata": {}, "outputs": [], "source": [ - "query_day" + "req_day" ] }, { @@ -249,7 +302,7 @@ "metadata": {}, "outputs": [], "source": [ - "query_month" + "req_month" ] }, { @@ -257,7 +310,7 @@ "id": "ruled-dictionary", "metadata": {}, "source": [ - "### Plot query times in a day" + "### Plot request times in a day" ] }, { @@ -267,24 +320,24 @@ "metadata": {}, "outputs": [], "source": [ - "# show query times in a data frame\n", + "# show request times in a data frame\n", "base = 0\n", - "for i in range (len(query_day)):\n", - " if query_day[i] is not NaN:\n", - " query_day_ls_df = pd.DataFrame(data = query_day[i])\n", - " query_day_df=query_day_ls_df.value_counts(sort = False).rename_axis('query times').to_frame('user'+str(i+1)).reset_index()\n", - " query_day_df.set_index(['query times'], inplace=True)\n", + "for i in range (len(req_day)):\n", + " if req_day[i] is not 
NaN:\n", + " req_day_ls_df = pd.DataFrame(data = req_day[i])\n", + " req_day_df=req_day_ls_df.value_counts(sort = False).rename_axis('request times').to_frame('user'+str(i+1)).reset_index()\n", + " req_day_df.set_index(['request times'], inplace=True)\n", " base = i\n", " break\n", " \n", - "for i in range (base+1,len(query_day)):\n", - " if query_day[i] is not NaN:\n", - " new_day_df = pd.DataFrame(data = query_day[i])\n", - " new_day_df = new_day_df.value_counts(sort = False).rename_axis('query times').to_frame('user'+str(i+1)).reset_index()\n", - " new_day_df.set_index(['query times'], inplace=True)\n", - " query_day_df=query_day_df.join(new_day_df,how='outer',sort='query times')\n", + "for i in range (len(req_day)):\n", + " if req_day[i] is not NaN and i is not base:\n", + " new_day_df = pd.DataFrame(data = req_day[i])\n", + " new_day_df = new_day_df.value_counts(sort = False).rename_axis('request times').to_frame('user'+str(i+1)).reset_index()\n", + " new_day_df.set_index(['request times'], inplace=True)\n", + " req_day_df=req_day_df.join(new_day_df,how='outer',sort='request times')\n", " \n", - "query_day_df" + "req_day_df" ] }, { @@ -295,10 +348,10 @@ "outputs": [], "source": [ "# plot a graph for all valid users\n", - "yticks_max = query_day_df.max().max()\n", - "graph_day = query_day_df.plot(kind='bar',figsize=(14,16),title='query times in a day',fontsize=18,yticks=np.arange(0, yticks_max+4, step=4))\n", + "yticks_max = req_day_df.max().max()\n", + "graph_day = req_day_df.plot(kind='bar',figsize=(14,16),title='request times in a day',fontsize=18,yticks=np.arange(0, yticks_max+4, step=4))\n", "graph_day.title.set_size(20)\n", - "plt.xlabel('query times',fontsize=16)\n", + "plt.xlabel('request times',fontsize=16)\n", "plt.ylabel('days', fontsize=16)" ] }, @@ -312,7 +365,7 @@ "outputs": [], "source": [ "# subplots\n", - "day_ax_arr = query_day_df.plot(subplots=True,layout=(2,4),kind='bar',figsize=(16,8),fontsize=15,sharey=True)\n", + "day_ax_arr = 
req_day_df.plot(subplots=True,layout=(2,4),kind='bar',figsize=(16,8),fontsize=15,sharey=True)\n", "\n", "for ax in day_ax_arr[-1]:\n", " ax.set_xlabel('query times', fontsize=16)\n", @@ -328,7 +381,7 @@ "id": "confident-capacity", "metadata": {}, "source": [ - "### Plot query times in a month" + "### Plot request times in a month" ] }, { @@ -346,25 +399,25 @@ "metadata": {}, "outputs": [], "source": [ - "# put all query times in a month for all valid users in a data frame\n", + "# put all request times in a month for all valid users in a data frame\n", "start = 0\n", "\n", - "for i in range (len(query_month)):\n", - " if query_month[i] is not NaN:\n", - " query_month_df = pd.DataFrame({'user'+str(i+1):query_month[i]}).rename_axis('month indicies').reset_index()\n", - " query_month_df.set_index(['month indicies'], inplace=True)\n", + "for i in range (len(req_month)):\n", + " if req_month[i] is not NaN:\n", + " req_month_df = pd.DataFrame({'user'+str(i+1):req_month[i]}).rename_axis('month indicies').reset_index()\n", + " req_month_df.set_index(['month indicies'], inplace=True)\n", " start = i\n", " break\n", - "query_month_df\n", + "req_month_df\n", "\n", " \n", - "for t in range(start+1,len(query_month)):\n", - " if query_month[t] is not NaN:\n", - " new_month_df = pd.DataFrame({'user'+str(t+1):query_month[t]}).rename_axis('month indicies').reset_index()\n", + "for t in range(len(req_month)):\n", + " if req_month[t] is not NaN and t is not start:\n", + " new_month_df = pd.DataFrame({'user'+str(t+1):req_month[t]}).rename_axis('month indicies').reset_index()\n", " new_month_df.set_index(['month indicies'], inplace=True)\n", - " query_month_df = query_month_df.join(new_month_df,how='outer')\n", + " req_month_df = req_month_df.join(new_month_df,how='outer')\n", "\n", - "query_month_df" + "req_month_df" ] }, { @@ -375,11 +428,11 @@ "outputs": [], "source": [ "# plot a graph for all valid users\n", - "graph_month = 
query_month_df.plot(kind='bar',figsize=(12,8),title='query times in a month',fontsize=18)\n", + "graph_month = req_month_df.plot(kind='bar',figsize=(12,8),title='request times in a month',fontsize=18)\n", "graph_month.title.set_size(20)\n", "plt.xlabel('month indicies',fontsize=16)\n", - "plt.ylabel('query times', fontsize=16)\n", - "graph_day.yaxis.set_major_locator(MaxNLocator(integer=True))" + "plt.ylabel('request times', fontsize=16)\n", + "graph_month.yaxis.set_major_locator(MaxNLocator(integer=True))" ] }, { @@ -387,7 +440,7 @@ "id": "satisfied-essay", "metadata": {}, "source": [ - "### Get query times for all valid users in January" + "### Get request times for all valid users in January" ] }, { @@ -408,10 +461,10 @@ "outputs": [], "source": [ "# plot data graph\n", - "graph_date = date_df.plot(kind='bar',figsize=(20,10),title='query times in January',fontsize=18)\n", + "graph_date = date_df.plot(kind='bar',figsize=(20,10),title='request times in January',fontsize=18)\n", "graph_date.title.set_size(20)\n", "plt.xlabel('date',fontsize=16)\n", - "plt.ylabel('query times', fontsize=16)\n", + "plt.ylabel('request times', fontsize=16)\n", "graph_date.yaxis.set_major_locator(MaxNLocator(integer=True))" ] }, @@ -419,10 +472,12 @@ "cell_type": "code", "execution_count": null, "id": "alpha-vermont", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "# subplots on January query times\n", + "# subplots on January request times\n", "date_ax_arr = date_df.plot(subplots=True,layout=(2,4),kind='bar',figsize=(16,8),fontsize=15,sharey =True)\n", "\n", "for ax in date_ax_arr[-1]:\n", @@ -430,11 +485,63 @@ " \n", "for ax_arr in date_ax_arr:\n", " ax_arr[0].yaxis.set_major_locator(MaxNLocator(integer=True))\n", - " ax_arr[0].set_ylabel(\"query times\", fontsize=16)\n", + " ax_arr[0].set_ylabel(\"request times\", fontsize=16)\n", " \n", "date_ax_arr[-1][0].set_xticks(list(range(0,len(date_df.index),5)))\n", 
"date_ax_arr[-1][0].set_xticklabels(list(range(date_df.index[0],date_df.index[-1]+1,5)))" ] + }, + { + "cell_type": "markdown", + "id": "applied-residence", + "metadata": {}, + "source": [ + "## Scatter(v-score on valid trips above cutoff, user input request proportion median)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "olympic-guarantee", + "metadata": {}, + "outputs": [], + "source": [ + "# Here we use v-score on bins above cutoff after changing language and converting purposes and modes\n", + "homo_score_cvt, comp_score_cvt, v_score_cvt = evaluation.v_measure_bins(all_users,radius,cvt_pur_mo=True,cutoff=True)\n", + "v_score_df = pd.DataFrame(data = {'v score':v_score_cvt}).dropna().reset_index(drop=True)\n", + "valid_v_score = v_score_df['v score'].values.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "current-crime", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "x=req_propor_median\n", + "y=valid_v_score\n", + "v=valid_users\n", + "cmp = cm.get_cmap('Dark2', len(valid_users))\n", + "\n", + "for i in range(len(valid_users)):\n", + " plt.scatter(x[i], y[i], color=cmp.colors[i], label=v[i], s=70, alpha=0.7)\n", + "plt.legend(markerscale=0.7, scatterpoints=1)\n", + "plt.xlabel('user input request proportion median',fontsize=16)\n", + "plt.ylabel('v measure score',fontsize=16)\n", + "plt.xticks(fontsize=14)\n", + "plt.yticks(fontsize=14)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "velvet-sweden", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 302c6811d2a454d6a8f931c97336d2158ac44682 Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Sat, 27 Mar 2021 17:38:52 -0700 Subject: [PATCH 12/16] modified evaluation code, changed to use homogeneity score for evaluation --- .../confirmed_trips_eval_bins_clusters.py | 116 +++++++++-------- .../user_input_request_times_all_users.ipynb | 32 ++--- .../v-measurel_bins_all_user.ipynb | 
121 +++++------------- ...urel_clusters_above_cutoff_all_users.ipynb | 61 +++------ 4 files changed, 126 insertions(+), 204 deletions(-) diff --git a/tour_model_eval/confirmed_trips_eval_bins_clusters.py b/tour_model_eval/confirmed_trips_eval_bins_clusters.py index a538910..9131e61 100644 --- a/tour_model_eval/confirmed_trips_eval_bins_clusters.py +++ b/tour_model_eval/confirmed_trips_eval_bins_clusters.py @@ -22,12 +22,20 @@ map_pur_dict = {'course':'school','work_- lunch break':'lunch_break','on_the way home':'home', 'insurance_payment':'insurance'} -def get_user_ls(all_users): +def get_user_ls(all_users,radius): user_ls = [] + valid_user_ls = [] for i in range(len(all_users)): curr_user = 'user' + str(i + 1) - user_ls.append(curr_user) - return user_ls + user = all_users[i] + filter_trips,sim,trips = filter_data(user,radius) + if valid_user(filter_trips,trips): + valid_user_ls.append(curr_user) + user_ls.append(curr_user) + else: + user_ls.append(curr_user) + continue + return user_ls,valid_user_ls def filter_data(user,radius): @@ -48,6 +56,44 @@ def valid_user(filter_trips,trips): valid = True return valid +def map_labels(user_input_df,span_eng_dict,map_pur_dict,sp2en,cvt_pur_mo): + if sp2en: + # change language + user_input_df = user_input_df.replace(span_eng_dict) + elif cvt_pur_mo: + # change language first + user_input_df = user_input_df.replace(span_eng_dict) + # convert purpose + user_input_df = user_input_df.replace(map_pur_dict) + # convert mode + for a in range(len(user_input_df)): + if user_input_df.iloc[a]["replaced_mode"] == "same_mode": + # to see which row will be converted + logging.debug("The following rows will be changed: %s", user_input_df.iloc[a]) + user_input_df.iloc[a]["replaced_mode"] = user_input_df.iloc[a]['mode_confirm'] + return user_input_df + + +def valid_user_check(filter_trips,trips,homo_score,comp_score,v_score): + if not valid_user(filter_trips, trips): + homo_score.append(NaN) + comp_score.append(NaN) + v_score.append(NaN) + 
skip = True + else: + skip = False + return homo_score,comp_score,v_score,skip + + +def compute_score(labels_true,labels_pred,homo_score,comp_score,v_score): + homo = metrics.homogeneity_score(labels_true, labels_pred) + homo_score.append(float('%.3f' % homo)) + comp = metrics.completeness_score(labels_true, labels_pred) + comp_score.append(float('%.3f' % comp)) + v = metrics.v_measure_score(labels_true, labels_pred) + v_score.append(float('%.3f' % v)) + return homo_score,comp_score,v_score + # v_measure_bins takes 5 parameters # - sp2en=True: change Spanish to English @@ -63,13 +109,10 @@ def v_measure_bins(all_users,radius,sp2en=None,cvt_pur_mo=None,cutoff=None): user = all_users[i] filter_trips,sim,trips = filter_data(user,radius) - # filter out users that haven't enough trips (at least 10 valid trips - # and 50% of total trips are valid) to analyze - if not valid_user(filter_trips,trips): - homo_score.append(NaN) - comp_score.append(NaN) - v_score.append(NaN) + homo_score,comp_score,v_score,skip = valid_user_check(filter_trips,trips,homo_score,comp_score,v_score) + if skip: continue + sim.bin_data() if cutoff is None: trip_index_ls = [] @@ -84,23 +127,13 @@ def v_measure_bins(all_users,radius,sp2en=None,cvt_pur_mo=None,cutoff=None): bin_trips = sim.newdata bins = sim.bins - bin_trips_df = pd.DataFrame(data=[trip["data"]["user_input"] for trip in bin_trips]) - - if sp2en: - bin_trips_df = bin_trips_df.replace(span_eng_dict) - elif cvt_pur_mo: - bin_trips_df = bin_trips_df.replace(span_eng_dict) - bin_trips_df = bin_trips_df.replace(map_pur_dict) - for a in range(len(bin_trips_df)): - if bin_trips_df.iloc[a]["replaced_mode"] == "same_mode": - # to see which row will be converted - logging.debug("The following rows will be changed: %s", bin_trips_df.iloc[a]) - bin_trips_df.iloc[a]["replaced_mode"] = bin_trips_df.iloc[a]['mode_confirm'] + bin_trips_user_input_df = pd.DataFrame(data=[trip["data"]["user_input"] for trip in bin_trips]) + bin_trips_user_input_df = 
map_labels(bin_trips_user_input_df, span_eng_dict, map_pur_dict, sp2en, cvt_pur_mo) # turn all user_input into list without binning - bin_trips_user_input_ls = bin_trips_df.values.tolist() + bin_trips_user_input_ls = bin_trips_user_input_df.values.tolist() # drop duplicate user_input - no_dup_df = bin_trips_df.drop_duplicates() + no_dup_df = bin_trips_user_input_df.drop_duplicates() # turn non-duplicate user_input into list no_dup_list = no_dup_df.values.tolist() @@ -125,12 +158,7 @@ def v_measure_bins(all_users,radius,sp2en=None,cvt_pur_mo=None,cutoff=None): bins_ts = pd.DataFrame(data=[filter_trips[i]["data"]["start_ts"] for i in bin_ls]) # compare two data frames, the program will continue to score calculation if two data frames are the same assert_frame_equal(bins_ts, bin_trips_ts) - homo = metrics.homogeneity_score(labels_true, labels_pred) - homo_score.append(float('%.3f' % homo)) - comp = metrics.completeness_score(labels_true, labels_pred) - comp_score.append(float('%.3f' % comp)) - v = metrics.v_measure_score(labels_true, labels_pred) - v_score.append(float('%.3f' % v)) + homo_score, comp_score, v_score = compute_score(labels_true, labels_pred, homo_score, comp_score, v_score) return homo_score, comp_score, v_score @@ -148,12 +176,10 @@ def v_measure_clusters(all_users,radius,sp2en=None,cvt_pur_mo=None): user = all_users[i] filter_trips,sim,trips = filter_data(user,radius) - # filter out users that haven't enough trips to analyze - if not valid_user(filter_trips,trips): - homo_score.append(NaN) - comp_score.append(NaN) - v_score.append(NaN) + homo_score,comp_score,v_score,skip = valid_user_check(filter_trips,trips,homo_score,comp_score,v_score) + if skip: continue + sim.bin_data() sim.delete_bins() bin_trips = sim.newdata @@ -166,19 +192,7 @@ def v_measure_clusters(all_users,radius,sp2en=None,cvt_pur_mo=None): feat.cluster(min_clusters=min, max_clusters=max) cluster_trips = feat.data cluster_user_input_df = pd.DataFrame(data=[i["data"]["user_input"] for i 
in cluster_trips]) - if sp2en: - # change language - cluster_user_input_df = cluster_user_input_df.replace(span_eng_dict) - cluster_user_input_ls = cluster_user_input_df.values.tolist() - elif cvt_pur_mo: - # change language first - cluster_user_input_df = cluster_user_input_df.replace(span_eng_dict) - # convert purpose - cluster_user_input_df = cluster_user_input_df.replace(map_pur_dict) - # convert mode - for a in range(len(cluster_user_input_df)): - if cluster_user_input_df.iloc[a]["replaced_mode"] == "same_mode": - cluster_user_input_df.iloc[a]["replaced_mode"] = cluster_user_input_df.iloc[a]['mode_confirm'] + cluster_user_input_df = map_labels(cluster_user_input_df, span_eng_dict, map_pur_dict, sp2en, cvt_pur_mo) # turn cluster_trips to list without any changes cluster_user_input_ls = cluster_user_input_df.values.tolist() # drop duplicate user_input @@ -203,13 +217,7 @@ def v_measure_clusters(all_users,radius,sp2en=None,cvt_pur_mo=None): cluster_ps_df = pd.DataFrame(data=cluster_ps) label_ps_df = pd.DataFrame(data=feat.points) assert_frame_equal(cluster_ps_df, label_ps_df) - - homo = metrics.homogeneity_score(labels_true, labels_pred) - homo_score.append(float('%.3f' % homo)) - comp = metrics.completeness_score(labels_true, labels_pred) - comp_score.append(float('%.3f' % comp)) - v = metrics.v_measure_score(labels_true, labels_pred) - v_score.append(float('%.3f' % v)) + homo_score, comp_score, v_score = compute_score(labels_true, labels_pred, homo_score, comp_score, v_score) return homo_score, comp_score, v_score diff --git a/tour_model_eval/user_input_request_times_all_users.ipynb b/tour_model_eval/user_input_request_times_all_users.ipynb index 8d4528d..f78a808 100644 --- a/tour_model_eval/user_input_request_times_all_users.ipynb +++ b/tour_model_eval/user_input_request_times_all_users.ipynb @@ -73,7 +73,7 @@ { "cell_type": "code", "execution_count": null, - "id": "incident-hampton", + "id": "arabic-growth", "metadata": {}, "outputs": [], "source": [ @@ 
-113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "frozen-stanford", + "id": "progressive-playback", "metadata": {}, "outputs": [], "source": [ @@ -156,7 +156,7 @@ "date_df.set_index(['date'], inplace=True)\n", "\n", "# get valid user list\n", - "valid_users = []\n", + "user_ls,valid_users = evaluation.get_user_ls(all_users,radius)\n", "\n", "for a in range(len(all_users)):\n", " user = all_users[a]\n", @@ -168,8 +168,6 @@ " req_day.append(NaN)\n", " req_month.append(NaN) \n", " continue\n", - " val_user = 'user' + str(a + 1)\n", - " valid_users.append(val_user)\n", " sim.bin_data()\n", " sim.delete_bins()\n", " bins = sim.bins\n", @@ -424,7 +422,9 @@ "cell_type": "code", "execution_count": null, "id": "naked-february", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# plot a graph for all valid users\n", @@ -493,36 +493,36 @@ }, { "cell_type": "markdown", - "id": "applied-residence", + "id": "endless-copper", "metadata": {}, "source": [ - "## Scatter(v-score on valid trips above cutoff, user input request proportion median)" + "## Scatter(homogeneity score on valid trips above cutoff, user input request proportion median)" ] }, { "cell_type": "code", "execution_count": null, - "id": "olympic-guarantee", + "id": "creative-restaurant", "metadata": {}, "outputs": [], "source": [ - "# Here we use v-score on bins above cutoff after changing language and converting purposes and modes\n", + "# Here we use homogeneity score on bins above cutoff after changing language and converting purposes and modes\n", "homo_score_cvt, comp_score_cvt, v_score_cvt = evaluation.v_measure_bins(all_users,radius,cvt_pur_mo=True,cutoff=True)\n", - "v_score_df = pd.DataFrame(data = {'v score':v_score_cvt}).dropna().reset_index(drop=True)\n", - "valid_v_score = v_score_df['v score'].values.tolist()" + "homo_score_df = pd.DataFrame(data = {'homo score':homo_score_cvt}).dropna().reset_index(drop=True)\n", + "valid_homo_score = 
homo_score_df['homo score'].values.tolist()" ] }, { "cell_type": "code", "execution_count": null, - "id": "current-crime", + "id": "serious-default", "metadata": { "scrolled": true }, "outputs": [], "source": [ "x=req_propor_median\n", - "y=valid_v_score\n", + "y=valid_homo_score\n", "v=valid_users\n", "cmp = cm.get_cmap('Dark2', len(valid_users))\n", "\n", @@ -530,7 +530,7 @@ " plt.scatter(x[i], y[i], color=cmp.colors[i], label=v[i], s=70, alpha=0.7)\n", "plt.legend(markerscale=0.7, scatterpoints=1)\n", "plt.xlabel('user input request proportion median',fontsize=16)\n", - "plt.ylabel('v measure score',fontsize=16)\n", + "plt.ylabel('homogeneity score',fontsize=16)\n", "plt.xticks(fontsize=14)\n", "plt.yticks(fontsize=14)" ] @@ -538,7 +538,7 @@ { "cell_type": "code", "execution_count": null, - "id": "velvet-sweden", + "id": "double-newman", "metadata": {}, "outputs": [], "source": [] diff --git a/tour_model_eval/v-measurel_bins_all_user.ipynb b/tour_model_eval/v-measurel_bins_all_user.ipynb index a5fdab5..bb7a27a 100644 --- a/tour_model_eval/v-measurel_bins_all_user.ipynb +++ b/tour_model_eval/v-measurel_bins_all_user.ipynb @@ -22,7 +22,8 @@ "from numpy import *\n", "import confirmed_trips_eval_bins_clusters as evaluation\n", "from sklearn import metrics\n", - "from pandas.testing import assert_frame_equal" + "from pandas.testing import assert_frame_equal\n", + "import numpy as np" ] }, { @@ -53,7 +54,7 @@ "metadata": {}, "outputs": [], "source": [ - "user_ls = evaluation.get_user_ls(all_users)" + "user_ls,_ = evaluation.get_user_ls(all_users,radius)" ] }, { @@ -93,16 +94,6 @@ "homo_score_ori, comp_score_ori, v_score_ori = evaluation.v_measure_bins(all_users,radius,cutoff=True)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "cooked-louis", - "metadata": {}, - "outputs": [], - "source": [ - "mean_v_ori=round(mean([x for x in v_score_ori if str(x) != 'nan']),3)" - ] - }, { "cell_type": "markdown", "id": "personalized-stable", @@ -123,16 +114,6 @@ 
"homo_score_sp2en, comp_score_sp2en, v_score_sp2en = evaluation.v_measure_bins(all_users,radius,sp2en=True,cutoff=True)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "frequent-niagara", - "metadata": {}, - "outputs": [], - "source": [ - "mean_v_sp2en=round(mean([x for x in v_score_sp2en if str(x) != 'nan']),3)" - ] - }, { "cell_type": "markdown", "id": "connected-haven", @@ -151,16 +132,6 @@ "homo_score_cvt, comp_score_cvt, v_score_cvt = evaluation.v_measure_bins(all_users,radius,cvt_pur_mo=True,cutoff=True)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "primary-tampa", - "metadata": {}, - "outputs": [], - "source": [ - "mean_v_cvt=round(mean([x for x in v_score_cvt if str(x) != 'nan']),3)" - ] - }, { "cell_type": "markdown", "id": "agricultural-syndicate", @@ -176,10 +147,9 @@ "metadata": {}, "outputs": [], "source": [ - "cutoff_df = pd.DataFrame(data={'homogeneity_score':[homo_score_ori,homo_score_sp2en,homo_score_cvt],\n", - " 'completeness_score':[comp_score_ori,comp_score_sp2en,comp_score_cvt],\n", - " 'v_measure_score':[v_score_ori,v_score_sp2en,v_score_cvt],\n", - " 'mean v_measure_score':[mean_v_ori,mean_v_sp2en,mean_v_cvt]},\n", + "cutoff_df = pd.DataFrame(data={'homogeneity score':[homo_score_ori,homo_score_sp2en,homo_score_cvt],\n", + " 'completeness score':[comp_score_ori,comp_score_sp2en,comp_score_cvt],\n", + " 'v-measure score':[v_score_ori,v_score_sp2en,v_score_cvt]},\n", " index = ['original user input','after translation','after converting purposes and replaced mode'])\n", "cutoff_df" ] @@ -189,7 +159,7 @@ "id": "conservative-pregnancy", "metadata": {}, "source": [ - "#### homogeneity_score" + "#### homogeneity score" ] }, { @@ -201,7 +171,8 @@ "source": [ "homo_df = pd.DataFrame(data={'original user input':homo_score_ori,'after translation':homo_score_sp2en,\n", " 'after converting purposes and replaced mode':homo_score_cvt}, \n", - " index=user_ls)" + " index=user_ls).dropna()\n", + "homo_df" ] }, { @@ 
-211,7 +182,7 @@ "metadata": {}, "outputs": [], "source": [ - "homo_df.plot(kind = 'bar',title='homogeneity_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + "homo_df.plot(kind = 'bar',title='homogeneity score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" ] }, { @@ -219,7 +190,7 @@ "id": "incorporated-google", "metadata": {}, "source": [ - "#### completeness_score" + "#### completeness score" ] }, { @@ -231,7 +202,7 @@ "source": [ "comp_df = pd.DataFrame(data={'original user input':comp_score_ori,'after translation':comp_score_sp2en,\n", " 'after converting purposes and replaced mode':comp_score_cvt},\n", - " index=user_ls)" + " index=user_ls).dropna()" ] }, { @@ -241,7 +212,7 @@ "metadata": {}, "outputs": [], "source": [ - "comp_df.plot(kind = 'bar',title='completeness_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + "comp_df.plot(kind = 'bar',title='completeness score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" ] }, { @@ -249,7 +220,7 @@ "id": "living-archives", "metadata": {}, "source": [ - "#### v_measure_score" + "#### v-measure score" ] }, { @@ -261,17 +232,20 @@ "source": [ "v_df = pd.DataFrame(data={'original user input':v_score_ori,'after translation':v_score_sp2en,\n", " 'after converting purposes and replaced mode':v_score_cvt},\n", - " index=user_ls)" + " index=user_ls).dropna()\n", + "v_df" ] }, { "cell_type": "code", "execution_count": null, "id": "demonstrated-resident", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "v_df.plot(kind = 'bar',title='v_measure_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + "v_df.plot(kind = 'bar',title='v-measure score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" ] }, { @@ -300,16 +274,6 @@ "ab_homo_score_ori, ab_comp_score_ori, ab_v_score_ori = evaluation.v_measure_bins(all_users,radius)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "extended-processing", - "metadata": {}, - "outputs": [], - "source": [ 
- "ab_mean_v_ori=round(mean([x for x in ab_v_score_ori if str(x) != 'nan']),3)" - ] - }, { "cell_type": "markdown", "id": "european-philadelphia", @@ -328,16 +292,6 @@ "ab_homo_score_sp2en, ab_comp_score_sp2en, ab_v_score_sp2en = evaluation.v_measure_bins(all_users,radius,sp2en=True)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "collective-calculator", - "metadata": {}, - "outputs": [], - "source": [ - "ab_mean_v_sp2en=round(mean([x for x in ab_v_score_sp2en if str(x) != 'nan']),3)" - ] - }, { "cell_type": "markdown", "id": "capable-bathroom", @@ -356,16 +310,6 @@ "ab_homo_score_cvt, ab_comp_score_cvt, ab_v_score_cvt = evaluation.v_measure_bins(all_users,radius,cvt_pur_mo=True)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "transparent-theology", - "metadata": {}, - "outputs": [], - "source": [ - "ab_mean_v_cvt=round(mean([x for x in ab_v_score_cvt if str(x) != 'nan']),3)" - ] - }, { "cell_type": "markdown", "id": "charged-greeting", @@ -381,10 +325,9 @@ "metadata": {}, "outputs": [], "source": [ - "all_df = pd.DataFrame(data={'homogeneity_score':[ab_homo_score_ori,ab_homo_score_sp2en,ab_homo_score_cvt],\n", - " 'completeness_score':[ab_comp_score_ori,ab_comp_score_sp2en,ab_comp_score_cvt],\n", - " 'v_measure_score':[ab_v_score_ori,ab_v_score_sp2en,ab_v_score_cvt],\n", - " 'mean v_measure_score':[ab_mean_v_ori,ab_mean_v_sp2en,ab_mean_v_cvt]},\n", + "all_df = pd.DataFrame(data={'homogeneity score':[ab_homo_score_ori,ab_homo_score_sp2en,ab_homo_score_cvt],\n", + " 'completeness score':[ab_comp_score_ori,ab_comp_score_sp2en,ab_comp_score_cvt],\n", + " 'v-measure score':[ab_v_score_ori,ab_v_score_sp2en,ab_v_score_cvt]},\n", " index = ['original user input','after translation','after converting purposes and replaced mode'])\n", "all_df" ] @@ -394,7 +337,7 @@ "id": "trying-flesh", "metadata": {}, "source": [ - "#### homogeneity_score" + "#### homogeneity score" ] }, { @@ -406,7 +349,7 @@ "source": [ 
"ab_homo_df=pd.DataFrame(data={'original user input':ab_homo_score_ori,'after translation':ab_homo_score_sp2en,\n", " 'after converting purposes and replaced mode':ab_homo_score_cvt},\n", - " index=user_ls)" + " index=user_ls).dropna()" ] }, { @@ -416,7 +359,7 @@ "metadata": {}, "outputs": [], "source": [ - "ab_homo_df.plot(kind = 'bar',title='homogeneity_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + "ab_homo_df.plot(kind = 'bar',title='homogeneity score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" ] }, { @@ -424,7 +367,7 @@ "id": "english-quick", "metadata": {}, "source": [ - "#### completeness_score" + "#### completeness score" ] }, { @@ -436,7 +379,7 @@ "source": [ "ab_comp_df = pd.DataFrame(data={'original user input':ab_comp_score_ori,'after translation':ab_comp_score_sp2en,\n", " 'after converting purposes and replaced mode':ab_comp_score_cvt},\n", - " index=user_ls)" + " index=user_ls).dropna()" ] }, { @@ -446,7 +389,7 @@ "metadata": {}, "outputs": [], "source": [ - "ab_comp_df.plot(kind = 'bar',title='completeness_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + "ab_comp_df.plot(kind = 'bar',title='completeness score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" ] }, { @@ -454,7 +397,7 @@ "id": "white-reasoning", "metadata": {}, "source": [ - "#### v_measure_score" + "#### v-measure score" ] }, { @@ -466,7 +409,7 @@ "source": [ "ab_v_df = pd.DataFrame(data={'original user input':ab_v_score_ori,'after translation':ab_v_score_sp2en,\n", " 'after converting purposes and replaced mode':ab_v_score_cvt},\n", - " index=user_ls)" + " index=user_ls).dropna()" ] }, { @@ -476,7 +419,7 @@ "metadata": {}, "outputs": [], "source": [ - "ab_v_df.plot(kind = 'bar',title='v_measure_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + "ab_v_df.plot(kind = 'bar',title='v-measure score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" ] } ], diff --git a/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb 
b/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb index a35b9a6..af38b51 100644 --- a/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb +++ b/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb @@ -20,7 +20,8 @@ "from numpy import *\n", "import confirmed_trips_eval_bins_clusters as evaluation\n", "from sklearn import metrics\n", - "from pandas.testing import assert_frame_equal" + "from pandas.testing import assert_frame_equal\n", + "import numpy as np" ] }, { @@ -51,7 +52,7 @@ "metadata": {}, "outputs": [], "source": [ - "user_ls = evaluation.get_user_ls(all_users)" + "user_ls,_ = evaluation.get_user_ls(all_users,radius)" ] }, { @@ -91,16 +92,6 @@ "homo_score_ori, comp_score_ori, v_score_ori = evaluation.v_measure_clusters(all_users,radius)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "professional-evans", - "metadata": {}, - "outputs": [], - "source": [ - "mean_v_ori=round(mean([x for x in v_score_ori if str(x) != 'nan']),3)" - ] - }, { "cell_type": "markdown", "id": "latin-boulder", @@ -119,16 +110,6 @@ "homo_score_sp2en, comp_score_sp2en, v_score_sp2en = evaluation.v_measure_clusters(all_users,radius,sp2en=True)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "lesbian-testing", - "metadata": {}, - "outputs": [], - "source": [ - "mean_v_sp2en=round(mean([x for x in v_score_sp2en if str(x) != 'nan']),3)" - ] - }, { "cell_type": "markdown", "id": "opened-norway", @@ -147,16 +128,6 @@ "homo_score_cvt, comp_score_cvt, v_score_cvt = evaluation.v_measure_clusters(all_users,radius,cvt_pur_mo=True)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "severe-plant", - "metadata": {}, - "outputs": [], - "source": [ - "mean_v_cvt=round(mean([x for x in v_score_cvt if str(x) != 'nan']),3)" - ] - }, { "cell_type": "markdown", "id": "matched-custody", @@ -172,10 +143,9 @@ "metadata": {}, "outputs": [], "source": [ - "df = 
pd.DataFrame(data={'homogeneity_score':[homo_score_ori,homo_score_sp2en,homo_score_cvt],\n", - " 'completeness_score':[comp_score_ori,comp_score_sp2en,comp_score_cvt],\n", - " 'v_measure_score':[v_score_ori,v_score_sp2en,v_score_cvt],\n", - " 'mean v_measure_score':[mean_v_ori,mean_v_sp2en,mean_v_cvt]},\n", + "df = pd.DataFrame(data={'homogeneity score':[homo_score_ori,homo_score_sp2en,homo_score_cvt],\n", + " 'completeness score':[comp_score_ori,comp_score_sp2en,comp_score_cvt],\n", + " 'v-measure score':[v_score_ori,v_score_sp2en,v_score_cvt]},\n", " index = ['original user input','after translation','after converting purposes and replaced mode'])\n", "df" ] @@ -185,7 +155,7 @@ "id": "sacred-applicant", "metadata": {}, "source": [ - "#### homogeneity_score" + "#### homogeneity score" ] }, { @@ -197,7 +167,7 @@ "source": [ "homo_df = pd.DataFrame(data={'original user input':homo_score_ori,'after translation':homo_score_sp2en,\n", " 'after converting purposes and replaced mode':homo_score_cvt}, \n", - " index=user_ls)" + " index=user_ls).dropna()" ] }, { @@ -207,7 +177,7 @@ "metadata": {}, "outputs": [], "source": [ - "homo_df.plot(kind = 'bar',title='homogeneity_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + "homo_df.plot(kind = 'bar',title='homogeneity score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" ] }, { @@ -215,7 +185,7 @@ "id": "premium-force", "metadata": {}, "source": [ - "#### completeness_score" + "#### completeness score" ] }, { @@ -227,7 +197,7 @@ "source": [ "comp_df = pd.DataFrame(data={'original user input':comp_score_ori,'after translation':comp_score_sp2en,\n", " 'after converting purposes and replaced mode':comp_score_cvt},\n", - " index=user_ls)" + " index=user_ls).dropna()" ] }, { @@ -239,7 +209,7 @@ }, "outputs": [], "source": [ - "comp_df.plot(kind = 'bar',title='completeness_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + "comp_df.plot(kind = 'bar',title='completeness 
score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" ] }, { @@ -247,7 +217,7 @@ "id": "affecting-daughter", "metadata": {}, "source": [ - "#### v_measure_score" + "#### v-measure score" ] }, { @@ -259,7 +229,8 @@ "source": [ "v_df = pd.DataFrame(data={'original user input':v_score_ori,'after translation':v_score_sp2en,\n", " 'after converting purposes and replaced mode':v_score_cvt},\n", - " index=user_ls)" + " index=user_ls).dropna()\n", + "v_df" ] }, { @@ -269,7 +240,7 @@ "metadata": {}, "outputs": [], "source": [ - "v_df.plot(kind = 'bar',title='v_measure_score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" + "v_df.plot(kind = 'bar',title='v-measure score',yticks=(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1))" ] } ], From 0e7f803645bfa8cb8926b2cf356c2848f5ad85e8 Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Mon, 29 Mar 2021 19:51:36 -0700 Subject: [PATCH 13/16] modified evaluation code and added comments in the notebooks --- .../confirmed_trips_eval_bins_clusters.py | 26 ++++++++++--------- .../user_input_request_times_all_users.ipynb | 18 ++++++------- .../v-measurel_all_bins_single_user.ipynb | 8 ++++++ .../v-measurel_bins_all_user.ipynb | 8 ++++++ ...urel_clusters_above_cutoff_all_users.ipynb | 8 ++++++ .../v-measurel_cutoff_bins_single_user.ipynb | 8 ++++++ ...measurel_cutoff_clusters_single_user.ipynb | 8 ++++++ 7 files changed, 63 insertions(+), 21 deletions(-) diff --git a/tour_model_eval/confirmed_trips_eval_bins_clusters.py b/tour_model_eval/confirmed_trips_eval_bins_clusters.py index 9131e61..fb4dffa 100644 --- a/tour_model_eval/confirmed_trips_eval_bins_clusters.py +++ b/tour_model_eval/confirmed_trips_eval_bins_clusters.py @@ -12,15 +12,6 @@ from sklearn import metrics from pandas.testing import assert_frame_equal -# Spanish words to English -span_eng_dict = {'revisado_bike':'test ride with bike','placas_de carro':'car plates','aseguranza':'insurance', - 'iglesia':'church','curso':'course','mi_hija recién aliviada':'my daughter just had a 
new baby', - 'servicio_comunitario':'community service','pago_de aseguranza':'insurance payment', - 'grupo_comunitario':'community group','caminata_comunitaria':'community walk'} - -# Convert purpose -map_pur_dict = {'course':'school','work_- lunch break':'lunch_break','on_the way home':'home', - 'insurance_payment':'insurance'} def get_user_ls(all_users,radius): user_ls = [] @@ -56,7 +47,18 @@ def valid_user(filter_trips,trips): valid = True return valid -def map_labels(user_input_df,span_eng_dict,map_pur_dict,sp2en,cvt_pur_mo): +def map_labels(user_input_df,sp2en,cvt_pur_mo): + # Spanish words to English + span_eng_dict = {'revisado_bike': 'test ride with bike', 'placas_de carro': 'car plates', 'aseguranza': 'insurance', + 'iglesia': 'church', 'curso': 'course', + 'mi_hija recién aliviada': 'my daughter just had a new baby', + 'servicio_comunitario': 'community service', 'pago_de aseguranza': 'insurance payment', + 'grupo_comunitario': 'community group', 'caminata_comunitaria': 'community walk'} + + # Convert purpose + map_pur_dict = {'course': 'school', 'work_- lunch break': 'lunch_break', 'on_the way home': 'home', + 'insurance_payment': 'insurance'} + if sp2en: # change language user_input_df = user_input_df.replace(span_eng_dict) @@ -128,7 +130,7 @@ def v_measure_bins(all_users,radius,sp2en=None,cvt_pur_mo=None,cutoff=None): bins = sim.bins bin_trips_user_input_df = pd.DataFrame(data=[trip["data"]["user_input"] for trip in bin_trips]) - bin_trips_user_input_df = map_labels(bin_trips_user_input_df, span_eng_dict, map_pur_dict, sp2en, cvt_pur_mo) + bin_trips_user_input_df = map_labels(bin_trips_user_input_df, sp2en, cvt_pur_mo) # turn all user_input into list without binning bin_trips_user_input_ls = bin_trips_user_input_df.values.tolist() @@ -192,7 +194,7 @@ def v_measure_clusters(all_users,radius,sp2en=None,cvt_pur_mo=None): feat.cluster(min_clusters=min, max_clusters=max) cluster_trips = feat.data cluster_user_input_df = 
pd.DataFrame(data=[i["data"]["user_input"] for i in cluster_trips]) - cluster_user_input_df = map_labels(cluster_user_input_df, span_eng_dict, map_pur_dict, sp2en, cvt_pur_mo) + cluster_user_input_df = map_labels(cluster_user_input_df, sp2en, cvt_pur_mo) # turn cluster_trips to list without any changes cluster_user_input_ls = cluster_user_input_df.values.tolist() # drop duplicate user_input diff --git a/tour_model_eval/user_input_request_times_all_users.ipynb b/tour_model_eval/user_input_request_times_all_users.ipynb index f78a808..c6c3973 100644 --- a/tour_model_eval/user_input_request_times_all_users.ipynb +++ b/tour_model_eval/user_input_request_times_all_users.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "competitive-involvement", + "metadata": {}, + "source": [ + "This notebook integrates user inputs request times in a day, in a month, and in January 2021 for all users. The scatter at the bottom shows the relationship between homogeneity score and the median of user input request proportion in a day on valid common trips after the first round clustering for all users." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -202,7 +210,7 @@ " # bins below cutoff\n", " bl_bins = sim.below_cutoff\n", " \n", - " # collect query trips indices below cutoff\n", + " # collect requested trips indices below cutoff\n", " bl_trip_ls = []\n", " for bin in bl_bins:\n", " for trip_index in bin:\n", @@ -534,14 +542,6 @@ "plt.xticks(fontsize=14)\n", "plt.yticks(fontsize=14)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "double-newman", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/tour_model_eval/v-measurel_all_bins_single_user.ipynb b/tour_model_eval/v-measurel_all_bins_single_user.ipynb index 0b0e3f0..d2fb745 100644 --- a/tour_model_eval/v-measurel_all_bins_single_user.ipynb +++ b/tour_model_eval/v-measurel_all_bins_single_user.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "operating-disabled", + "metadata": {}, + "source": [ + "This notebook is for evaluating all valid bins and exploring data for a single user after first round clustering." + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/tour_model_eval/v-measurel_bins_all_user.ipynb b/tour_model_eval/v-measurel_bins_all_user.ipynb index bb7a27a..6486ff6 100644 --- a/tour_model_eval/v-measurel_bins_all_user.ipynb +++ b/tour_model_eval/v-measurel_bins_all_user.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "visible-annual", + "metadata": {}, + "source": [ + "This notebook shows the homogeneity scores, completeness scores, and v-measure scores based on original user inputs, user inputs after converting language, and user inputs after converting purposes and replaced mode on valid bins above cutoff and on all valid bins for all users after the first round clustering." 
+ ] + }, { "cell_type": "code", "execution_count": null, diff --git a/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb b/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb index af38b51..4c40c4a 100644 --- a/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb +++ b/tour_model_eval/v-measurel_clusters_above_cutoff_all_users.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "proper-research", + "metadata": {}, + "source": [ + "This notebook shows the homogeneity scores, completeness scores, and v-measure scores based on original user inputs, user inputs after converting language, and user inputs after converting purposes and replaced mode on valid clusters above cutoff after the first round clustering for all users. The clustering process is based on silhouette score, setting the min cluster to 0." + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb b/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb index a88733a..2906745 100644 --- a/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb +++ b/tour_model_eval/v-measurel_cutoff_bins_single_user.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "aging-evaluation", + "metadata": {}, + "source": [ + "This notebook is for evaluating bins above cutoff and exploring the data for a single user after first round clustering." 
+ ] + }, { "cell_type": "code", "execution_count": null, diff --git a/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb b/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb index e16031c..7bab80b 100644 --- a/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb +++ b/tour_model_eval/v-measurel_cutoff_clusters_single_user.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "developing-commerce", + "metadata": {}, + "source": [ + "This notebook is for evaluating clusters above cutoff and exploring data for a single user after first round clustering." + ] + }, { "cell_type": "code", "execution_count": null, From 8db9a7235783b5f9e6ed4ecfc16b83b8e8fb3ec1 Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Mon, 5 Apr 2021 23:06:48 -0700 Subject: [PATCH 14/16] add more comments in evaluation code --- .../confirmed_trips_eval_bins_clusters.py | 16 ++++ .../user_input_request_times_all_users.ipynb | 76 +++++++++---------- 2 files changed, 54 insertions(+), 38 deletions(-) diff --git a/tour_model_eval/confirmed_trips_eval_bins_clusters.py b/tour_model_eval/confirmed_trips_eval_bins_clusters.py index fb4dffa..af11212 100644 --- a/tour_model_eval/confirmed_trips_eval_bins_clusters.py +++ b/tour_model_eval/confirmed_trips_eval_bins_clusters.py @@ -13,6 +13,8 @@ from pandas.testing import assert_frame_equal +# - user_ls: a list of all users +# - valid_user_ls: a list of valid users def get_user_ls(all_users,radius): user_ls = [] valid_user_ls = [] @@ -29,6 +31,8 @@ def get_user_ls(all_users,radius): return user_ls,valid_user_ls +# - trips: all trips read from database +# - filter_trips: valid trips that have user labels and are not points def filter_data(user,radius): trips = pipeline.read_data(uuid=user, key=esda.CONFIRMED_TRIP_KEY) non_empty_trips = [t for t in trips if t["data"]["user_input"] != {}] @@ -37,16 +41,25 @@ def filter_data(user,radius): valid_trips_idx_ls = valid_trips_df.index.tolist() valid_trips = 
[non_empty_trips[i]for i in valid_trips_idx_ls] + # similarity codes can filter out trips that are points in valid_trips sim = similarity.similarity(valid_trips, radius) filter_trips = sim.data return filter_trips,sim,trips + +# to determine if the user is valid: +# valid user should have >= 10 trips for further analysis and the proportion of filter_trips is >=50% def valid_user(filter_trips,trips): valid = False if len(filter_trips) >= 10 and len(filter_trips) / len(trips) >= 0.5: valid = True return valid + +# to map the user labels +# - user_input_df: pass in original user input dataframe, return changed user input dataframe +# - sp2en: change Spanish to English +# - cvt_pur_mo: convert purposes and replaced mode def map_labels(user_input_df,sp2en,cvt_pur_mo): # Spanish words to English span_eng_dict = {'revisado_bike': 'test ride with bike', 'placas_de carro': 'car plates', 'aseguranza': 'insurance', @@ -76,6 +89,8 @@ def map_labels(user_input_df,sp2en,cvt_pur_mo): return user_input_df +# check if the user is valid +# append NaN to the score lists when the user is invalid def valid_user_check(filter_trips,trips,homo_score,comp_score,v_score): if not valid_user(filter_trips, trips): homo_score.append(NaN) @@ -87,6 +102,7 @@ def valid_user_check(filter_trips,trips,homo_score,comp_score,v_score): return homo_score,comp_score,v_score,skip +# This function is to get homogeneity score, completeness score, and v-measure score def compute_score(labels_true,labels_pred,homo_score,comp_score,v_score): homo = metrics.homogeneity_score(labels_true, labels_pred) homo_score.append(float('%.3f' % homo)) diff --git a/tour_model_eval/user_input_request_times_all_users.ipynb b/tour_model_eval/user_input_request_times_all_users.ipynb index c6c3973..a3b0167 100644 --- a/tour_model_eval/user_input_request_times_all_users.ipynb +++ b/tour_model_eval/user_input_request_times_all_users.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "competitive-involvement", + "id": 
"compact-toolbox", "metadata": {}, "source": [ "This notebook integrates user inputs request times in a day, in a month, and in January 2021 for all users. The scatter at the bottom shows the relationship between homogeneity score and the median of user input request proportion in a day on valid common trips after the first round clustering for all users." @@ -95,12 +95,12 @@ "metadata": {}, "outputs": [], "source": [ - "def match_day(trip,bin):\n", - " if bin:\n", - " t = filter_trips[bin[0]]\n", - " if trip['data']['start_local_dt']['year']==t['data']['start_local_dt']['year']and trip['data']['start_local_dt']['month']==t['data']['start_local_dt']['month']and trip['data']['start_local_dt']['day']==t['data']['start_local_dt']['day']:\n", - " return True\n", - " return False" + "# def match_day(trip,bin):\n", + "# if bin:\n", + "# t = filter_trips[bin[0]]\n", + "# if trip['data']['start_local_dt']['year']==t['data']['start_local_dt']['year']and trip['data']['start_local_dt']['month']==t['data']['start_local_dt']['month']and trip['data']['start_local_dt']['day']==t['data']['start_local_dt']['day']:\n", + "# return True\n", + "# return False" ] }, { @@ -110,12 +110,12 @@ "metadata": {}, "outputs": [], "source": [ - "def match_month(trip,bin):\n", - " if bin:\n", - " t = filter_trips[bin[0]]\n", - " if trip['data']['start_local_dt']['year']==t['data']['start_local_dt']['year']and trip['data']['start_local_dt']['month']==t['data']['start_local_dt']['month']:\n", - " return True\n", - " return False" + "# def match_month(trip,bin):\n", + "# if bin:\n", + "# t = filter_trips[bin[0]]\n", + "# if trip['data']['start_local_dt']['year']==t['data']['start_local_dt']['year']and trip['data']['start_local_dt']['month']==t['data']['start_local_dt']['month']:\n", + "# return True\n", + "# return False" ] }, { @@ -125,29 +125,29 @@ "metadata": {}, "outputs": [], "source": [ - "# bin trips according to ['start_local_dt']\n", - "def bin_date(trip_ls,day=None,month=None):\n", - " 
bin_date = []\n", - " for trip_index in trip_ls:\n", - " added = False\n", - " trip = filter_trips[trip_index]\n", + "# # bin trips according to ['start_local_dt']\n", + "# def ,filter_trips,filter_trips,filter_trips,filter_trips(trip_ls,day=None,month=None):\n", + "# bin_date = []\n", + "# for trip_index in trip_ls:\n", + "# added = False\n", + "# trip = filter_trips[trip_index]\n", "\n", - " for bin in bin_date:\n", - " if day:\n", - " if match_day(trip,bin):\n", - " bin.append(trip_index)\n", - " added = True\n", - " break\n", - " if month:\n", - " if match_month(trip,bin):\n", - " bin.append(trip_index)\n", - " added = True\n", - " break\n", + "# for bin in bin_date:\n", + "# if day:\n", + "# if evaluation.match_day(trip,bin,filter_trips):\n", + "# bin.append(trip_index)\n", + "# added = True\n", + "# break\n", + "# if month:\n", + "# if evaluation.match_month(trip,bin,filter_trips):\n", + "# bin.append(trip_index)\n", + "# added = True\n", + "# break\n", "\n", - " if not added:\n", - " bin_date.append([trip_index])\n", + "# if not added:\n", + "# bin_date.append([trip_index])\n", "\n", - " return bin_date " + "# return bin_date " ] }, { @@ -221,7 +221,7 @@ " \n", " \n", " # collect request times in a day\n", - " bin_day = bin_date(req_trips_ls,day=True)\n", + " bin_day = evaluation.bin_date(req_trips_ls,filter_trips,day=True)\n", " req_day_ls = []\n", " for bin in bin_day:\n", " req_day_ls.append(len(bin))\n", @@ -231,7 +231,7 @@ " trip = filter_trips[trip_index]\n", " match = False\n", " for bin in bin_day:\n", - " if match_day(trip,bin):\n", + " if evaluation.match_day(trip,bin,filter_trips):\n", " match = True\n", " break\n", " if not match:\n", @@ -243,13 +243,13 @@ " # collect user input request proportion in a day\n", " filter_trips_df = pd.DataFrame(filter_trips)\n", " filter_trips_idx_ls = filter_trips_df.index.values.tolist()\n", - " bin_filter_trips_day = bin_date(filter_trips_idx_ls,day=True)\n", + " bin_filter_trips_day = 
evaluation.bin_date(filter_trips_idx_ls,filter_trips,day=True)\n", " propor_single_user = []\n", " for valid_trips_bin in bin_filter_trips_day:\n", " match = False\n", " for req_trips_bin in bin_day:\n", " req_trip = filter_trips[req_trips_bin[0]]\n", - " if match_day(req_trip,valid_trips_bin):\n", + " if evaluation.match_day(req_trip,valid_trips_bin,filter_trips):\n", " proportion = round(len(req_trips_bin)/len(valid_trips_bin), 2)\n", " propor_single_user.append(proportion)\n", " match = True\n", @@ -265,7 +265,7 @@ " \n", " \n", " # collect request times in a month\n", - " bin_month = bin_date(req_trips_ls,month=True)\n", + " bin_month = evaluation.bin_date(req_trips_ls,filter_trips,month=True)\n", " req_month_ls = []\n", " for bin in bin_month:\n", " req_month_ls.append(len(bin))\n", From 56049d23b31b50fac59041d754399ce0ba1e32cb Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Wed, 7 Apr 2021 10:32:34 -0700 Subject: [PATCH 15/16] add more functions in the evaluation code --- .../confirmed_trips_eval_bins_clusters.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/tour_model_eval/confirmed_trips_eval_bins_clusters.py b/tour_model_eval/confirmed_trips_eval_bins_clusters.py index af11212..01aaef4 100644 --- a/tour_model_eval/confirmed_trips_eval_bins_clusters.py +++ b/tour_model_eval/confirmed_trips_eval_bins_clusters.py @@ -113,6 +113,66 @@ def compute_score(labels_true,labels_pred,homo_score,comp_score,v_score): return homo_score,comp_score,v_score +# This function is to compare a trip with a group of trips to see if they happened on the same day +def match_day(trip,bin,filter_trips): + if bin: + t = filter_trips[bin[0]] + if trip['data']['start_local_dt']['year']==t['data']['start_local_dt']['year']\ + and trip['data']['start_local_dt']['month']==t['data']['start_local_dt']['month']\ + and trip['data']['start_local_dt']['day']==t['data']['start_local_dt']['day']: + return True + return False + + +# This function is to compare a trip with 
a group of trips to see if they happened in the same month +def match_month(trip,bin,filter_trips): + if bin: + t = filter_trips[bin[0]] + if trip['data']['start_local_dt']['year']==t['data']['start_local_dt']['year']\ + and trip['data']['start_local_dt']['month']==t['data']['start_local_dt']['month']: + return True + return False + + +# This function bins trips according to ['start_local_dt'] +def bin_date(trip_ls,filter_trips,day=None,month=None): + bin_date = [] + for trip_index in trip_ls: + added = False + trip = filter_trips[trip_index] + + for bin in bin_date: + if day: + if match_day(trip,bin,filter_trips): + bin.append(trip_index) + added = True + break + if month: + if match_month(trip,bin,filter_trips): + bin.append(trip_index) + added = True + break + + if not added: + bin_date.append([trip_index]) + + return bin_date + + +# compare the trip orders in bin_trips with those in filter_trips above cutoff +def compare_trip_orders(bins,bin_trips,filter_trips): + # compare the trip order in bins (indices into filter_trips) with that of bin_trips, using start timestamps + bin_trips_ts = pd.DataFrame(data=[trip["data"]["start_ts"] for trip in bin_trips]) + bin_ls = [] + for bin in bins: + for index in bin: + bin_ls.append(index) + bins_ts = pd.DataFrame(data=[filter_trips[i]["data"]["start_ts"] for i in bin_ls]) + # compare the two data frames; the program continues to the score calculation only if they are equal + assert_frame_equal(bins_ts, bin_trips_ts) + + + # v_measure_bins takes 5 parameters # - sp2en=True: change Spanish to English # - cvt_pur_mo=True: convert purposes and replaced mode From 644c2dd1e64734d194270ccba0920f3bfe64d209 Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Thu, 8 Apr 2021 01:47:24 -0700 Subject: [PATCH 16/16] add find_first_trip function using start_ts --- .../confirmed_trips_eval_bins_clusters.py | 15 ++++ .../user_input_request_times_all_users.ipynb | 79 +------------------ 2 files changed, 17 insertions(+), 77 deletions(-) diff --git 
a/tour_model_eval/confirmed_trips_eval_bins_clusters.py b/tour_model_eval/confirmed_trips_eval_bins_clusters.py index 01aaef4..f28f271 100644 --- a/tour_model_eval/confirmed_trips_eval_bins_clusters.py +++ b/tour_model_eval/confirmed_trips_eval_bins_clusters.py @@ -172,6 +172,21 @@ def compare_trip_orders(bins,bin_trips,filter_trips): assert_frame_equal(bins_ts, bin_trips_ts) +def find_first_trip(filter_trips,bin): + early_trip = filter_trips[bin[0]] + index = 0 + for i in range(1,len(bin)): + compare_trip = filter_trips[bin[i]] + if early_trip['data']["start_ts"] > compare_trip['data']["start_ts"]: + early_trip = compare_trip + index = i + early_trip_index = bin[index] + return early_trip_index, index + + + + + # v_measure_bins takes 5 parameters # - sp2en=True: change Spanish to English diff --git a/tour_model_eval/user_input_request_times_all_users.ipynb b/tour_model_eval/user_input_request_times_all_users.ipynb index a3b0167..95623b8 100644 --- a/tour_model_eval/user_input_request_times_all_users.ipynb +++ b/tour_model_eval/user_input_request_times_all_users.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "compact-toolbox", + "id": "cellular-panel", "metadata": {}, "source": [ "This notebook integrates user inputs request times in a day, in a month, and in January 2021 for all users. The scatter at the bottom shows the relationship between homogeneity score and the median of user input request proportion in a day on valid common trips after the first round clustering for all users." 
@@ -88,68 +88,6 @@ "req_propor_median = []" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "destroyed-attention", - "metadata": {}, - "outputs": [], - "source": [ - "# def match_day(trip,bin):\n", - "# if bin:\n", - "# t = filter_trips[bin[0]]\n", - "# if trip['data']['start_local_dt']['year']==t['data']['start_local_dt']['year']and trip['data']['start_local_dt']['month']==t['data']['start_local_dt']['month']and trip['data']['start_local_dt']['day']==t['data']['start_local_dt']['day']:\n", - "# return True\n", - "# return False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "breeding-cream", - "metadata": {}, - "outputs": [], - "source": [ - "# def match_month(trip,bin):\n", - "# if bin:\n", - "# t = filter_trips[bin[0]]\n", - "# if trip['data']['start_local_dt']['year']==t['data']['start_local_dt']['year']and trip['data']['start_local_dt']['month']==t['data']['start_local_dt']['month']:\n", - "# return True\n", - "# return False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "progressive-playback", - "metadata": {}, - "outputs": [], - "source": [ - "# # bin trips according to ['start_local_dt']\n", - "# def ,filter_trips,filter_trips,filter_trips,filter_trips(trip_ls,day=None,month=None):\n", - "# bin_date = []\n", - "# for trip_index in trip_ls:\n", - "# added = False\n", - "# trip = filter_trips[trip_index]\n", - "\n", - "# for bin in bin_date:\n", - "# if day:\n", - "# if evaluation.match_day(trip,bin,filter_trips):\n", - "# bin.append(trip_index)\n", - "# added = True\n", - "# break\n", - "# if month:\n", - "# if evaluation.match_month(trip,bin,filter_trips):\n", - "# bin.append(trip_index)\n", - "# added = True\n", - "# break\n", - "\n", - "# if not added:\n", - "# bin_date.append([trip_index])\n", - "\n", - "# return bin_date " - ] - }, { "cell_type": "code", "execution_count": null, @@ -184,20 +122,7 @@ " ab_trip_ls = []\n", " no_req_trip_ls = []\n", " for bin in bins:\n", - " early_trip 
= filter_trips[bin[0]]\n", - " index = 0\n", - " for i in range(1,len(bin)):\n", - " compare_trip = filter_trips[bin[i]]\n", - " if early_trip['data']['start_local_dt']['year']>compare_trip['data']['start_local_dt']['year']:\n", - " early_trip = compare_trip\n", - " index = i\n", - " elif early_trip['data']['start_local_dt']['year']==compare_trip['data']['start_local_dt']['year'] and early_trip['data']['start_local_dt']['month']>compare_trip['data']['start_local_dt']['month']:\n", - " early_trip = compare_trip\n", - " index = i\n", - " elif early_trip['data']['start_local_dt']['year']==compare_trip['data']['start_local_dt']['year'] and early_trip['data']['start_local_dt']['month']==compare_trip['data']['start_local_dt']['month'] and early_trip['data']['start_local_dt']['day']>compare_trip['data']['start_local_dt']['day']:\n", - " early_trip = compare_trip\n", - " index = i\n", - " early_trip_index = bin[index]\n", + " early_trip_index, index = evaluation.find_first_trip(filter_trips,bin)\n", " ab_trip_ls.append(early_trip_index)\n", " \n", " for k in range(len(bin)):\n",