diff --git a/MANIFEST.in b/MANIFEST.in index ed0ca0e87274e..db605f55f748c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,7 +2,7 @@ include *.rst recursive-include doc * recursive-include examples * recursive-include sklearn *.c *.h *.pyx *.pxd *.pxi -recursive-include sklearn/datasets *.csv *.csv.gz *.rst *.jpg *.txt +recursive-include sklearn/datasets *.csv *.csv.gz *.rst *.jpg *.txt *.arff.gz *.json.gz include COPYING include AUTHORS.rst include README.rst diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst new file mode 100644 index 0000000000000..53ab211df903e --- /dev/null +++ b/doc/datasets/openml.rst @@ -0,0 +1,148 @@ +.. + For doctests: + + >>> import numpy as np + >>> import os + + +.. _openml: + +Downloading datasets from the openml.org repository +=================================================== + +`openml.org `_ is a public repository for machine learning +data and experiments, that allows everybody to upload open datasets. + +The ``sklearn.datasets`` package is able to download datasets +from the repository using the function +:func:`sklearn.datasets.fetch_openml`. + +For example, to download a dataset of gene expressions in mice brains:: + + >>> from sklearn.datasets import fetch_openml + >>> mice = fetch_openml(name='miceprotein', version=4) + +To fully specify a dataset, you need to provide a name and a version, though +the version is optional, see :ref:`openml_versions`_ below. +The dataset contains a total of 1080 examples belonging to 8 different +classes:: + + >>> mice.data.shape + (1080, 77) + >>> mice.target.shape + (1080,) + >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE + array(['c-CS-m', 'c-CS-s', 'c-SC-m', 'c-SC-s', 't-CS-m', 't-CS-s', 't-SC-m', 't-SC-s'], dtype=object) + +You can get more information on the dataset by looking at the ``DESCR`` +and ``details`` attributes:: + + >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + **Author**: Clara Higuera, Katheleen J. Gardiner, Krzysztof J. Cios + **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression) - 2015 + **Please cite**: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing + Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down + Syndrome. PLoS ONE 10(6): e0129126... + + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + {'id': '40966', 'name': 'MiceProtein', 'version': '4', 'format': 'ARFF', + 'upload_date': '2017-11-08T16:00:15', 'licence': 'Public', + 'url': 'https://www.openml.org/data/v1/download/17928620/MiceProtein.arff', + 'file_id': '17928620', 'default_target_attribute': 'class', + 'row_id_attribute': 'MouseID', + 'ignore_attribute': ['Genotype', 'Treatment', 'Behavior'], + 'tag': ['OpenML-CC18', 'study_135', 'study_98', 'study_99'], + 'visibility': 'public', 'status': 'active', + 'md5_checksum': '3c479a6885bfa0438971388283a1ce32'} + + +The ``DESCR`` contains a free-text description of the data, while ``details`` +contains a dictionary of meta-data stored by openml, like the dataset id. 
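Because ``details`` is a plain Python dictionary, individual metadata fields can be checked programmatically before relying on a dataset. A minimal sketch, reusing the ``mice`` Bunch and the keys shown in the output above::

    >>> mice.details['default_target_attribute']  # doctest: +SKIP
    'class'
    >>> mice.details['status']  # doctest: +SKIP
    'active'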
+For more details, see the `OpenML documentation +`_ The ``data_id`` of the mice protein dataset +is 40966, and you can use this (or the name) to get more information on the +dataset on the openml website:: + + >>> mice.url + 'https://www.openml.org/d/40966' + +The ``data_id`` also uniquely identifies a dataset from OpenML:: + + >>> mice = fetch_openml(data_id=40966) + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', + 'creator': ..., + 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': + 'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id': + '1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C, + Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins + Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6): + e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14', + 'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum': + '3c479a6885bfa0438971388283a1ce32'} + +.. _openml_versions: + +Dataset Versions +---------------- + +A dataset is uniquely specified by its ``data_id``, but not necessarily by its +name. Several different "versions" of a dataset with the same name can exist +which can contain entirely different datasets. +If a particular version of a dataset has been found to contain significant +issues, it might be deactivated. Using a name to specify a dataset will yield +the earliest version of a dataset that is still active. That means that +``fetch_openml(name="miceprotein")`` can yield different results at different +times if earlier versions become inactive. +You can see that the dataset with ``data_id`` 40966 that we fetched above is +the version 1 of the "miceprotein" dataset:: + + >>> mice.details['version'] #doctest: +SKIP + '1' + +In fact, this dataset only has one version. The iris dataset on the other hand +has multiple versions:: + + >>> iris = fetch_openml(name="iris") + >>> iris.details['version'] #doctest: +SKIP + '1' + >>> iris.details['id'] #doctest: +SKIP + '61' + + >>> iris_61 = fetch_openml(data_id=61) + >>> iris_61.details['version'] + '1' + >>> iris_61.details['id'] + '61' + + >>> iris_969 = fetch_openml(data_id=969) + >>> iris_969.details['version'] + '3' + >>> iris_969.details['id'] + '969' + +Specifying the dataset by the name "iris" yields the lowest version, version 1, +with the ``data_id`` 61. To make sure you always get this exact dataset, it is +safest to specify it by the dataset ``data_id``. The other dataset, with +``data_id`` 969, is version 3 (version 2 has become inactive), and contains a +binarized version of the data:: + + >>> np.unique(iris_969.target) + array(['N', 'P'], dtype=object) + +You can also specify both the name and the version, which also uniquely +identifies the dataset:: + + >>> iris_version_3 = fetch_openml(name="iris", version=3) + >>> iris_version_3.details['version'] + '3' + >>> iris_version_3.details['id'] + '969' + + +.. topic:: References: + + * Vanschoren, van Rijn, Bischl and Torgo + `"OpenML: networked science in machine learning" + `_, + ACM SIGKDD Explorations Newsletter, 15(2), 49-60, 2014. 
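As a closing illustration of how a fetched Bunch plugs into an estimator, here is a minimal sketch (an editorial example, not part of the doctest suite; it assumes network access or a previously filled cache for the iris dataset with ``data_id`` 61 discussed above). Since categorical features are returned ordinal-encoded and missing values as NaN, a tree-based model is a convenient first choice::

    >>> from sklearn.datasets import fetch_openml                       # doctest: +SKIP
    >>> from sklearn.tree import DecisionTreeClassifier                 # doctest: +SKIP
    >>> iris = fetch_openml(data_id=61)                                 # doctest: +SKIP
    >>> clf = DecisionTreeClassifier().fit(iris.data, iris.target)      # doctest: +SKIP
    >>> clf.predict(iris.data[:3])                                      # doctest: +SKIP
    array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa'], dtype=object)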
diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 4888d9205d9f0..31f5e5ef840a7 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -79,6 +79,7 @@ link to it from your website, or simply star to say "I use it": * `joblib `__ * `sphinx-gallery `__ * `numpydoc `__ + * `liac-arff `__ and larger projects: diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 44ff2ff0aefa3..fb38eab8fde0d 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -259,6 +259,7 @@ Loaders datasets.fetch_lfw_people datasets.fetch_mldata datasets.fetch_olivetti_faces + datasets.fetch_openml datasets.fetch_rcv1 datasets.fetch_species_distributions datasets.get_data_home diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 90597c535accd..df346696734a0 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -178,6 +178,11 @@ Support for Python 3.3 has been officially dropped. :mod:`sklearn.datasets` ....................... +- |MajorFeature| Added :func:`datasets.fetch_openml` to fetch datasets from + `OpenML `. OpenML is a free, open data sharing platform + and will be used instead of mldata as it provides better service availability. + :issue:`9908` by `Andreas Müller`_ and :user:`Jan N. van Rijn `. + - |Feature| In :func:`datasets.make_blobs`, one can now pass a list to the `n_samples` parameter to indicate the number of samples to generate per cluster. :issue:`8617` by :user:`Maskani Filali Mohamed ` and @@ -204,7 +209,6 @@ Support for Python 3.3 has been officially dropped. data points could be generated. :issue:`10045` by :user:`Christian Braune `. - :mod:`sklearn.decomposition` ............................ diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py index c43c0c4758b10..c7d78e633493d 100644 --- a/sklearn/datasets/__init__.py +++ b/sklearn/datasets/__init__.py @@ -23,6 +23,7 @@ from .twenty_newsgroups import fetch_20newsgroups from .twenty_newsgroups import fetch_20newsgroups_vectorized from .mldata import fetch_mldata, mldata_filename +from .openml import fetch_openml from .samples_generator import make_classification from .samples_generator import make_multilabel_classification from .samples_generator import make_hastie_10_2 @@ -65,6 +66,7 @@ 'fetch_covtype', 'fetch_rcv1', 'fetch_kddcup99', + 'fetch_openml', 'get_data_home', 'load_boston', 'load_diabetes', diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py new file mode 100644 index 0000000000000..d7693ffd6fea9 --- /dev/null +++ b/sklearn/datasets/openml.py @@ -0,0 +1,557 @@ +import gzip +import json +import os +import shutil +from os.path import join +from warnings import warn + +try: + # Python 3+ + from urllib.request import urlopen +except ImportError: + # Python 2 + from urllib2 import urlopen + + +import numpy as np +import scipy.sparse + +from sklearn.externals import _arff +from .base import get_data_home +from ..externals.six import string_types, PY2 +from ..externals.six.moves.urllib.error import HTTPError +from ..utils import Bunch + +__all__ = ['fetch_openml'] + +_OPENML_PREFIX = "https://openml.org/" +_SEARCH_NAME = "api/v1/json/data/list/data_name/{}/limit/1" +_DATA_INFO = "api/v1/json/data/{}" +_DATA_FEATURES = "api/v1/json/data/features/{}" +_DATA_FILE = "data/v1/download/{}" + + +def _open_openml_url(openml_path, data_home): + """ + Returns a resource from OpenML.org. Caches it to data_home if required. 
+ + Parameters + ---------- + openml_path : str + OpenML URL that will be accessed. This will be prefixed with + _OPENML_PREFIX + + data_home : str + Directory to which the files will be cached. If None, no caching will + be applied. + + Returns + ------- + result : stream + A stream to the OpenML resource + """ + if data_home is None: + return urlopen(_OPENML_PREFIX + openml_path) + local_path = os.path.join(data_home, 'openml.org', openml_path + ".gz") + if not os.path.exists(local_path): + try: + os.makedirs(os.path.dirname(local_path)) + except OSError: + # potentially, the directory has been created already + pass + + try: + with gzip.GzipFile(local_path, 'wb') as fdst: + fsrc = urlopen(_OPENML_PREFIX + openml_path) + shutil.copyfileobj(fsrc, fdst) + fsrc.close() + except Exception: + os.unlink(local_path) + raise + # XXX: unnecessary decompression on first access + return gzip.GzipFile(local_path, 'rb') + + +def _get_json_content_from_openml_api(url, error_message, raise_if_error, + data_home): + """ + Loads json data from the openml api + + Parameters + ---------- + url : str + The URL to load from. Should be an official OpenML endpoint + + error_message : str or None + The error message to raise if an acceptable OpenML error is thrown + (acceptable error is, e.g., data id not found. Other errors, like 404's + will throw the native error message) + + raise_if_error : bool + Whether to raise an error if OpenML returns an acceptable error (e.g., + data not found). If this argument is set to False, a None is returned + in case of acceptable errors. Note that all other errors (e.g., 404) + will still be raised as normal. + + data_home : str or None + Location to cache the response. None if no cache is required. + + Returns + ------- + json_data : json or None + the json result from the OpenML server if the call was successful; + None otherwise iff raise_if_error was set to False and the error was + ``acceptable`` + """ + data_found = True + try: + response = _open_openml_url(url, data_home) + except HTTPError as error: + # 412 is an OpenML specific error code, indicating a generic error + # (e.g., data not found) + if error.code == 412: + data_found = False + else: + raise error + if not data_found: + # not in except for nicer traceback + if raise_if_error: + raise ValueError(error_message) + else: + return None + json_data = json.loads(response.read().decode("utf-8")) + response.close() + return json_data + + +def _split_sparse_columns(arff_data, include_columns): + """ + obtains several columns from sparse arff representation. Additionally, the + column indices are re-labelled, given the columns that are not included. + (e.g., when including [1, 2, 3], the columns will be relabelled to + [0, 1, 2]) + + Parameters + ---------- + arff_data : tuple + A tuple of three lists of equal size; first list indicating the value, + second the x coordinate and the third the y coordinate. + + include_columns : list + A list of columns to include. + + Returns + ------- + arff_data_new : tuple + Subset of arff data with only the include columns indicated by the + include_columns argument. 
+ """ + arff_data_new = (list(), list(), list()) + reindexed_columns = {column_idx: array_idx for array_idx, column_idx + in enumerate(include_columns)} + for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): + if col_idx in include_columns: + arff_data_new[0].append(val) + arff_data_new[1].append(row_idx) + arff_data_new[2].append(reindexed_columns[col_idx]) + return arff_data_new + + +def _sparse_data_to_array(arff_data, include_columns): + # turns the sparse data back into an array (can't use toarray() function, + # as this does only work on numeric data) + num_obs = max(arff_data[1]) + 1 + y_shape = (num_obs, len(include_columns)) + reindexed_columns = {column_idx: array_idx for array_idx, column_idx + in enumerate(include_columns)} + # TODO: improve for efficiency + y = np.empty(y_shape, dtype=np.float64) + for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): + if col_idx in include_columns: + y[row_idx, reindexed_columns[col_idx]] = val + return y + + +def _convert_arff_data(arff_data, col_slice_x, col_slice_y): + """ + converts the arff object into the appropriate matrix type (np.array or + scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the + liac-arff dict, the object from the 'data' key) + + Parameters + ---------- + arff_data : list or dict + as obtained from liac-arff object + + col_slice_x : list + The column indices that are sliced from the original array to return + as X data + + col_slice_y : list + The column indices that are sliced from the original array to return + as y data + + Returns + ------- + X : np.array or scipy.sparse.csr_matrix + y : np.array + """ + if isinstance(arff_data, list): + data = np.array(arff_data, dtype=np.float64) + X = np.array(data[:, col_slice_x], dtype=np.float64) + y = np.array(data[:, col_slice_y], dtype=np.float64) + return X, y + elif isinstance(arff_data, tuple): + arff_data_X = _split_sparse_columns(arff_data, col_slice_x) + num_obs = max(arff_data[1]) + 1 + X_shape = (num_obs, len(col_slice_x)) + X = scipy.sparse.coo_matrix( + (arff_data_X[0], (arff_data_X[1], arff_data_X[2])), + shape=X_shape, dtype=np.float64) + X = X.tocsr() + y = _sparse_data_to_array(arff_data, col_slice_y) + return X, y + else: + # This should never happen + raise ValueError('Unexpected Data Type obtained from arff.') + + +def _get_data_info_by_name(name, version, data_home): + """ + Utilizes the openml dataset listing api to find a dataset by + name/version + OpenML api function: + https://www.openml.org/api_docs#!/data/get_data_list_data_name_data_name + + Parameters + ---------- + name : str + name of the dataset + + version : int or str + If version is an integer, the exact name/version will be obtained from + OpenML. If version is a string (value: "active") it will take the first + version from OpenML that is annotated as active. Any other string + values except "active" are treated as integer. + + data_home : str or None + Location to cache the response. None if no cache is required. 
+ + Returns + ------- + first_dataset : json + json representation of the first dataset object that adhered to the + search criteria + + """ + if version == "active": + # situation in which we return the oldest active version + url = _SEARCH_NAME.format(name) + "/status/active/" + error_msg = "No active dataset {} found.".format(name) + json_data = _get_json_content_from_openml_api(url, error_msg, True, + data_home) + return json_data['data']['dataset'][0] + + # an integer version has been provided + url = (_SEARCH_NAME + "/data_version/{}").format(name, version) + json_data = _get_json_content_from_openml_api(url, None, False, + data_home) + if json_data is None: + # we can do this in 1 function call if OpenML does not require the + # specification of the dataset status (i.e., return datasets with a + # given name / version regardless of active, deactivated, etc. ) + # TODO: feature request OpenML. + url += "/status/deactivated" + error_msg = "Dataset {} with version {} not found.".format(name, + version) + json_data = _get_json_content_from_openml_api(url, error_msg, True, + data_home) + + return json_data['data']['dataset'][0] + + +def _get_data_description_by_id(data_id, data_home): + # OpenML API function: https://www.openml.org/api_docs#!/data/get_data_id + url = _DATA_INFO.format(data_id) + error_message = "Dataset with data_id {} not found.".format(data_id) + json_data = _get_json_content_from_openml_api(url, error_message, True, + data_home) + return json_data['data_set_description'] + + +def _get_data_features(data_id, data_home): + # OpenML function: + # https://www.openml.org/api_docs#!/data/get_data_features_id + url = _DATA_FEATURES.format(data_id) + error_message = "Dataset with data_id {} not found.".format(data_id) + json_data = _get_json_content_from_openml_api(url, error_message, True, + data_home) + return json_data['data_features']['feature'] + + +def _download_data_arff(file_id, sparse, data_home, encode_nominal=True): + # Accesses an ARFF file on the OpenML server. Documentation: + # https://www.openml.org/api_data_docs#!/data/get_download_id + # encode_nominal argument is to ensure unit testing, do not alter in + # production! 
+ url = _DATA_FILE.format(file_id) + response = _open_openml_url(url, data_home) + if sparse is True: + return_type = _arff.COO + else: + return_type = _arff.DENSE + + if PY2: + arff_file = _arff.load(response, encode_nominal=encode_nominal, + return_type=return_type, ) + else: + arff_file = _arff.loads(response.read().decode('utf-8'), + encode_nominal=encode_nominal, + return_type=return_type) + response.close() + return arff_file + + +def _verify_target_data_type(features_dict, target_columns): + # verifies the data type of the y array in case there are multiple targets + # (throws an error if these targets do not comply with sklearn support) + if not isinstance(target_columns, list): + raise ValueError('target_column should be list, ' + 'got: %s' % type(target_columns)) + found_types = set() + for target_column in target_columns: + if target_column not in features_dict: + raise KeyError('Could not find target_column={}'.format( + target_column)) + if features_dict[target_column]['data_type'] == "numeric": + found_types.add(np.float64) + else: + found_types.add(object) + + # note: we compare to a string, not boolean + if features_dict[target_column]['is_ignore'] == 'true': + warn('target_column={} has flag is_ignore.'.format( + target_column)) + if features_dict[target_column]['is_row_identifier'] == 'true': + warn('target_column={} has flag is_row_identifier.'.format( + target_column)) + if len(found_types) > 1: + raise ValueError('Can only handle homogeneous multi-target datasets, ' + 'i.e., all targets are either numeric or ' + 'categorical.') + + +def fetch_openml(name=None, version='active', data_id=None, data_home=None, + target_column='default-target', cache=True): + """Fetch dataset from openml by name or dataset id. + + Datasets are uniquely identified by either an integer ID or by a + combination of name and version (i.e. there might be multiple + versions of the 'iris' dataset). Please give either name or data_id + (not both). In case a name is given, a version can also be + provided. + + .. note:: EXPERIMENTAL + + The API is experimental in version 0.20 (particularly the return value + structure), and might have small backward-incompatible changes in + future releases. + + Parameters + ---------- + name : str or None + String identifier of the dataset. Note that OpenML can have multiple + datasets with the same name. + + version : integer or 'active', default='active' + Version of the dataset. Can only be provided if also ``name`` is given. + If 'active' the oldest version that's still active is used. + + data_id : int or None + OpenML ID of the dataset. The most specific way of retrieving a + dataset. If data_id is not given, name (and potential version) are + used to obtain a dataset. + + data_home : string or None, default None + Specify another download and cache folder for the data sets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + target_column : string, list or None, default 'default-target' + Specify the column name in the data to use as target. If + 'default-target', the standard target column as stored on the server + is used. If ``None``, all columns are returned as data and the + target is ``None``. If list (of strings), all columns with these names + are returned as multi-target (Note: not all scikit-learn classifiers + can handle all types of multi-output combinations) + + cache : boolean, default=True + Whether to cache downloaded datasets using joblib. 
+ + Returns + ------- + + data : Bunch + Dictionary-like object, with attributes: + + data : np.array or scipy.sparse.csr_matrix of floats + The feature matrix. Categorical features are encoded as ordinals. + target : np.array + The regression target or classification labels, if applicable. + Dtype is float if numeric, and object if categorical. + DESCR : str + The full description of the dataset + feature_names : list + The names of the dataset columns + categories : dict + Maps each categorical feature name to a list of values, such + that the value encoded as i is ith in the list. + details : dict + More metadata from OpenML + + .. note:: EXPERIMENTAL + + This interface is **experimental** as of version 0.20 and + subsequent releases may change attributes without notice + (although there should only be minor changes to ``data`` + and ``target``). + + Missing values in the 'data' are represented as NaN's. Missing values + in 'target' are represented as NaN's (numerical target) or None + (categorical target) + """ + data_home = get_data_home(data_home=data_home) + data_home = join(data_home, 'openml') + if cache is False: + # no caching will be applied + data_home = None + + # check valid function arguments. data_id XOR (name, version) should be + # provided + if name is not None: + # OpenML is case-insensitive, but the caching mechanism is not + # convert all data names (str) to lower case + name = name.lower() + if data_id is not None: + raise ValueError( + "Dataset data_id={} and name={} passed, but you can only " + "specify a numeric data_id or a name, not " + "both.".format(data_id, name)) + data_info = _get_data_info_by_name(name, version, data_home) + data_id = data_info['did'] + elif data_id is not None: + # from the previous if statement, it is given that name is None + if version != "active": + raise ValueError( + "Dataset data_id={} and version={} passed, but you can only " + "specify a numeric data_id or a version, not " + "both.".format(data_id, version)) + else: + raise ValueError( + "Neither name nor data_id are provided. Please provide name or " + "data_id.") + + data_description = _get_data_description_by_id(data_id, data_home) + if data_description['status'] != "active": + warn("Version {} of dataset {} is inactive, meaning that issues have " + "been found in the dataset. Try using a newer version from " + "this URL: {}".format( + data_description['version'], + data_description['name'], + data_description['url'])) + + # download data features, meta-info about column types + features_list = _get_data_features(data_id, data_home) + + for feature in features_list: + if 'true' in (feature['is_ignore'], feature['is_row_identifier']): + continue + if feature['data_type'] == 'string': + raise ValueError('STRING attributes are not yet supported') + + if target_column == "default-target": + # determines the default target based on the data feature results + # (which is currently more reliable than the data description; + # see issue: https://github.com/openml/OpenML/issues/768) + target_column = [feature['name'] for feature in features_list + if feature['is_target'] == 'true'] + elif isinstance(target_column, string_types): + # for code-simplicity, make target_column by default a list + target_column = [target_column] + elif target_column is None: + target_column = [] + elif not isinstance(target_column, list): + raise TypeError("Did not recognize type of target_column. " + "Should be six.string_type, list or None. 
Got: " + "{}".format(type(target_column))) + data_columns = [feature['name'] for feature in features_list + if (feature['name'] not in target_column and + feature['is_ignore'] != 'true' and + feature['is_row_identifier'] != 'true')] + + # prepare which columns and data types should be returned for the X and y + features_dict = {feature['name']: feature for feature in features_list} + + # XXX: col_slice_y should be all nominal or all numeric + _verify_target_data_type(features_dict, target_column) + + col_slice_y = [int(features_dict[col_name]['index']) + for col_name in target_column] + + col_slice_x = [int(features_dict[col_name]['index']) + for col_name in data_columns] + for col_idx in col_slice_y: + feat = features_list[col_idx] + nr_missing = int(feat['number_of_missing_values']) + if nr_missing > 0: + raise ValueError('Target column {} has {} missing values. ' + 'Missing values are not supported for target ' + 'columns. '.format(feat['name'], nr_missing)) + + # determine arff encoding to return + return_sparse = False + if data_description['format'].lower() == 'sparse_arff': + return_sparse = True + + # obtain the data + arff = _download_data_arff(data_description['file_id'], return_sparse, + data_home) + arff_data = arff['data'] + nominal_attributes = {k: v for k, v in arff['attributes'] + if isinstance(v, list)} + for feature in features_list: + if 'true' in (feature['is_row_identifier'], + feature['is_ignore']) and (feature['name'] not in + target_column): + del nominal_attributes[feature['name']] + X, y = _convert_arff_data(arff_data, col_slice_x, col_slice_y) + + is_classification = {col_name in nominal_attributes + for col_name in target_column} + if not is_classification: + # No target + pass + elif all(is_classification): + y = np.hstack([np.take(np.asarray(nominal_attributes.pop(col_name), + dtype='O'), + y[:, i:i+1].astype(int)) + for i, col_name in enumerate(target_column)]) + elif any(is_classification): + raise ValueError('Mix of nominal and non-nominal targets is not ' + 'currently supported') + + description = u"{}\n\nDownloaded from openml.org.".format( + data_description.pop('description')) + + # reshape y back to 1-D array, if there is only 1 target column; back + # to None if there are not target columns + if y.shape[1] == 1: + y = y.reshape((-1,)) + elif y.shape[1] == 0: + y = None + + bunch = Bunch( + data=X, target=y, feature_names=data_columns, + DESCR=description, details=data_description, + categories=nominal_attributes, + url="https://www.openml.org/d/{}".format(data_id)) + + return bunch diff --git a/sklearn/datasets/tests/data/openml/2/data.arff.gz b/sklearn/datasets/tests/data/openml/2/data.arff.gz new file mode 100644 index 0000000000000..cdf3254add760 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/2/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/2/data_description.json.gz b/sklearn/datasets/tests/data/openml/2/data_description.json.gz new file mode 100644 index 0000000000000..22dfb6ff61c1b Binary files /dev/null and b/sklearn/datasets/tests/data/openml/2/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/2/data_features.json.gz b/sklearn/datasets/tests/data/openml/2/data_features.json.gz new file mode 100644 index 0000000000000..cb3d2750095bd Binary files /dev/null and b/sklearn/datasets/tests/data/openml/2/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/2/data_list__anneal_1_active.json.gz 
b/sklearn/datasets/tests/data/openml/2/data_list__anneal_1_active.json.gz new file mode 100644 index 0000000000000..d19e4a633740c Binary files /dev/null and b/sklearn/datasets/tests/data/openml/2/data_list__anneal_1_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/2/data_list__anneal_None_active.json.gz b/sklearn/datasets/tests/data/openml/2/data_list__anneal_None_active.json.gz new file mode 100644 index 0000000000000..ee94aa32a3fce Binary files /dev/null and b/sklearn/datasets/tests/data/openml/2/data_list__anneal_None_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/292/data.arff.gz b/sklearn/datasets/tests/data/openml/292/data.arff.gz new file mode 100644 index 0000000000000..6821829e1e43a Binary files /dev/null and b/sklearn/datasets/tests/data/openml/292/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/292/data_description.json.gz b/sklearn/datasets/tests/data/openml/292/data_description.json.gz new file mode 100644 index 0000000000000..888140f92b360 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/292/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/292/data_features.json.gz b/sklearn/datasets/tests/data/openml/292/data_features.json.gz new file mode 100644 index 0000000000000..29016cc36bab6 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/292/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/292/data_list__australian_1_active.json.gz b/sklearn/datasets/tests/data/openml/292/data_list__australian_1_active.json.gz new file mode 100644 index 0000000000000..0e2c4395f1c23 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/292/data_list__australian_1_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/292/data_list__australian_1_deactivated.json.gz b/sklearn/datasets/tests/data/openml/292/data_list__australian_1_deactivated.json.gz new file mode 100644 index 0000000000000..5ee200d7c056b Binary files /dev/null and b/sklearn/datasets/tests/data/openml/292/data_list__australian_1_deactivated.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/292/data_list__australian_None_active.json.gz b/sklearn/datasets/tests/data/openml/292/data_list__australian_None_active.json.gz new file mode 100644 index 0000000000000..08d319727fb52 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/292/data_list__australian_None_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40589/data.arff.gz b/sklearn/datasets/tests/data/openml/40589/data.arff.gz new file mode 100644 index 0000000000000..96ed11d969557 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40589/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/40589/data_description.json.gz b/sklearn/datasets/tests/data/openml/40589/data_description.json.gz new file mode 100644 index 0000000000000..9c71553ce5137 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40589/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40589/data_features.json.gz b/sklearn/datasets/tests/data/openml/40589/data_features.json.gz new file mode 100644 index 0000000000000..155460906a7b7 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40589/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40589/data_list__emotions_3_active.json.gz b/sklearn/datasets/tests/data/openml/40589/data_list__emotions_3_active.json.gz new file mode 100644 index 
0000000000000..c5d35604ce956 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40589/data_list__emotions_3_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40589/data_list__emotions_None_active.json.gz b/sklearn/datasets/tests/data/openml/40589/data_list__emotions_None_active.json.gz new file mode 100644 index 0000000000000..5a27505c5d2f4 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40589/data_list__emotions_None_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40675/data.arff.gz b/sklearn/datasets/tests/data/openml/40675/data.arff.gz new file mode 100644 index 0000000000000..c59c3b769e114 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40675/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/40675/data_description.json.gz b/sklearn/datasets/tests/data/openml/40675/data_description.json.gz new file mode 100644 index 0000000000000..42b876f0a4723 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40675/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40675/data_features.json.gz b/sklearn/datasets/tests/data/openml/40675/data_features.json.gz new file mode 100644 index 0000000000000..2d5c6f8a302e0 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40675/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40675/data_list__glass2_1_active.json.gz b/sklearn/datasets/tests/data/openml/40675/data_list__glass2_1_active.json.gz new file mode 100644 index 0000000000000..df1665b1db714 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40675/data_list__glass2_1_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40675/data_list__glass2_1_deactivated.json.gz b/sklearn/datasets/tests/data/openml/40675/data_list__glass2_1_deactivated.json.gz new file mode 100644 index 0000000000000..2f48ea985b4c9 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40675/data_list__glass2_1_deactivated.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40675/data_list__glass2_None_active.json.gz b/sklearn/datasets/tests/data/openml/40675/data_list__glass2_None_active.json.gz new file mode 100644 index 0000000000000..ff46d678f6457 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40675/data_list__glass2_None_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40945/data.arff.gz b/sklearn/datasets/tests/data/openml/40945/data.arff.gz new file mode 100644 index 0000000000000..ef170f47e63dd Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40945/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/40945/data_description.json.gz b/sklearn/datasets/tests/data/openml/40945/data_description.json.gz new file mode 100644 index 0000000000000..aaafa4a2def68 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40945/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40945/data_features.json.gz b/sklearn/datasets/tests/data/openml/40945/data_features.json.gz new file mode 100644 index 0000000000000..24cb46957f27d Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40945/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40945/data_list__titanic_1_active.json.gz b/sklearn/datasets/tests/data/openml/40945/data_list__titanic_1_active.json.gz new file mode 100644 index 0000000000000..b2310138197d8 Binary files /dev/null and 
b/sklearn/datasets/tests/data/openml/40945/data_list__titanic_1_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40966/data.arff.gz b/sklearn/datasets/tests/data/openml/40966/data.arff.gz new file mode 100644 index 0000000000000..43ec977bf67ac Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40966/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/40966/data_description.json.gz b/sklearn/datasets/tests/data/openml/40966/data_description.json.gz new file mode 100644 index 0000000000000..02b25d717f925 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40966/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40966/data_features.json.gz b/sklearn/datasets/tests/data/openml/40966/data_features.json.gz new file mode 100644 index 0000000000000..a372f9a7be751 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40966/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40966/data_list__miceprotein_4_active.json.gz b/sklearn/datasets/tests/data/openml/40966/data_list__miceprotein_4_active.json.gz new file mode 100644 index 0000000000000..6c292b4196d06 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40966/data_list__miceprotein_4_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40966/data_list__miceprotein_None_active.json.gz b/sklearn/datasets/tests/data/openml/40966/data_list__miceprotein_None_active.json.gz new file mode 100644 index 0000000000000..712545f0b72d5 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40966/data_list__miceprotein_None_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/561/data.arff.gz b/sklearn/datasets/tests/data/openml/561/data.arff.gz new file mode 100644 index 0000000000000..eeb088c224a01 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/561/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/561/data_description.json.gz b/sklearn/datasets/tests/data/openml/561/data_description.json.gz new file mode 100644 index 0000000000000..e4df6060ca0b8 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/561/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/561/data_features.json.gz b/sklearn/datasets/tests/data/openml/561/data_features.json.gz new file mode 100644 index 0000000000000..54a3ab6a7a976 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/561/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/561/data_list__cpu_1_active.json.gz b/sklearn/datasets/tests/data/openml/561/data_list__cpu_1_active.json.gz new file mode 100644 index 0000000000000..bb0f8103b8079 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/561/data_list__cpu_1_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/561/data_list__cpu_None_active.json.gz b/sklearn/datasets/tests/data/openml/561/data_list__cpu_None_active.json.gz new file mode 100644 index 0000000000000..4436afa6bc760 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/561/data_list__cpu_None_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/61/data.arff.gz b/sklearn/datasets/tests/data/openml/61/data.arff.gz new file mode 100644 index 0000000000000..b05dadf99fb0e Binary files /dev/null and b/sklearn/datasets/tests/data/openml/61/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/61/data_description.json.gz 
b/sklearn/datasets/tests/data/openml/61/data_description.json.gz new file mode 100644 index 0000000000000..83c3ececcfab9 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/61/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/61/data_features.json.gz b/sklearn/datasets/tests/data/openml/61/data_features.json.gz new file mode 100644 index 0000000000000..6df4cf0dadbf9 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/61/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/61/data_list__iris_1_active.json.gz b/sklearn/datasets/tests/data/openml/61/data_list__iris_1_active.json.gz new file mode 100644 index 0000000000000..6dd5e202aeccc Binary files /dev/null and b/sklearn/datasets/tests/data/openml/61/data_list__iris_1_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/61/data_list__iris_None_active.json.gz b/sklearn/datasets/tests/data/openml/61/data_list__iris_None_active.json.gz new file mode 100644 index 0000000000000..b1824cde71fe2 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/61/data_list__iris_None_active.json.gz differ diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py new file mode 100644 index 0000000000000..7f978ef0595c6 --- /dev/null +++ b/sklearn/datasets/tests/test_openml.py @@ -0,0 +1,517 @@ +"""Test the openml loader. +""" +import gzip +import json +import numpy as np +import os +import scipy.sparse +import sklearn + +from sklearn.datasets import fetch_openml +from sklearn.datasets.openml import (_open_openml_url, + _get_data_description_by_id, + _download_data_arff) +from sklearn.utils.testing import (assert_warns_message, + assert_raise_message) +from sklearn.externals.six import string_types +from sklearn.externals.six.moves.urllib.error import HTTPError + + +currdir = os.path.dirname(os.path.abspath(__file__)) +# if True, urlopen will be monkey patched to only use local files +test_offline = True +test_gzip = True + + +def _test_features_list(data_id): + # XXX Test is intended to verify/ensure correct decoding behavior + # Not usable with sparse data or datasets that have columns marked as + # {row_identifier, ignore} + def decode_column(data_bunch, col_idx): + col_name = data_bunch.feature_names[col_idx] + if col_name in data_bunch.categories: + # XXX: This would be faster with np.take, although it does not + # handle missing values fast (also not with mode='wrap') + cat = data_bunch.categories[col_name] + result = [cat[idx] if 0 <= idx < len(cat) else None for idx in + data_bunch.data[:, col_idx].astype(int)] + return np.array(result, dtype='O') + else: + # non-nominal attribute + return data_bunch.data[:, col_idx] + + data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None) + + # also obtain decoded arff + data_description = _get_data_description_by_id(data_id, None) + sparse = data_description['format'].lower() == 'sparse_arff' + if sparse is True: + raise ValueError('This test is not intended for sparse data, to keep ' + 'code relatively simple') + data_arff = _download_data_arff(data_description['file_id'], + sparse, None, False) + data_downloaded = np.array(data_arff['data'], dtype='O') + + for i in range(len(data_bunch.feature_names)): + # XXX: Test per column, as this makes it easier to avoid problems with + # missing values + + np.testing.assert_array_equal(data_downloaded[:, i], + decode_column(data_bunch, i)) + + +def _fetch_dataset_from_openml(data_id, data_name, data_version, + 
target_column, + expected_observations, expected_features, + expected_missing, + expected_data_dtype, expected_target_dtype, + expect_sparse, compare_default_target): + # fetches a dataset in three various ways from OpenML, using the + # fetch_openml function, and does various checks on the validity of the + # result. Note that this function can be mocked (by invoking + # _monkey_patch_webbased_functions before invoking this function) + data_by_name_id = fetch_openml(name=data_name, version=data_version, + cache=False) + assert int(data_by_name_id.details['id']) == data_id + + fetch_openml(name=data_name, cache=False) + # without specifying the version, there is no guarantee that the data id + # will be the same + + # fetch with dataset id + data_by_id = fetch_openml(data_id=data_id, cache=False, + target_column=target_column) + assert data_by_id.details['name'] == data_name + assert data_by_id.data.shape == (expected_observations, expected_features) + if isinstance(target_column, str): + # single target, so target is vector + assert data_by_id.target.shape == (expected_observations, ) + elif isinstance(target_column, list): + # multi target, so target is array + assert data_by_id.target.shape == (expected_observations, + len(target_column)) + assert data_by_id.data.dtype == np.float64 + assert data_by_id.target.dtype == expected_target_dtype + assert len(data_by_id.feature_names) == expected_features + for feature in data_by_id.feature_names: + assert isinstance(feature, string_types) + + # TODO: pass in a list of expected nominal features + for feature, categories in data_by_id.categories.items(): + feature_idx = data_by_id.feature_names.index(feature) + values = np.unique(data_by_id.data[:, feature_idx]) + values = values[np.isfinite(values)] + assert set(values) <= set(range(len(categories))) + + if compare_default_target: + # check whether the data by id and data by id target are equal + data_by_id_default = fetch_openml(data_id=data_id, cache=False) + if data_by_id.data.dtype == np.float64: + np.testing.assert_allclose(data_by_id.data, + data_by_id_default.data) + else: + assert np.array_equal(data_by_id.data, data_by_id_default.data) + if data_by_id.target.dtype == np.float64: + np.testing.assert_allclose(data_by_id.target, + data_by_id_default.target) + else: + assert np.array_equal(data_by_id.target, data_by_id_default.target) + + if expect_sparse: + assert isinstance(data_by_id.data, scipy.sparse.csr_matrix) + else: + assert isinstance(data_by_id.data, np.ndarray) + # np.isnan doesn't work on CSR matrix + assert (np.count_nonzero(np.isnan(data_by_id.data)) == + expected_missing) + return data_by_id + + +def _monkey_patch_webbased_functions(context, data_id, gziped_files): + url_prefix_data_description = "https://openml.org/api/v1/json/data/" + url_prefix_data_features = "https://openml.org/api/v1/json/data/features/" + url_prefix_download_data = "https://openml.org/data/v1/" + url_prefix_data_list = "https://openml.org/api/v1/json/data/list/" + + path_suffix = '' + read_fn = open + if gziped_files: + path_suffix = '.gz' + read_fn = gzip.open + + def _mock_urlopen_data_description(url): + assert url.startswith(url_prefix_data_description) + + path = os.path.join(currdir, 'data', 'openml', str(data_id), + 'data_description.json%s' % path_suffix) + return read_fn(path, 'rb') + + def _mock_urlopen_data_features(url): + assert url.startswith(url_prefix_data_features) + + path = os.path.join(currdir, 'data', 'openml', str(data_id), + 'data_features.json%s' % path_suffix) + return 
read_fn(path, 'rb') + + def _mock_urlopen_download_data(url): + assert (url.startswith(url_prefix_download_data)) + + path = os.path.join(currdir, 'data', 'openml', str(data_id), + 'data.arff%s' % path_suffix) + return read_fn(path, 'rb') + + def _mock_urlopen_data_list(url): + # url contains key value pairs of attributes, e.g., + # openml.org/api/v1/json/data_name/iris/data_version/1 should + # ideally become {data_name: 'iris', data_version: '1'} + assert url.startswith(url_prefix_data_list) + att_list = url[len(url_prefix_data_list):].split('/') + key_val_dict = dict(zip(att_list[::2], att_list[1::2])) + # add defaults, so we can make assumptions about the content + if 'data_version' not in key_val_dict: + key_val_dict['data_version'] = None + if 'status' not in key_val_dict: + key_val_dict['status'] = "active" + mock_file = "data_list__%s_%s_%s.json%s" % \ + (key_val_dict['data_name'], key_val_dict['data_version'], + key_val_dict['status'], path_suffix) + json_file_path = os.path.join(currdir, 'data', 'openml', + str(data_id), mock_file) + # load the file itself, to simulate a http error + json_data = json.loads(read_fn(json_file_path, 'rb'). + read().decode('utf-8')) + if 'error' in json_data: + raise HTTPError(url=None, code=412, + msg='Simulated mock error', + hdrs=None, fp=None) + return read_fn(json_file_path, 'rb') + + def _mock_urlopen(url): + if url.startswith(url_prefix_data_list): + return _mock_urlopen_data_list(url) + elif url.startswith(url_prefix_data_features): + return _mock_urlopen_data_features(url) + elif url.startswith(url_prefix_download_data): + return _mock_urlopen_download_data(url) + elif url.startswith(url_prefix_data_description): + return _mock_urlopen_data_description(url) + else: + raise ValueError('Unknown mocking URL pattern: %s' % url) + + # XXX: Global variable + if test_offline: + context.setattr(sklearn.datasets.openml, 'urlopen', _mock_urlopen) + + +def test_fetch_openml_iris(monkeypatch): + # classification dataset with numeric only columns + data_id = 61 + data_name = 'iris' + data_version = 1 + target_column = 'class' + expected_observations = 150 + expected_features = 4 + expected_missing = 0 + + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, + expected_observations, expected_features, + expected_missing, + np.float64, object, expect_sparse=False, + compare_default_target=True) + + +def test_decode_iris(): + data_id = 61 + _test_features_list(data_id) + + +def test_fetch_openml_iris_multitarget(monkeypatch): + # classification dataset with numeric only columns + data_id = 61 + data_name = 'iris' + data_version = 1 + target_column = ['sepallength', 'sepalwidth'] + expected_observations = 150 + expected_features = 3 + expected_missing = 0 + + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, + expected_observations, expected_features, + expected_missing, + object, np.float64, expect_sparse=False, + compare_default_target=False) + + +def test_fetch_openml_anneal(monkeypatch): + # classification dataset with numeric and categorical columns + data_id = 2 + data_name = 'anneal' + data_version = 1 + target_column = 'class' + # Not all original instances included for space reasons + expected_observations = 11 + expected_features = 38 + expected_missing = 267 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + _fetch_dataset_from_openml(data_id, 
data_name, data_version, target_column, + expected_observations, expected_features, + expected_missing, + object, object, expect_sparse=False, + compare_default_target=True) + + +def test_decode_anneal(): + data_id = 2 + _test_features_list(data_id) + + +def test_fetch_openml_anneal_multitarget(monkeypatch): + # classification dataset with numeric and categorical columns + data_id = 2 + data_name = 'anneal' + data_version = 1 + target_column = ['class', 'product-type', 'shape'] + # Not all original instances included for space reasons + expected_observations = 11 + expected_features = 36 + expected_missing = 267 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, + expected_observations, expected_features, + expected_missing, + object, object, expect_sparse=False, + compare_default_target=False) + + +def test_fetch_openml_cpu(monkeypatch): + # regression dataset with numeric and categorical columns + data_id = 561 + data_name = 'cpu' + data_version = 1 + target_column = 'class' + expected_observations = 209 + expected_features = 7 + expected_missing = 0 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, + expected_observations, expected_features, + expected_missing, + object, np.float64, expect_sparse=False, + compare_default_target=True) + + +def test_decode_cpu(): + data_id = 561 + _test_features_list(data_id) + + +def test_fetch_openml_australian(monkeypatch): + # sparse dataset + # Australian is the only sparse dataset that is reasonably small + # as it is inactive, we need to catch the warning. Due to mocking + # framework, it is not deactivated in our tests + data_id = 292 + data_name = 'Australian' + data_version = 1 + target_column = 'Y' + # Not all original instances included for space reasons + expected_observations = 85 + expected_features = 14 + expected_missing = 0 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + assert_warns_message( + UserWarning, + "Version 1 of dataset Australian is inactive,", + _fetch_dataset_from_openml, + **{'data_id': data_id, 'data_name': data_name, + 'data_version': data_version, + 'target_column': target_column, + 'expected_observations': expected_observations, + 'expected_features': expected_features, + 'expected_missing': expected_missing, + 'expect_sparse': True, + 'expected_data_dtype': np.float64, + 'expected_target_dtype': object, + 'compare_default_target': False} # numpy specific check + ) + + +def test_fetch_openml_miceprotein(monkeypatch): + # JvR: very important check, as this dataset defined several row ids + # and ignore attributes. 
Note that data_features json has 82 attributes, + # and row id (1), ignore attributes (3) have been removed (and target is + # stored in data.target) + data_id = 40966 + data_name = 'MiceProtein' + data_version = 4 + target_column = 'class' + # Not all original instances included for space reasons + expected_observations = 7 + expected_features = 77 + expected_missing = 7 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, + expected_observations, expected_features, + expected_missing, + np.float64, object, expect_sparse=False, + compare_default_target=True) + + +def test_fetch_openml_emotions(monkeypatch): + # classification dataset with multiple targets (natively) + data_id = 40589 + data_name = 'emotions' + data_version = 3 + target_column = ['amazed.suprised', 'happy.pleased', 'relaxing.calm', + 'quiet.still', 'sad.lonely', 'angry.aggresive'] + expected_observations = 13 + expected_features = 72 + expected_missing = 0 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + + _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, + expected_observations, expected_features, + expected_missing, + np.float64, object, expect_sparse=False, + compare_default_target=True) + + +def test_decode_emotions(): + data_id = 40589 + _test_features_list(data_id) + + +def test_open_openml_url_cache(monkeypatch): + data_id = 61 + + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id) + test_directory = os.path.join(os.path.expanduser('~'), 'scikit_learn_data') + # first fill the cache + response1 = _open_openml_url(openml_path, test_directory) + # assert file exists + location = os.path.join(test_directory, 'openml.org', openml_path + '.gz') + assert os.path.isfile(location) + # redownload, to utilize cache + response2 = _open_openml_url(openml_path, test_directory) + assert response1.read() == response2.read() + + +def test_fetch_openml_notarget(monkeypatch): + data_id = 61 + target_column = None + expected_observations = 150 + expected_features = 5 + + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + data = fetch_openml(data_id=data_id, target_column=target_column, + cache=False) + assert data.data.shape == (expected_observations, expected_features) + assert data.target is None + + +def test_fetch_openml_inactive(monkeypatch): + # fetch inactive dataset by id + data_id = 40675 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + glas2 = assert_warns_message( + UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, + data_id=data_id, cache=False) + # fetch inactive dataset by name and version + assert glas2.data.shape == (163, 9) + glas2_by_version = assert_warns_message( + UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, + data_id=None, name="glass2", version=1, cache=False) + assert int(glas2_by_version.details['id']) == data_id + + +def test_fetch_nonexiting(monkeypatch): + # there is no active version of glass2 + data_id = 40675 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + # Note that we only want to search by name (not data id) + assert_raise_message(ValueError, "No active dataset glass2 found", + fetch_openml, name='glass2', cache=False) + + +def test_raises_illegal_multitarget(monkeypatch): + data_id = 61 + targets = ['sepalwidth', 'class'] + _monkey_patch_webbased_functions(monkeypatch, 
data_id, test_gzip) + # Note that we only want to search by name (not data id) + assert_raise_message(ValueError, + "Can only handle homogeneous multi-target datasets,", + fetch_openml, data_id=data_id, + target_column=targets, cache=False) + + +def test_warn_ignore_attribute(monkeypatch): + data_id = 40966 + expected_row_id_msg = "target_column={} has flag is_row_identifier." + expected_ignore_msg = "target_column={} has flag is_ignore." + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + # single column test + assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), + fetch_openml, data_id=data_id, + target_column='MouseID', + cache=False) + assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), + fetch_openml, data_id=data_id, + target_column='Genotype', + cache=False) + # multi column test + assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), + fetch_openml, data_id=data_id, + target_column=['MouseID', 'class'], + cache=False) + assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), + fetch_openml, data_id=data_id, + target_column=['Genotype', 'class'], + cache=False) + + +def test_string_attribute(monkeypatch): + data_id = 40945 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + # single column test + assert_raise_message(ValueError, + 'STRING attributes are not yet supported', + fetch_openml, data_id=data_id, cache=False) + + +def test_illegal_column(monkeypatch): + data_id = 61 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + assert_raise_message(KeyError, "Could not find target_column=", + fetch_openml, data_id=data_id, + target_column='undefined', cache=False) + + assert_raise_message(KeyError, "Could not find target_column=", + fetch_openml, data_id=data_id, + target_column=['undefined', 'class'], + cache=False) + + +def test_fetch_openml_raises_missing_values_target(monkeypatch): + data_id = 2 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + assert_raise_message(ValueError, "Target column ", + fetch_openml, data_id=data_id, target_column='family') + + +def test_fetch_openml_raises_illegal_argument(): + assert_raise_message(ValueError, "Dataset data_id=", + fetch_openml, data_id=-1, name="name") + + assert_raise_message(ValueError, "Dataset data_id=", + fetch_openml, data_id=-1, name=None, + version="version") + + assert_raise_message(ValueError, "Dataset data_id=", + fetch_openml, data_id=-1, name="name", + version="version") + + assert_raise_message(ValueError, "Neither name nor data_id are provided. 
" + "Please provide name or data_id.", fetch_openml) diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py new file mode 100644 index 0000000000000..6225dfc3691c3 --- /dev/null +++ b/sklearn/externals/_arff.py @@ -0,0 +1,1059 @@ +# -*- coding: utf-8 -*- +# ============================================================================= +# Federal University of Rio Grande do Sul (UFRGS) +# Connectionist Artificial Intelligence Laboratory (LIAC) +# Renato de Pontes Pereira - rppereira@inf.ufrgs.br +# ============================================================================= +# Copyright (c) 2011 Renato de Pontes Pereira, renato.ppontes at gmail dot com +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================= + +''' +The liac-arff module implements functions to read and write ARFF files in +Python. It was created in the Connectionist Artificial Intelligence Laboratory +(LIAC), which takes place at the Federal University of Rio Grande do Sul +(UFRGS), in Brazil. + +ARFF (Attribute-Relation File Format) is an file format specially created for +describe datasets which are commonly used for machine learning experiments and +softwares. This file format was created to be used in Weka, the best +representative software for machine learning automated experiments. + +An ARFF file can be divided into two sections: header and data. The Header +describes the metadata of the dataset, including a general description of the +dataset, its name and its attributes. The source below is an example of a +header section in a XOR dataset:: + + % + % XOR Dataset + % + % Created by Renato Pereira + % rppereira@inf.ufrgs.br + % http://inf.ufrgs.br/~rppereira + % + % + @RELATION XOR + + @ATTRIBUTE input1 REAL + @ATTRIBUTE input2 REAL + @ATTRIBUTE y REAL + +The Data section of an ARFF file describes the observations of the dataset, in +the case of XOR dataset:: + + @DATA + 0.0,0.0,0.0 + 0.0,1.0,1.0 + 1.0,0.0,1.0 + 1.0,1.0,0.0 + % + % + % + +Notice that several lines are starting with an ``%`` symbol, denoting a +comment, thus, lines with ``%`` at the beginning will be ignored, except by the +description part at the beginning of the file. The declarations ``@RELATION``, +``@ATTRIBUTE``, and ``@DATA`` are all case insensitive and obligatory. 
+ +For more information and details about the ARFF file description, consult +http://www.cs.waikato.ac.nz/~ml/weka/arff.html + + +ARFF Files in Python +~~~~~~~~~~~~~~~~~~~~ + +This module uses built-ins python objects to represent a deserialized ARFF +file. A dictionary is used as the container of the data and metadata of ARFF, +and have the following keys: + +- **description**: (OPTIONAL) a string with the description of the dataset. +- **relation**: (OBLIGATORY) a string with the name of the dataset. +- **attributes**: (OBLIGATORY) a list of attributes with the following + template:: + + (attribute_name, attribute_type) + + the attribute_name is a string, and attribute_type must be an string + or a list of strings. +- **data**: (OBLIGATORY) a list of data instances. Each data instance must be + a list with values, depending on the attributes. + +The above keys must follow the case which were described, i.e., the keys are +case sensitive. The attribute type ``attribute_type`` must be one of these +strings (they are not case sensitive): ``NUMERIC``, ``INTEGER``, ``REAL`` or +``STRING``. For nominal attributes, the ``atribute_type`` must be a list of +strings. + +In this format, the XOR dataset presented above can be represented as a python +object as:: + + xor_dataset = { + 'description': 'XOR Dataset', + 'relation': 'XOR', + 'attributes': [ + ('input1', 'REAL'), + ('input2', 'REAL'), + ('y', 'REAL'), + ], + 'data': [ + [0.0, 0.0, 0.0], + [0.0, 1.0, 1.0], + [1.0, 0.0, 1.0], + [1.0, 1.0, 0.0] + ] + } + + +Features +~~~~~~~~ + +This module provides several features, including: + +- Read and write ARFF files using python built-in structures, such dictionaries + and lists; +- Supports `scipy.sparse.coo `_ + and lists of dictionaries as used by SVMLight +- Supports the following attribute types: NUMERIC, REAL, INTEGER, STRING, and + NOMINAL; +- Has an interface similar to other built-in modules such as ``json``, or + ``zipfile``; +- Supports read and write the descriptions of files; +- Supports missing values and names with spaces; +- Supports unicode values and names; +- Fully compatible with Python 2.7+, Python 3.3+, pypy and pypy3; +- Under `MIT License `_ + +''' +__author__ = 'Renato de Pontes Pereira, Matthias Feurer, Joel Nothman' +__author_email__ = ('renato.ppontes@gmail.com, ' + 'feurerm@informatik.uni-freiburg.de, ' + 'joel.nothman@gmail.com') +__version__ = '2.3' + +import re +import sys +import csv + +# CONSTANTS =================================================================== +_SIMPLE_TYPES = ['NUMERIC', 'REAL', 'INTEGER', 'STRING'] + +_TK_DESCRIPTION = '%' +_TK_COMMENT = '%' +_TK_RELATION = '@RELATION' +_TK_ATTRIBUTE = '@ATTRIBUTE' +_TK_DATA = '@DATA' + +_RE_RELATION = re.compile(r'^([^\{\}%,\s]*|\".*\"|\'.*\')$', re.UNICODE) +_RE_ATTRIBUTE = re.compile(r'^(\".*\"|\'.*\'|[^\{\}%,\s]*)\s+(.+)$', re.UNICODE) +_RE_TYPE_NOMINAL = re.compile(r'^\{\s*((\".*\"|\'.*\'|\S*)\s*,\s*)*(\".*\"|\'.*\'|\S*)\s*\}$', re.UNICODE) +_RE_QUOTE_CHARS = re.compile(r'["\'\\ \t%,]') +_RE_ESCAPE_CHARS = re.compile(r'(?=["\'\\%])') # don't need to capture anything +_RE_SPARSE_LINE = re.compile(r'^\{.*\}$') +_RE_NONTRIVIAL_DATA = re.compile('["\'{}\\s]') + + +def _build_re_values(): + quoted_re = r'''(?x) + " # open quote followed by zero or more of: + (?: + (?= len(conversors): + raise BadDataFormat(s) + # XXX: int 0 is used for implicit values, not '0' + values = [values[i] if i in values else 0 for i in + xrange(len(conversors))] + else: + if len(values) != len(conversors): + raise BadDataFormat(s) + + 
self.data.append(self._decode_values(values, conversors)) + + @staticmethod + def _decode_values(values, conversors): + try: + values = [None if value is None else conversor(value) + for conversor, value + in zip(conversors, values)] + except ValueError as exc: + if 'float: ' in str(exc): + raise BadNumericalValue() + return values + + def _tuplify_sparse_data(self, x): + if len(x) != 2: + raise BadDataFormat(x) + return (int(x[0].strip('"').strip("'")), x[1]) + + def encode_data(self, data, attributes): + '''(INTERNAL) Encodes a line of data. + + Data instances follow the csv format, i.e, attribute values are + delimited by commas. After converted from csv. + + :param data: a list of values. + :param attributes: a list of attributes. Used to check if data is valid. + :return: a string with the encoded data line. + ''' + current_row = 0 + + for inst in data: + if len(inst) != len(attributes): + raise BadObject( + 'Instance %d has %d attributes, expected %d' % + (current_row, len(inst), len(attributes)) + ) + + new_data = [] + for value in inst: + if value is None or value == u'' or value != value: + s = '?' + else: + s = encode_string(unicode(value)) + new_data.append(s) + + current_row += 1 + yield u','.join(new_data) + +class COOData(Data): + def __init__(self): + self.data = ([], [], []) + self._current_num_data_points = 0 + + def decode_data(self, s, conversors): + values = _parse_values(s) + + if not isinstance(values, dict): + raise BadLayout() + if not values: + self._current_num_data_points += 1 + return + col, values = zip(*sorted(values.items())) + try: + values = [value if value is None else conversors[key](value) + for key, value in zip(col, values)] + except ValueError as exc: + if 'float: ' in str(exc): + raise BadNumericalValue() + raise + except IndexError: + # conversor out of range + raise BadDataFormat(s) + self.data[0].extend(values) + self.data[1].extend([self._current_num_data_points] * len(values)) + self.data[2].extend(col) + + self._current_num_data_points += 1 + + def encode_data(self, data, attributes): + num_attributes = len(attributes) + new_data = [] + current_row = 0 + + row = data.row + col = data.col + data = data.data + + # Check if the rows are sorted + if not all(row[i] <= row[i + 1] for i in xrange(len(row) - 1)): + raise ValueError("liac-arff can only output COO matrices with " + "sorted rows.") + + for v, col, row in zip(data, col, row): + if row > current_row: + # Add empty rows if necessary + while current_row < row: + yield " ".join([u"{", u','.join(new_data), u"}"]) + new_data = [] + current_row += 1 + + if col >= num_attributes: + raise BadObject( + 'Instance %d has at least %d attributes, expected %d' % + (current_row, col + 1, num_attributes) + ) + + if v is None or v == u'' or v != v: + s = '?' 
+ else: + s = encode_string(unicode(v)) + new_data.append("%d %s" % (col, s)) + + yield " ".join([u"{", u','.join(new_data), u"}"]) + +class LODData(Data): + def __init__(self): + self.data = [] + + def decode_data(self, s, conversors): + values = _parse_values(s) + n_conversors = len(conversors) + + if not isinstance(values, dict): + raise BadLayout() + try: + self.data.append({key: None if value is None else conversors[key](value) + for key, value in values.items()}) + except ValueError as exc: + if 'float: ' in str(exc): + raise BadNumericalValue() + raise + except IndexError: + # conversor out of range + raise BadDataFormat(s) + + def encode_data(self, data, attributes): + current_row = 0 + + num_attributes = len(attributes) + for row in data: + new_data = [] + + if len(row) > 0 and max(row) >= num_attributes: + raise BadObject( + 'Instance %d has %d attributes, expected %d' % + (current_row, max(row) + 1, num_attributes) + ) + + for col in sorted(row): + v = row[col] + if v is None or v == u'' or v != v: + s = '?' + else: + s = encode_string(unicode(v)) + new_data.append("%d %s" % (col, s)) + + current_row += 1 + yield " ".join([u"{", u','.join(new_data), u"}"]) + +def _get_data_object_for_decoding(matrix_type): + if matrix_type == DENSE: + return Data() + elif matrix_type == COO: + return COOData() + elif matrix_type == LOD: + return LODData() + else: + raise ValueError("Matrix type %s not supported." % str(matrix_type)) + +def _get_data_object_for_encoding(matrix): + # Probably a scipy.sparse + if hasattr(matrix, 'format'): + if matrix.format == 'coo': + return COOData() + else: + raise ValueError('Cannot guess matrix format!') + elif isinstance(matrix[0], dict): + return LODData() + else: + return Data() + +# ============================================================================= + +# ADVANCED INTERFACE ========================================================== +class ArffDecoder(object): + '''An ARFF decoder.''' + + def __init__(self): + '''Constructor.''' + self._conversors = [] + self._current_line = 0 + + def _decode_comment(self, s): + '''(INTERNAL) Decodes a comment line. + + Comments are single line strings starting, obligatorily, with the ``%`` + character, and can have any symbol, including whitespaces or special + characters. + + This method must receive a normalized string, i.e., a string without + padding, including the "\r\n" characters. + + :param s: a normalized string. + :return: a string with the decoded comment. + ''' + res = re.sub('^\%( )?', '', s) + return res + + def _decode_relation(self, s): + '''(INTERNAL) Decodes a relation line. + + The relation declaration is a line with the format ``@RELATION + ``, where ``relation-name`` is a string. The string must + start with alphabetic character and must be quoted if the name includes + spaces, otherwise this method will raise a `BadRelationFormat` exception. + + This method must receive a normalized string, i.e., a string without + padding, including the "\r\n" characters. + + :param s: a normalized string. + :return: a string with the decoded relation name. + ''' + _, v = s.split(' ', 1) + v = v.strip() + + if not _RE_RELATION.match(v): + raise BadRelationFormat() + + res = unicode(v.strip('"\'')) + return res + + def _decode_attribute(self, s): + '''(INTERNAL) Decodes an attribute line. + + The attribute is the most complex declaration in an arff file. 
All + attributes must follow the template:: + + @attribute + + where ``attribute-name`` is a string, quoted if the name contains any + whitespace, and ``datatype`` can be: + + - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``. + - Strings as ``STRING``. + - Dates (NOT IMPLEMENTED). + - Nominal attributes with format: + + {, , , ...} + + The nominal names follow the rules for the attribute names, i.e., they + must be quoted if the name contains whitespaces. + + This method must receive a normalized string, i.e., a string without + padding, including the "\r\n" characters. + + :param s: a normalized string. + :return: a tuple (ATTRIBUTE_NAME, TYPE_OR_VALUES). + ''' + _, v = s.split(' ', 1) + v = v.strip() + + # Verify the general structure of declaration + m = _RE_ATTRIBUTE.match(v) + if not m: + raise BadAttributeFormat() + + # Extracts the raw name and type + name, type_ = m.groups() + + # Extracts the final name + name = unicode(name.strip('"\'')) + + # Extracts the final type + if _RE_TYPE_NOMINAL.match(type_): + try: + type_ = _parse_values(type_.strip('{} ')) + except Exception: + raise BadAttributeType() + if isinstance(type_, dict): + raise BadAttributeType() + + else: + # If not nominal, verify the type name + type_ = unicode(type_).upper() + if type_ not in ['NUMERIC', 'REAL', 'INTEGER', 'STRING']: + raise BadAttributeType() + + return (name, type_) + + def _decode(self, s, encode_nominal=False, matrix_type=DENSE): + '''Do the job the ``encode``.''' + + # Make sure this method is idempotent + self._current_line = 0 + + # If string, convert to a list of lines + if isinstance(s, basestring): + s = s.strip('\r\n ').replace('\r\n', '\n').split('\n') + + # Create the return object + obj = { + u'description': u'', + u'relation': u'', + u'attributes': [], + u'data': [] + } + attribute_names = {} + + # Create the data helper object + data = _get_data_object_for_decoding(matrix_type) + + # Read all lines + STATE = _TK_DESCRIPTION + for row in s: + self._current_line += 1 + # Ignore empty lines + row = row.strip(' \r\n') + if not row: continue + + u_row = row.upper() + + # DESCRIPTION ----------------------------------------------------- + if u_row.startswith(_TK_DESCRIPTION) and STATE == _TK_DESCRIPTION: + obj['description'] += self._decode_comment(row) + '\n' + # ----------------------------------------------------------------- + + # RELATION -------------------------------------------------------- + elif u_row.startswith(_TK_RELATION): + if STATE != _TK_DESCRIPTION: + raise BadLayout() + + STATE = _TK_RELATION + obj['relation'] = self._decode_relation(row) + # ----------------------------------------------------------------- + + # ATTRIBUTE ------------------------------------------------------- + elif u_row.startswith(_TK_ATTRIBUTE): + if STATE != _TK_RELATION and STATE != _TK_ATTRIBUTE: + raise BadLayout() + + STATE = _TK_ATTRIBUTE + + attr = self._decode_attribute(row) + if attr[0] in attribute_names: + raise BadAttributeName(attr[0], attribute_names[attr[0]]) + else: + attribute_names[attr[0]] = self._current_line + obj['attributes'].append(attr) + + if isinstance(attr[1], (list, tuple)): + if encode_nominal: + conversor = EncodedNominalConversor(attr[1]) + else: + conversor = NominalConversor(attr[1]) + else: + CONVERSOR_MAP = {'STRING': unicode, + 'INTEGER': lambda x: int(float(x)), + 'NUMERIC': float, + 'REAL': float} + conversor = CONVERSOR_MAP[attr[1]] + + self._conversors.append(conversor) + # ----------------------------------------------------------------- + + # 
DATA ------------------------------------------------------------ + elif u_row.startswith(_TK_DATA): + if STATE != _TK_ATTRIBUTE: + raise BadLayout() + + STATE = _TK_DATA + # ----------------------------------------------------------------- + + # COMMENT --------------------------------------------------------- + elif u_row.startswith(_TK_COMMENT): + pass + # ----------------------------------------------------------------- + + # DATA INSTANCES -------------------------------------------------- + elif STATE == _TK_DATA: + data.decode_data(row, self._conversors) + # ----------------------------------------------------------------- + + # UNKNOWN INFORMATION --------------------------------------------- + else: + raise BadLayout() + # ----------------------------------------------------------------- + + # Alter the data object + obj['data'] = data.data + if obj['description'].endswith('\n'): + obj['description'] = obj['description'][:-1] + + return obj + + def decode(self, s, encode_nominal=False, return_type=DENSE): + '''Returns the Python representation of a given ARFF file. + + When a file object is passed as an argument, this method reads lines + iteratively, avoiding to load unnecessary information to the memory. + + :param s: a string or file object with the ARFF file. + :param encode_nominal: boolean, if True perform a label encoding + while reading the .arff file. + :param return_type: determines the data structure used to store the + dataset. Can be one of `arff.DENSE`, `arff.COO` and `arff.LOD`. + Consult the section on `working with sparse data`_ + ''' + try: + return self._decode(s, encode_nominal=encode_nominal, + matrix_type=return_type) + except ArffException as e: + e.line = self._current_line + raise e + + +class ArffEncoder(object): + '''An ARFF encoder.''' + + def _encode_comment(self, s=''): + '''(INTERNAL) Encodes a comment line. + + Comments are single line strings starting, obligatorily, with the ``%`` + character, and can have any symbol, including whitespaces or special + characters. + + If ``s`` is None, this method will simply return an empty comment. + + :param s: (OPTIONAL) string. + :return: a string with the encoded comment line. + ''' + if s: + return u'%s %s'%(_TK_COMMENT, s) + else: + return u'%s' % _TK_COMMENT + + def _encode_relation(self, name): + '''(INTERNAL) Decodes a relation line. + + The relation declaration is a line with the format ``@RELATION + ``, where ``relation-name`` is a string. + + :param name: a string. + :return: a string with the encoded relation declaration. + ''' + for char in ' %{},': + if char in name: + name = '"%s"'%name + break + + return u'%s %s'%(_TK_RELATION, name) + + def _encode_attribute(self, name, type_): + '''(INTERNAL) Encodes an attribute line. + + The attribute follow the template:: + + @attribute + + where ``attribute-name`` is a string, and ``datatype`` can be: + + - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``. + - Strings as ``STRING``. + - Dates (NOT IMPLEMENTED). + - Nominal attributes with format: + + {, , , ...} + + This method must receive a the name of the attribute and its type, if + the attribute type is nominal, ``type`` must be a list of values. + + :param name: a string. + :param type_: a string or a list of string. + :return: a string with the encoded attribute declaration. 
+ ''' + for char in ' %{},': + if char in name: + name = '"%s"'%name + break + + if isinstance(type_, (tuple, list)): + type_tmp = [] + for i in range(len(type_)): + type_tmp.append(u'%s' % encode_string(type_[i])) + type_ = u'{%s}'%(u', '.join(type_tmp)) + + return u'%s %s %s'%(_TK_ATTRIBUTE, name, type_) + + def encode(self, obj): + '''Encodes a given object to an ARFF file. + + :param obj: the object containing the ARFF information. + :return: the ARFF file as an unicode string. + ''' + data = [row for row in self.iter_encode(obj)] + + return u'\n'.join(data) + + def iter_encode(self, obj): + '''The iterative version of `arff.ArffEncoder.encode`. + + This encodes iteratively a given object and return, one-by-one, the + lines of the ARFF file. + + :param obj: the object containing the ARFF information. + :return: (yields) the ARFF file as unicode strings. + ''' + # DESCRIPTION + if obj.get('description', None): + for row in obj['description'].split('\n'): + yield self._encode_comment(row) + + # RELATION + if not obj.get('relation'): + raise BadObject('Relation name not found or with invalid value.') + + yield self._encode_relation(obj['relation']) + yield u'' + + # ATTRIBUTES + if not obj.get('attributes'): + raise BadObject('Attributes not found.') + + attribute_names = set() + for attr in obj['attributes']: + # Verify for bad object format + if not isinstance(attr, (tuple, list)) or \ + len(attr) != 2 or \ + not isinstance(attr[0], basestring): + raise BadObject('Invalid attribute declaration "%s"'%str(attr)) + + if isinstance(attr[1], basestring): + # Verify for invalid types + if attr[1] not in _SIMPLE_TYPES: + raise BadObject('Invalid attribute type "%s"'%str(attr)) + + # Verify for bad object format + elif not isinstance(attr[1], (tuple, list)): + raise BadObject('Invalid attribute type "%s"'%str(attr)) + + # Verify attribute name is not used twice + if attr[0] in attribute_names: + raise BadObject('Trying to use attribute name "%s" for the ' + 'second time.' % str(attr[0])) + else: + attribute_names.add(attr[0]) + + yield self._encode_attribute(attr[0], attr[1]) + yield u'' + attributes = obj['attributes'] + + # DATA + yield _TK_DATA + if 'data' in obj: + data = _get_data_object_for_encoding(obj.get('data')) + for line in data.encode_data(obj.get('data'), attributes): + yield line + + yield u'' + +# ============================================================================= + +# BASIC INTERFACE ============================================================= +def load(fp, encode_nominal=False, return_type=DENSE): + '''Load a file-like object containing the ARFF document and convert it into + a Python object. + + :param fp: a file-like object. + :param encode_nominal: boolean, if True perform a label encoding + while reading the .arff file. + :param return_type: determines the data structure used to store the + dataset. Can be one of `arff.DENSE`, `arff.COO` and `arff.LOD`. + Consult the section on `working with sparse data`_ + :return: a dictionary. + ''' + decoder = ArffDecoder() + return decoder.decode(fp, encode_nominal=encode_nominal, + return_type=return_type) + +def loads(s, encode_nominal=False, return_type=DENSE): + '''Convert a string instance containing the ARFF document into a Python + object. + + :param s: a string object. + :param encode_nominal: boolean, if True perform a label encoding + while reading the .arff file. + :param return_type: determines the data structure used to store the + dataset. Can be one of `arff.DENSE`, `arff.COO` and `arff.LOD`. 
+ Consult the section on `working with sparse data`_ + :return: a dictionary. + ''' + decoder = ArffDecoder() + return decoder.decode(s, encode_nominal=encode_nominal, + return_type=return_type) + +def dump(obj, fp): + '''Serialize an object representing the ARFF document to a given file-like + object. + + :param obj: a dictionary. + :param fp: a file-like object. + ''' + encoder = ArffEncoder() + generator = encoder.iter_encode(obj) + + last_row = next(generator) + for row in generator: + fp.write(last_row + u'\n') + last_row = row + fp.write(last_row) + + return fp + +def dumps(obj): + '''Serialize an object representing the ARFF document, returning a string. + + :param obj: a dictionary. + :return: a string with the ARFF document. + ''' + encoder = ArffEncoder() + return encoder.encode(obj) +# =============================================================================
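
As a quick orientation to the vendored module above: it exposes a ``json``-like basic interface (``load``/``loads``/``dump``/``dumps``) on top of the ``ArffDecoder``/``ArffEncoder`` classes. The sketch below, adapted from the XOR example in the module docstring, shows a dense decode/encode round trip. It is an illustration only, not part of the patch, and it assumes the module's Python 2/3 compatibility aliases (``unicode``, ``basestring``, ``xrange``), which are defined in a portion of the file not shown here::

    import sklearn.externals._arff as arff

    # The XOR dataset from the module docstring, as a single ARFF document.
    xor_arff = '\n'.join([
        '% XOR Dataset',
        '@RELATION XOR',
        '',
        '@ATTRIBUTE input1 REAL',
        '@ATTRIBUTE input2 REAL',
        '@ATTRIBUTE y REAL',
        '',
        '@DATA',
        '0.0,0.0,0.0',
        '0.0,1.0,1.0',
        '1.0,0.0,1.0',
        '1.0,1.0,0.0',
    ])

    # Decode into the dictionary layout documented above
    # (description / relation / attributes / data).
    xor = arff.loads(xor_arff)
    assert xor['relation'] == 'XOR'
    assert xor['attributes'][0] == ('input1', 'REAL')
    assert xor['data'][1] == [0.0, 1.0, 1.0]

    # Encode back to an ARFF string; load()/dump() do the same for
    # file-like objects.
    roundtrip = arff.dumps(xor)
    assert arff.loads(roundtrip)['data'] == xor['data']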
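
Beyond the dense representation, ``ArffDecoder.decode`` also accepts ``return_type=arff.COO`` (data returned as a ``(values, rows, columns)`` triplet built by ``COOData``) and ``return_type=arff.LOD`` (a list of ``{column: value}`` dictionaries built by ``LODData``). A minimal sketch of the COO path follows; it is likewise illustrative only, the tiny dataset and explicit shape are made up for the example, and it assumes that ``_parse_values`` (defined in a part of the file not shown here) returns a ``{column_index: raw_value}`` dict for sparse ``{...}`` data lines::

    import scipy.sparse

    import sklearn.externals._arff as arff

    sparse_arff = '\n'.join([
        '@RELATION sparse_example',
        '',
        '@ATTRIBUTE a NUMERIC',
        '@ATTRIBUTE b NUMERIC',
        '@ATTRIBUTE c NUMERIC',
        '',
        '@DATA',
        '{0 1.0, 2 3.0}',
        '{1 2.0}',
    ])

    decoded = arff.loads(sparse_arff, return_type=arff.COO)
    values, rows, columns = decoded['data']

    # The triplet does not carry the matrix shape, so it is passed explicitly
    # when rebuilding a scipy COO matrix from the decoded data.
    X = scipy.sparse.coo_matrix((values, (rows, columns)), shape=(2, 3))
    assert X.toarray().tolist() == [[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]]

``_get_data_object_for_decoding`` dispatches between these three containers, while ``_get_data_object_for_encoding`` guesses the container from the object handed to ``dump``/``dumps`` (a ``scipy.sparse`` COO matrix, a list of dictionaries, or plain nested lists).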