diff --git a/MANIFEST.in b/MANIFEST.in index ed0ca0e87274e..db605f55f748c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,7 +2,7 @@ include *.rst recursive-include doc * recursive-include examples * recursive-include sklearn *.c *.h *.pyx *.pxd *.pxi -recursive-include sklearn/datasets *.csv *.csv.gz *.rst *.jpg *.txt +recursive-include sklearn/datasets *.csv *.csv.gz *.rst *.jpg *.txt *.arff.gz *.json.gz include COPYING include AUTHORS.rst include README.rst diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst new file mode 100644 index 0000000000000..53ab211df903e --- /dev/null +++ b/doc/datasets/openml.rst @@ -0,0 +1,148 @@ +.. + For doctests: + + >>> import numpy as np + >>> import os + + +.. _openml: + +Downloading datasets from the openml.org repository +=================================================== + +`openml.org `_ is a public repository for machine learning +data and experiments, that allows everybody to upload open datasets. + +The ``sklearn.datasets`` package is able to download datasets +from the repository using the function +:func:`sklearn.datasets.fetch_openml`. + +For example, to download a dataset of gene expressions in mice brains:: + + >>> from sklearn.datasets import fetch_openml + >>> mice = fetch_openml(name='miceprotein', version=4) + +To fully specify a dataset, you need to provide a name and a version, though +the version is optional, see :ref:`openml_versions`_ below. +The dataset contains a total of 1080 examples belonging to 8 different +classes:: + + >>> mice.data.shape + (1080, 77) + >>> mice.target.shape + (1080,) + >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE + array(['c-CS-m', 'c-CS-s', 'c-SC-m', 'c-SC-s', 't-CS-m', 't-CS-s', 't-SC-m', 't-SC-s'], dtype=object) + +You can get more information on the dataset by looking at the ``DESCR`` +and ``details`` attributes:: + + >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + **Author**: Clara Higuera, Katheleen J. Gardiner, Krzysztof J. Cios + **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression) - 2015 + **Please cite**: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing + Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down + Syndrome. PLoS ONE 10(6): e0129126... + + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + {'id': '40966', 'name': 'MiceProtein', 'version': '4', 'format': 'ARFF', + 'upload_date': '2017-11-08T16:00:15', 'licence': 'Public', + 'url': 'https://www.openml.org/data/v1/download/17928620/MiceProtein.arff', + 'file_id': '17928620', 'default_target_attribute': 'class', + 'row_id_attribute': 'MouseID', + 'ignore_attribute': ['Genotype', 'Treatment', 'Behavior'], + 'tag': ['OpenML-CC18', 'study_135', 'study_98', 'study_99'], + 'visibility': 'public', 'status': 'active', + 'md5_checksum': '3c479a6885bfa0438971388283a1ce32'} + + +The ``DESCR`` contains a free-text description of the data, while ``details`` +contains a dictionary of meta-data stored by openml, like the dataset id. 
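Because ``details`` is a plain Python dictionary, individual metadata fields can be checked programmatically before relying on a dataset. A minimal sketch, reusing the ``mice`` Bunch and the keys shown in the output above::

    >>> mice.details['default_target_attribute']  # doctest: +SKIP
    'class'
    >>> mice.details['status']  # doctest: +SKIP
    'active'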
+For more details, see the `OpenML documentation +`_ The ``data_id`` of the mice protein dataset +is 40966, and you can use this (or the name) to get more information on the +dataset on the openml website:: + + >>> mice.url + 'https://www.openml.org/d/40966' + +The ``data_id`` also uniquely identifies a dataset from OpenML:: + + >>> mice = fetch_openml(data_id=40966) + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', + 'creator': ..., + 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': + 'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id': + '1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C, + Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins + Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6): + e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14', + 'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum': + '3c479a6885bfa0438971388283a1ce32'} + +.. _openml_versions: + +Dataset Versions +---------------- + +A dataset is uniquely specified by its ``data_id``, but not necessarily by its +name. Several different "versions" of a dataset with the same name can exist +which can contain entirely different datasets. +If a particular version of a dataset has been found to contain significant +issues, it might be deactivated. Using a name to specify a dataset will yield +the earliest version of a dataset that is still active. That means that +``fetch_openml(name="miceprotein")`` can yield different results at different +times if earlier versions become inactive. +You can see that the dataset with ``data_id`` 40966 that we fetched above is +the version 1 of the "miceprotein" dataset:: + + >>> mice.details['version'] #doctest: +SKIP + '1' + +In fact, this dataset only has one version. The iris dataset on the other hand +has multiple versions:: + + >>> iris = fetch_openml(name="iris") + >>> iris.details['version'] #doctest: +SKIP + '1' + >>> iris.details['id'] #doctest: +SKIP + '61' + + >>> iris_61 = fetch_openml(data_id=61) + >>> iris_61.details['version'] + '1' + >>> iris_61.details['id'] + '61' + + >>> iris_969 = fetch_openml(data_id=969) + >>> iris_969.details['version'] + '3' + >>> iris_969.details['id'] + '969' + +Specifying the dataset by the name "iris" yields the lowest version, version 1, +with the ``data_id`` 61. To make sure you always get this exact dataset, it is +safest to specify it by the dataset ``data_id``. The other dataset, with +``data_id`` 969, is version 3 (version 2 has become inactive), and contains a +binarized version of the data:: + + >>> np.unique(iris_969.target) + array(['N', 'P'], dtype=object) + +You can also specify both the name and the version, which also uniquely +identifies the dataset:: + + >>> iris_version_3 = fetch_openml(name="iris", version=3) + >>> iris_version_3.details['version'] + '3' + >>> iris_version_3.details['id'] + '969' + + +.. topic:: References: + + * Vanschoren, van Rijn, Bischl and Torgo + `"OpenML: networked science in machine learning" + `_, + ACM SIGKDD Explorations Newsletter, 15(2), 49-60, 2014. 
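As a closing illustration of how a fetched Bunch plugs into an estimator, here is a minimal sketch (an editorial example, not part of the doctest suite; it assumes network access or a previously filled cache for the iris dataset with ``data_id`` 61 discussed above). Since categorical features are returned ordinal-encoded and missing values as NaN, a tree-based model is a convenient first choice::

    >>> from sklearn.datasets import fetch_openml                       # doctest: +SKIP
    >>> from sklearn.tree import DecisionTreeClassifier                 # doctest: +SKIP
    >>> iris = fetch_openml(data_id=61)                                 # doctest: +SKIP
    >>> clf = DecisionTreeClassifier().fit(iris.data, iris.target)      # doctest: +SKIP
    >>> clf.predict(iris.data[:3])                                      # doctest: +SKIP
    array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa'], dtype=object)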
diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 4888d9205d9f0..31f5e5ef840a7 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -79,6 +79,7 @@ link to it from your website, or simply star to say "I use it": * `joblib `__ * `sphinx-gallery `__ * `numpydoc `__ + * `liac-arff `__ and larger projects: diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 44ff2ff0aefa3..fb38eab8fde0d 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -259,6 +259,7 @@ Loaders datasets.fetch_lfw_people datasets.fetch_mldata datasets.fetch_olivetti_faces + datasets.fetch_openml datasets.fetch_rcv1 datasets.fetch_species_distributions datasets.get_data_home diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 90597c535accd..df346696734a0 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -178,6 +178,11 @@ Support for Python 3.3 has been officially dropped. :mod:`sklearn.datasets` ....................... +- |MajorFeature| Added :func:`datasets.fetch_openml` to fetch datasets from + `OpenML `. OpenML is a free, open data sharing platform + and will be used instead of mldata as it provides better service availability. + :issue:`9908` by `Andreas Müller`_ and :user:`Jan N. van Rijn `. + - |Feature| In :func:`datasets.make_blobs`, one can now pass a list to the `n_samples` parameter to indicate the number of samples to generate per cluster. :issue:`8617` by :user:`Maskani Filali Mohamed ` and @@ -204,7 +209,6 @@ Support for Python 3.3 has been officially dropped. data points could be generated. :issue:`10045` by :user:`Christian Braune `. - :mod:`sklearn.decomposition` ............................ diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py index c43c0c4758b10..c7d78e633493d 100644 --- a/sklearn/datasets/__init__.py +++ b/sklearn/datasets/__init__.py @@ -23,6 +23,7 @@ from .twenty_newsgroups import fetch_20newsgroups from .twenty_newsgroups import fetch_20newsgroups_vectorized from .mldata import fetch_mldata, mldata_filename +from .openml import fetch_openml from .samples_generator import make_classification from .samples_generator import make_multilabel_classification from .samples_generator import make_hastie_10_2 @@ -65,6 +66,7 @@ 'fetch_covtype', 'fetch_rcv1', 'fetch_kddcup99', + 'fetch_openml', 'get_data_home', 'load_boston', 'load_diabetes', diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py new file mode 100644 index 0000000000000..d7693ffd6fea9 --- /dev/null +++ b/sklearn/datasets/openml.py @@ -0,0 +1,557 @@ +import gzip +import json +import os +import shutil +from os.path import join +from warnings import warn + +try: + # Python 3+ + from urllib.request import urlopen +except ImportError: + # Python 2 + from urllib2 import urlopen + + +import numpy as np +import scipy.sparse + +from sklearn.externals import _arff +from .base import get_data_home +from ..externals.six import string_types, PY2 +from ..externals.six.moves.urllib.error import HTTPError +from ..utils import Bunch + +__all__ = ['fetch_openml'] + +_OPENML_PREFIX = "https://openml.org/" +_SEARCH_NAME = "api/v1/json/data/list/data_name/{}/limit/1" +_DATA_INFO = "api/v1/json/data/{}" +_DATA_FEATURES = "api/v1/json/data/features/{}" +_DATA_FILE = "data/v1/download/{}" + + +def _open_openml_url(openml_path, data_home): + """ + Returns a resource from OpenML.org. Caches it to data_home if required. 
+ + Parameters + ---------- + openml_path : str + OpenML URL that will be accessed. This will be prefixed with + _OPENML_PREFIX + + data_home : str + Directory to which the files will be cached. If None, no caching will + be applied. + + Returns + ------- + result : stream + A stream to the OpenML resource + """ + if data_home is None: + return urlopen(_OPENML_PREFIX + openml_path) + local_path = os.path.join(data_home, 'openml.org', openml_path + ".gz") + if not os.path.exists(local_path): + try: + os.makedirs(os.path.dirname(local_path)) + except OSError: + # potentially, the directory has been created already + pass + + try: + with gzip.GzipFile(local_path, 'wb') as fdst: + fsrc = urlopen(_OPENML_PREFIX + openml_path) + shutil.copyfileobj(fsrc, fdst) + fsrc.close() + except Exception: + os.unlink(local_path) + raise + # XXX: unnecessary decompression on first access + return gzip.GzipFile(local_path, 'rb') + + +def _get_json_content_from_openml_api(url, error_message, raise_if_error, + data_home): + """ + Loads json data from the openml api + + Parameters + ---------- + url : str + The URL to load from. Should be an official OpenML endpoint + + error_message : str or None + The error message to raise if an acceptable OpenML error is thrown + (acceptable error is, e.g., data id not found. Other errors, like 404's + will throw the native error message) + + raise_if_error : bool + Whether to raise an error if OpenML returns an acceptable error (e.g., + data not found). If this argument is set to False, a None is returned + in case of acceptable errors. Note that all other errors (e.g., 404) + will still be raised as normal. + + data_home : str or None + Location to cache the response. None if no cache is required. + + Returns + ------- + json_data : json or None + the json result from the OpenML server if the call was successful; + None otherwise iff raise_if_error was set to False and the error was + ``acceptable`` + """ + data_found = True + try: + response = _open_openml_url(url, data_home) + except HTTPError as error: + # 412 is an OpenML specific error code, indicating a generic error + # (e.g., data not found) + if error.code == 412: + data_found = False + else: + raise error + if not data_found: + # not in except for nicer traceback + if raise_if_error: + raise ValueError(error_message) + else: + return None + json_data = json.loads(response.read().decode("utf-8")) + response.close() + return json_data + + +def _split_sparse_columns(arff_data, include_columns): + """ + obtains several columns from sparse arff representation. Additionally, the + column indices are re-labelled, given the columns that are not included. + (e.g., when including [1, 2, 3], the columns will be relabelled to + [0, 1, 2]) + + Parameters + ---------- + arff_data : tuple + A tuple of three lists of equal size; first list indicating the value, + second the x coordinate and the third the y coordinate. + + include_columns : list + A list of columns to include. + + Returns + ------- + arff_data_new : tuple + Subset of arff data with only the include columns indicated by the + include_columns argument. 
+ """ + arff_data_new = (list(), list(), list()) + reindexed_columns = {column_idx: array_idx for array_idx, column_idx + in enumerate(include_columns)} + for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): + if col_idx in include_columns: + arff_data_new[0].append(val) + arff_data_new[1].append(row_idx) + arff_data_new[2].append(reindexed_columns[col_idx]) + return arff_data_new + + +def _sparse_data_to_array(arff_data, include_columns): + # turns the sparse data back into an array (can't use toarray() function, + # as this does only work on numeric data) + num_obs = max(arff_data[1]) + 1 + y_shape = (num_obs, len(include_columns)) + reindexed_columns = {column_idx: array_idx for array_idx, column_idx + in enumerate(include_columns)} + # TODO: improve for efficiency + y = np.empty(y_shape, dtype=np.float64) + for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): + if col_idx in include_columns: + y[row_idx, reindexed_columns[col_idx]] = val + return y + + +def _convert_arff_data(arff_data, col_slice_x, col_slice_y): + """ + converts the arff object into the appropriate matrix type (np.array or + scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the + liac-arff dict, the object from the 'data' key) + + Parameters + ---------- + arff_data : list or dict + as obtained from liac-arff object + + col_slice_x : list + The column indices that are sliced from the original array to return + as X data + + col_slice_y : list + The column indices that are sliced from the original array to return + as y data + + Returns + ------- + X : np.array or scipy.sparse.csr_matrix + y : np.array + """ + if isinstance(arff_data, list): + data = np.array(arff_data, dtype=np.float64) + X = np.array(data[:, col_slice_x], dtype=np.float64) + y = np.array(data[:, col_slice_y], dtype=np.float64) + return X, y + elif isinstance(arff_data, tuple): + arff_data_X = _split_sparse_columns(arff_data, col_slice_x) + num_obs = max(arff_data[1]) + 1 + X_shape = (num_obs, len(col_slice_x)) + X = scipy.sparse.coo_matrix( + (arff_data_X[0], (arff_data_X[1], arff_data_X[2])), + shape=X_shape, dtype=np.float64) + X = X.tocsr() + y = _sparse_data_to_array(arff_data, col_slice_y) + return X, y + else: + # This should never happen + raise ValueError('Unexpected Data Type obtained from arff.') + + +def _get_data_info_by_name(name, version, data_home): + """ + Utilizes the openml dataset listing api to find a dataset by + name/version + OpenML api function: + https://www.openml.org/api_docs#!/data/get_data_list_data_name_data_name + + Parameters + ---------- + name : str + name of the dataset + + version : int or str + If version is an integer, the exact name/version will be obtained from + OpenML. If version is a string (value: "active") it will take the first + version from OpenML that is annotated as active. Any other string + values except "active" are treated as integer. + + data_home : str or None + Location to cache the response. None if no cache is required. 
+ + Returns + ------- + first_dataset : json + json representation of the first dataset object that adhered to the + search criteria + + """ + if version == "active": + # situation in which we return the oldest active version + url = _SEARCH_NAME.format(name) + "/status/active/" + error_msg = "No active dataset {} found.".format(name) + json_data = _get_json_content_from_openml_api(url, error_msg, True, + data_home) + return json_data['data']['dataset'][0] + + # an integer version has been provided + url = (_SEARCH_NAME + "/data_version/{}").format(name, version) + json_data = _get_json_content_from_openml_api(url, None, False, + data_home) + if json_data is None: + # we can do this in 1 function call if OpenML does not require the + # specification of the dataset status (i.e., return datasets with a + # given name / version regardless of active, deactivated, etc. ) + # TODO: feature request OpenML. + url += "/status/deactivated" + error_msg = "Dataset {} with version {} not found.".format(name, + version) + json_data = _get_json_content_from_openml_api(url, error_msg, True, + data_home) + + return json_data['data']['dataset'][0] + + +def _get_data_description_by_id(data_id, data_home): + # OpenML API function: https://www.openml.org/api_docs#!/data/get_data_id + url = _DATA_INFO.format(data_id) + error_message = "Dataset with data_id {} not found.".format(data_id) + json_data = _get_json_content_from_openml_api(url, error_message, True, + data_home) + return json_data['data_set_description'] + + +def _get_data_features(data_id, data_home): + # OpenML function: + # https://www.openml.org/api_docs#!/data/get_data_features_id + url = _DATA_FEATURES.format(data_id) + error_message = "Dataset with data_id {} not found.".format(data_id) + json_data = _get_json_content_from_openml_api(url, error_message, True, + data_home) + return json_data['data_features']['feature'] + + +def _download_data_arff(file_id, sparse, data_home, encode_nominal=True): + # Accesses an ARFF file on the OpenML server. Documentation: + # https://www.openml.org/api_data_docs#!/data/get_download_id + # encode_nominal argument is to ensure unit testing, do not alter in + # production! 
+ url = _DATA_FILE.format(file_id) + response = _open_openml_url(url, data_home) + if sparse is True: + return_type = _arff.COO + else: + return_type = _arff.DENSE + + if PY2: + arff_file = _arff.load(response, encode_nominal=encode_nominal, + return_type=return_type, ) + else: + arff_file = _arff.loads(response.read().decode('utf-8'), + encode_nominal=encode_nominal, + return_type=return_type) + response.close() + return arff_file + + +def _verify_target_data_type(features_dict, target_columns): + # verifies the data type of the y array in case there are multiple targets + # (throws an error if these targets do not comply with sklearn support) + if not isinstance(target_columns, list): + raise ValueError('target_column should be list, ' + 'got: %s' % type(target_columns)) + found_types = set() + for target_column in target_columns: + if target_column not in features_dict: + raise KeyError('Could not find target_column={}'.format( + target_column)) + if features_dict[target_column]['data_type'] == "numeric": + found_types.add(np.float64) + else: + found_types.add(object) + + # note: we compare to a string, not boolean + if features_dict[target_column]['is_ignore'] == 'true': + warn('target_column={} has flag is_ignore.'.format( + target_column)) + if features_dict[target_column]['is_row_identifier'] == 'true': + warn('target_column={} has flag is_row_identifier.'.format( + target_column)) + if len(found_types) > 1: + raise ValueError('Can only handle homogeneous multi-target datasets, ' + 'i.e., all targets are either numeric or ' + 'categorical.') + + +def fetch_openml(name=None, version='active', data_id=None, data_home=None, + target_column='default-target', cache=True): + """Fetch dataset from openml by name or dataset id. + + Datasets are uniquely identified by either an integer ID or by a + combination of name and version (i.e. there might be multiple + versions of the 'iris' dataset). Please give either name or data_id + (not both). In case a name is given, a version can also be + provided. + + .. note:: EXPERIMENTAL + + The API is experimental in version 0.20 (particularly the return value + structure), and might have small backward-incompatible changes in + future releases. + + Parameters + ---------- + name : str or None + String identifier of the dataset. Note that OpenML can have multiple + datasets with the same name. + + version : integer or 'active', default='active' + Version of the dataset. Can only be provided if also ``name`` is given. + If 'active' the oldest version that's still active is used. + + data_id : int or None + OpenML ID of the dataset. The most specific way of retrieving a + dataset. If data_id is not given, name (and potential version) are + used to obtain a dataset. + + data_home : string or None, default None + Specify another download and cache folder for the data sets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + target_column : string, list or None, default 'default-target' + Specify the column name in the data to use as target. If + 'default-target', the standard target column as stored on the server + is used. If ``None``, all columns are returned as data and the + target is ``None``. If list (of strings), all columns with these names + are returned as multi-target (Note: not all scikit-learn classifiers + can handle all types of multi-output combinations) + + cache : boolean, default=True + Whether to cache downloaded datasets using joblib. 
+ + Returns + ------- + + data : Bunch + Dictionary-like object, with attributes: + + data : np.array or scipy.sparse.csr_matrix of floats + The feature matrix. Categorical features are encoded as ordinals. + target : np.array + The regression target or classification labels, if applicable. + Dtype is float if numeric, and object if categorical. + DESCR : str + The full description of the dataset + feature_names : list + The names of the dataset columns + categories : dict + Maps each categorical feature name to a list of values, such + that the value encoded as i is ith in the list. + details : dict + More metadata from OpenML + + .. note:: EXPERIMENTAL + + This interface is **experimental** as of version 0.20 and + subsequent releases may change attributes without notice + (although there should only be minor changes to ``data`` + and ``target``). + + Missing values in the 'data' are represented as NaN's. Missing values + in 'target' are represented as NaN's (numerical target) or None + (categorical target) + """ + data_home = get_data_home(data_home=data_home) + data_home = join(data_home, 'openml') + if cache is False: + # no caching will be applied + data_home = None + + # check valid function arguments. data_id XOR (name, version) should be + # provided + if name is not None: + # OpenML is case-insensitive, but the caching mechanism is not + # convert all data names (str) to lower case + name = name.lower() + if data_id is not None: + raise ValueError( + "Dataset data_id={} and name={} passed, but you can only " + "specify a numeric data_id or a name, not " + "both.".format(data_id, name)) + data_info = _get_data_info_by_name(name, version, data_home) + data_id = data_info['did'] + elif data_id is not None: + # from the previous if statement, it is given that name is None + if version != "active": + raise ValueError( + "Dataset data_id={} and version={} passed, but you can only " + "specify a numeric data_id or a version, not " + "both.".format(data_id, version)) + else: + raise ValueError( + "Neither name nor data_id are provided. Please provide name or " + "data_id.") + + data_description = _get_data_description_by_id(data_id, data_home) + if data_description['status'] != "active": + warn("Version {} of dataset {} is inactive, meaning that issues have " + "been found in the dataset. Try using a newer version from " + "this URL: {}".format( + data_description['version'], + data_description['name'], + data_description['url'])) + + # download data features, meta-info about column types + features_list = _get_data_features(data_id, data_home) + + for feature in features_list: + if 'true' in (feature['is_ignore'], feature['is_row_identifier']): + continue + if feature['data_type'] == 'string': + raise ValueError('STRING attributes are not yet supported') + + if target_column == "default-target": + # determines the default target based on the data feature results + # (which is currently more reliable than the data description; + # see issue: https://github.com/openml/OpenML/issues/768) + target_column = [feature['name'] for feature in features_list + if feature['is_target'] == 'true'] + elif isinstance(target_column, string_types): + # for code-simplicity, make target_column by default a list + target_column = [target_column] + elif target_column is None: + target_column = [] + elif not isinstance(target_column, list): + raise TypeError("Did not recognize type of target_column. " + "Should be six.string_type, list or None. 
Got: " + "{}".format(type(target_column))) + data_columns = [feature['name'] for feature in features_list + if (feature['name'] not in target_column and + feature['is_ignore'] != 'true' and + feature['is_row_identifier'] != 'true')] + + # prepare which columns and data types should be returned for the X and y + features_dict = {feature['name']: feature for feature in features_list} + + # XXX: col_slice_y should be all nominal or all numeric + _verify_target_data_type(features_dict, target_column) + + col_slice_y = [int(features_dict[col_name]['index']) + for col_name in target_column] + + col_slice_x = [int(features_dict[col_name]['index']) + for col_name in data_columns] + for col_idx in col_slice_y: + feat = features_list[col_idx] + nr_missing = int(feat['number_of_missing_values']) + if nr_missing > 0: + raise ValueError('Target column {} has {} missing values. ' + 'Missing values are not supported for target ' + 'columns. '.format(feat['name'], nr_missing)) + + # determine arff encoding to return + return_sparse = False + if data_description['format'].lower() == 'sparse_arff': + return_sparse = True + + # obtain the data + arff = _download_data_arff(data_description['file_id'], return_sparse, + data_home) + arff_data = arff['data'] + nominal_attributes = {k: v for k, v in arff['attributes'] + if isinstance(v, list)} + for feature in features_list: + if 'true' in (feature['is_row_identifier'], + feature['is_ignore']) and (feature['name'] not in + target_column): + del nominal_attributes[feature['name']] + X, y = _convert_arff_data(arff_data, col_slice_x, col_slice_y) + + is_classification = {col_name in nominal_attributes + for col_name in target_column} + if not is_classification: + # No target + pass + elif all(is_classification): + y = np.hstack([np.take(np.asarray(nominal_attributes.pop(col_name), + dtype='O'), + y[:, i:i+1].astype(int)) + for i, col_name in enumerate(target_column)]) + elif any(is_classification): + raise ValueError('Mix of nominal and non-nominal targets is not ' + 'currently supported') + + description = u"{}\n\nDownloaded from openml.org.".format( + data_description.pop('description')) + + # reshape y back to 1-D array, if there is only 1 target column; back + # to None if there are not target columns + if y.shape[1] == 1: + y = y.reshape((-1,)) + elif y.shape[1] == 0: + y = None + + bunch = Bunch( + data=X, target=y, feature_names=data_columns, + DESCR=description, details=data_description, + categories=nominal_attributes, + url="https://www.openml.org/d/{}".format(data_id)) + + return bunch diff --git a/sklearn/datasets/tests/data/openml/2/data.arff.gz b/sklearn/datasets/tests/data/openml/2/data.arff.gz new file mode 100644 index 0000000000000..cdf3254add760 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/2/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/2/data_description.json.gz b/sklearn/datasets/tests/data/openml/2/data_description.json.gz new file mode 100644 index 0000000000000..22dfb6ff61c1b Binary files /dev/null and b/sklearn/datasets/tests/data/openml/2/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/2/data_features.json.gz b/sklearn/datasets/tests/data/openml/2/data_features.json.gz new file mode 100644 index 0000000000000..cb3d2750095bd Binary files /dev/null and b/sklearn/datasets/tests/data/openml/2/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/2/data_list__anneal_1_active.json.gz 
b/sklearn/datasets/tests/data/openml/2/data_list__anneal_1_active.json.gz new file mode 100644 index 0000000000000..d19e4a633740c Binary files /dev/null and b/sklearn/datasets/tests/data/openml/2/data_list__anneal_1_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/2/data_list__anneal_None_active.json.gz b/sklearn/datasets/tests/data/openml/2/data_list__anneal_None_active.json.gz new file mode 100644 index 0000000000000..ee94aa32a3fce Binary files /dev/null and b/sklearn/datasets/tests/data/openml/2/data_list__anneal_None_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/292/data.arff.gz b/sklearn/datasets/tests/data/openml/292/data.arff.gz new file mode 100644 index 0000000000000..6821829e1e43a Binary files /dev/null and b/sklearn/datasets/tests/data/openml/292/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/292/data_description.json.gz b/sklearn/datasets/tests/data/openml/292/data_description.json.gz new file mode 100644 index 0000000000000..888140f92b360 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/292/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/292/data_features.json.gz b/sklearn/datasets/tests/data/openml/292/data_features.json.gz new file mode 100644 index 0000000000000..29016cc36bab6 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/292/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/292/data_list__australian_1_active.json.gz b/sklearn/datasets/tests/data/openml/292/data_list__australian_1_active.json.gz new file mode 100644 index 0000000000000..0e2c4395f1c23 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/292/data_list__australian_1_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/292/data_list__australian_1_deactivated.json.gz b/sklearn/datasets/tests/data/openml/292/data_list__australian_1_deactivated.json.gz new file mode 100644 index 0000000000000..5ee200d7c056b Binary files /dev/null and b/sklearn/datasets/tests/data/openml/292/data_list__australian_1_deactivated.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/292/data_list__australian_None_active.json.gz b/sklearn/datasets/tests/data/openml/292/data_list__australian_None_active.json.gz new file mode 100644 index 0000000000000..08d319727fb52 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/292/data_list__australian_None_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40589/data.arff.gz b/sklearn/datasets/tests/data/openml/40589/data.arff.gz new file mode 100644 index 0000000000000..96ed11d969557 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40589/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/40589/data_description.json.gz b/sklearn/datasets/tests/data/openml/40589/data_description.json.gz new file mode 100644 index 0000000000000..9c71553ce5137 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40589/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40589/data_features.json.gz b/sklearn/datasets/tests/data/openml/40589/data_features.json.gz new file mode 100644 index 0000000000000..155460906a7b7 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40589/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40589/data_list__emotions_3_active.json.gz b/sklearn/datasets/tests/data/openml/40589/data_list__emotions_3_active.json.gz new file mode 100644 index 
0000000000000..c5d35604ce956 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40589/data_list__emotions_3_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40589/data_list__emotions_None_active.json.gz b/sklearn/datasets/tests/data/openml/40589/data_list__emotions_None_active.json.gz new file mode 100644 index 0000000000000..5a27505c5d2f4 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40589/data_list__emotions_None_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40675/data.arff.gz b/sklearn/datasets/tests/data/openml/40675/data.arff.gz new file mode 100644 index 0000000000000..c59c3b769e114 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40675/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/40675/data_description.json.gz b/sklearn/datasets/tests/data/openml/40675/data_description.json.gz new file mode 100644 index 0000000000000..42b876f0a4723 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40675/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40675/data_features.json.gz b/sklearn/datasets/tests/data/openml/40675/data_features.json.gz new file mode 100644 index 0000000000000..2d5c6f8a302e0 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40675/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40675/data_list__glass2_1_active.json.gz b/sklearn/datasets/tests/data/openml/40675/data_list__glass2_1_active.json.gz new file mode 100644 index 0000000000000..df1665b1db714 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40675/data_list__glass2_1_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40675/data_list__glass2_1_deactivated.json.gz b/sklearn/datasets/tests/data/openml/40675/data_list__glass2_1_deactivated.json.gz new file mode 100644 index 0000000000000..2f48ea985b4c9 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40675/data_list__glass2_1_deactivated.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40675/data_list__glass2_None_active.json.gz b/sklearn/datasets/tests/data/openml/40675/data_list__glass2_None_active.json.gz new file mode 100644 index 0000000000000..ff46d678f6457 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40675/data_list__glass2_None_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40945/data.arff.gz b/sklearn/datasets/tests/data/openml/40945/data.arff.gz new file mode 100644 index 0000000000000..ef170f47e63dd Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40945/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/40945/data_description.json.gz b/sklearn/datasets/tests/data/openml/40945/data_description.json.gz new file mode 100644 index 0000000000000..aaafa4a2def68 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40945/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40945/data_features.json.gz b/sklearn/datasets/tests/data/openml/40945/data_features.json.gz new file mode 100644 index 0000000000000..24cb46957f27d Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40945/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40945/data_list__titanic_1_active.json.gz b/sklearn/datasets/tests/data/openml/40945/data_list__titanic_1_active.json.gz new file mode 100644 index 0000000000000..b2310138197d8 Binary files /dev/null and 
b/sklearn/datasets/tests/data/openml/40945/data_list__titanic_1_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40966/data.arff.gz b/sklearn/datasets/tests/data/openml/40966/data.arff.gz new file mode 100644 index 0000000000000..43ec977bf67ac Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40966/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/40966/data_description.json.gz b/sklearn/datasets/tests/data/openml/40966/data_description.json.gz new file mode 100644 index 0000000000000..02b25d717f925 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40966/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40966/data_features.json.gz b/sklearn/datasets/tests/data/openml/40966/data_features.json.gz new file mode 100644 index 0000000000000..a372f9a7be751 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40966/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40966/data_list__miceprotein_4_active.json.gz b/sklearn/datasets/tests/data/openml/40966/data_list__miceprotein_4_active.json.gz new file mode 100644 index 0000000000000..6c292b4196d06 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40966/data_list__miceprotein_4_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/40966/data_list__miceprotein_None_active.json.gz b/sklearn/datasets/tests/data/openml/40966/data_list__miceprotein_None_active.json.gz new file mode 100644 index 0000000000000..712545f0b72d5 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40966/data_list__miceprotein_None_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/561/data.arff.gz b/sklearn/datasets/tests/data/openml/561/data.arff.gz new file mode 100644 index 0000000000000..eeb088c224a01 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/561/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/561/data_description.json.gz b/sklearn/datasets/tests/data/openml/561/data_description.json.gz new file mode 100644 index 0000000000000..e4df6060ca0b8 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/561/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/561/data_features.json.gz b/sklearn/datasets/tests/data/openml/561/data_features.json.gz new file mode 100644 index 0000000000000..54a3ab6a7a976 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/561/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/561/data_list__cpu_1_active.json.gz b/sklearn/datasets/tests/data/openml/561/data_list__cpu_1_active.json.gz new file mode 100644 index 0000000000000..bb0f8103b8079 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/561/data_list__cpu_1_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/561/data_list__cpu_None_active.json.gz b/sklearn/datasets/tests/data/openml/561/data_list__cpu_None_active.json.gz new file mode 100644 index 0000000000000..4436afa6bc760 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/561/data_list__cpu_None_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/61/data.arff.gz b/sklearn/datasets/tests/data/openml/61/data.arff.gz new file mode 100644 index 0000000000000..b05dadf99fb0e Binary files /dev/null and b/sklearn/datasets/tests/data/openml/61/data.arff.gz differ diff --git a/sklearn/datasets/tests/data/openml/61/data_description.json.gz 
b/sklearn/datasets/tests/data/openml/61/data_description.json.gz new file mode 100644 index 0000000000000..83c3ececcfab9 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/61/data_description.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/61/data_features.json.gz b/sklearn/datasets/tests/data/openml/61/data_features.json.gz new file mode 100644 index 0000000000000..6df4cf0dadbf9 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/61/data_features.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/61/data_list__iris_1_active.json.gz b/sklearn/datasets/tests/data/openml/61/data_list__iris_1_active.json.gz new file mode 100644 index 0000000000000..6dd5e202aeccc Binary files /dev/null and b/sklearn/datasets/tests/data/openml/61/data_list__iris_1_active.json.gz differ diff --git a/sklearn/datasets/tests/data/openml/61/data_list__iris_None_active.json.gz b/sklearn/datasets/tests/data/openml/61/data_list__iris_None_active.json.gz new file mode 100644 index 0000000000000..b1824cde71fe2 Binary files /dev/null and b/sklearn/datasets/tests/data/openml/61/data_list__iris_None_active.json.gz differ diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py new file mode 100644 index 0000000000000..7f978ef0595c6 --- /dev/null +++ b/sklearn/datasets/tests/test_openml.py @@ -0,0 +1,517 @@ +"""Test the openml loader. +""" +import gzip +import json +import numpy as np +import os +import scipy.sparse +import sklearn + +from sklearn.datasets import fetch_openml +from sklearn.datasets.openml import (_open_openml_url, + _get_data_description_by_id, + _download_data_arff) +from sklearn.utils.testing import (assert_warns_message, + assert_raise_message) +from sklearn.externals.six import string_types +from sklearn.externals.six.moves.urllib.error import HTTPError + + +currdir = os.path.dirname(os.path.abspath(__file__)) +# if True, urlopen will be monkey patched to only use local files +test_offline = True +test_gzip = True + + +def _test_features_list(data_id): + # XXX Test is intended to verify/ensure correct decoding behavior + # Not usable with sparse data or datasets that have columns marked as + # {row_identifier, ignore} + def decode_column(data_bunch, col_idx): + col_name = data_bunch.feature_names[col_idx] + if col_name in data_bunch.categories: + # XXX: This would be faster with np.take, although it does not + # handle missing values fast (also not with mode='wrap') + cat = data_bunch.categories[col_name] + result = [cat[idx] if 0 <= idx < len(cat) else None for idx in + data_bunch.data[:, col_idx].astype(int)] + return np.array(result, dtype='O') + else: + # non-nominal attribute + return data_bunch.data[:, col_idx] + + data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None) + + # also obtain decoded arff + data_description = _get_data_description_by_id(data_id, None) + sparse = data_description['format'].lower() == 'sparse_arff' + if sparse is True: + raise ValueError('This test is not intended for sparse data, to keep ' + 'code relatively simple') + data_arff = _download_data_arff(data_description['file_id'], + sparse, None, False) + data_downloaded = np.array(data_arff['data'], dtype='O') + + for i in range(len(data_bunch.feature_names)): + # XXX: Test per column, as this makes it easier to avoid problems with + # missing values + + np.testing.assert_array_equal(data_downloaded[:, i], + decode_column(data_bunch, i)) + + +def _fetch_dataset_from_openml(data_id, data_name, data_version, + 
target_column, + expected_observations, expected_features, + expected_missing, + expected_data_dtype, expected_target_dtype, + expect_sparse, compare_default_target): + # fetches a dataset in three various ways from OpenML, using the + # fetch_openml function, and does various checks on the validity of the + # result. Note that this function can be mocked (by invoking + # _monkey_patch_webbased_functions before invoking this function) + data_by_name_id = fetch_openml(name=data_name, version=data_version, + cache=False) + assert int(data_by_name_id.details['id']) == data_id + + fetch_openml(name=data_name, cache=False) + # without specifying the version, there is no guarantee that the data id + # will be the same + + # fetch with dataset id + data_by_id = fetch_openml(data_id=data_id, cache=False, + target_column=target_column) + assert data_by_id.details['name'] == data_name + assert data_by_id.data.shape == (expected_observations, expected_features) + if isinstance(target_column, str): + # single target, so target is vector + assert data_by_id.target.shape == (expected_observations, ) + elif isinstance(target_column, list): + # multi target, so target is array + assert data_by_id.target.shape == (expected_observations, + len(target_column)) + assert data_by_id.data.dtype == np.float64 + assert data_by_id.target.dtype == expected_target_dtype + assert len(data_by_id.feature_names) == expected_features + for feature in data_by_id.feature_names: + assert isinstance(feature, string_types) + + # TODO: pass in a list of expected nominal features + for feature, categories in data_by_id.categories.items(): + feature_idx = data_by_id.feature_names.index(feature) + values = np.unique(data_by_id.data[:, feature_idx]) + values = values[np.isfinite(values)] + assert set(values) <= set(range(len(categories))) + + if compare_default_target: + # check whether the data by id and data by id target are equal + data_by_id_default = fetch_openml(data_id=data_id, cache=False) + if data_by_id.data.dtype == np.float64: + np.testing.assert_allclose(data_by_id.data, + data_by_id_default.data) + else: + assert np.array_equal(data_by_id.data, data_by_id_default.data) + if data_by_id.target.dtype == np.float64: + np.testing.assert_allclose(data_by_id.target, + data_by_id_default.target) + else: + assert np.array_equal(data_by_id.target, data_by_id_default.target) + + if expect_sparse: + assert isinstance(data_by_id.data, scipy.sparse.csr_matrix) + else: + assert isinstance(data_by_id.data, np.ndarray) + # np.isnan doesn't work on CSR matrix + assert (np.count_nonzero(np.isnan(data_by_id.data)) == + expected_missing) + return data_by_id + + +def _monkey_patch_webbased_functions(context, data_id, gziped_files): + url_prefix_data_description = "https://openml.org/api/v1/json/data/" + url_prefix_data_features = "https://openml.org/api/v1/json/data/features/" + url_prefix_download_data = "https://openml.org/data/v1/" + url_prefix_data_list = "https://openml.org/api/v1/json/data/list/" + + path_suffix = '' + read_fn = open + if gziped_files: + path_suffix = '.gz' + read_fn = gzip.open + + def _mock_urlopen_data_description(url): + assert url.startswith(url_prefix_data_description) + + path = os.path.join(currdir, 'data', 'openml', str(data_id), + 'data_description.json%s' % path_suffix) + return read_fn(path, 'rb') + + def _mock_urlopen_data_features(url): + assert url.startswith(url_prefix_data_features) + + path = os.path.join(currdir, 'data', 'openml', str(data_id), + 'data_features.json%s' % path_suffix) + return 
read_fn(path, 'rb') + + def _mock_urlopen_download_data(url): + assert (url.startswith(url_prefix_download_data)) + + path = os.path.join(currdir, 'data', 'openml', str(data_id), + 'data.arff%s' % path_suffix) + return read_fn(path, 'rb') + + def _mock_urlopen_data_list(url): + # url contains key value pairs of attributes, e.g., + # openml.org/api/v1/json/data_name/iris/data_version/1 should + # ideally become {data_name: 'iris', data_version: '1'} + assert url.startswith(url_prefix_data_list) + att_list = url[len(url_prefix_data_list):].split('/') + key_val_dict = dict(zip(att_list[::2], att_list[1::2])) + # add defaults, so we can make assumptions about the content + if 'data_version' not in key_val_dict: + key_val_dict['data_version'] = None + if 'status' not in key_val_dict: + key_val_dict['status'] = "active" + mock_file = "data_list__%s_%s_%s.json%s" % \ + (key_val_dict['data_name'], key_val_dict['data_version'], + key_val_dict['status'], path_suffix) + json_file_path = os.path.join(currdir, 'data', 'openml', + str(data_id), mock_file) + # load the file itself, to simulate a http error + json_data = json.loads(read_fn(json_file_path, 'rb'). + read().decode('utf-8')) + if 'error' in json_data: + raise HTTPError(url=None, code=412, + msg='Simulated mock error', + hdrs=None, fp=None) + return read_fn(json_file_path, 'rb') + + def _mock_urlopen(url): + if url.startswith(url_prefix_data_list): + return _mock_urlopen_data_list(url) + elif url.startswith(url_prefix_data_features): + return _mock_urlopen_data_features(url) + elif url.startswith(url_prefix_download_data): + return _mock_urlopen_download_data(url) + elif url.startswith(url_prefix_data_description): + return _mock_urlopen_data_description(url) + else: + raise ValueError('Unknown mocking URL pattern: %s' % url) + + # XXX: Global variable + if test_offline: + context.setattr(sklearn.datasets.openml, 'urlopen', _mock_urlopen) + + +def test_fetch_openml_iris(monkeypatch): + # classification dataset with numeric only columns + data_id = 61 + data_name = 'iris' + data_version = 1 + target_column = 'class' + expected_observations = 150 + expected_features = 4 + expected_missing = 0 + + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, + expected_observations, expected_features, + expected_missing, + np.float64, object, expect_sparse=False, + compare_default_target=True) + + +def test_decode_iris(): + data_id = 61 + _test_features_list(data_id) + + +def test_fetch_openml_iris_multitarget(monkeypatch): + # classification dataset with numeric only columns + data_id = 61 + data_name = 'iris' + data_version = 1 + target_column = ['sepallength', 'sepalwidth'] + expected_observations = 150 + expected_features = 3 + expected_missing = 0 + + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, + expected_observations, expected_features, + expected_missing, + object, np.float64, expect_sparse=False, + compare_default_target=False) + + +def test_fetch_openml_anneal(monkeypatch): + # classification dataset with numeric and categorical columns + data_id = 2 + data_name = 'anneal' + data_version = 1 + target_column = 'class' + # Not all original instances included for space reasons + expected_observations = 11 + expected_features = 38 + expected_missing = 267 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + _fetch_dataset_from_openml(data_id, 
data_name, data_version, target_column, + expected_observations, expected_features, + expected_missing, + object, object, expect_sparse=False, + compare_default_target=True) + + +def test_decode_anneal(): + data_id = 2 + _test_features_list(data_id) + + +def test_fetch_openml_anneal_multitarget(monkeypatch): + # classification dataset with numeric and categorical columns + data_id = 2 + data_name = 'anneal' + data_version = 1 + target_column = ['class', 'product-type', 'shape'] + # Not all original instances included for space reasons + expected_observations = 11 + expected_features = 36 + expected_missing = 267 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, + expected_observations, expected_features, + expected_missing, + object, object, expect_sparse=False, + compare_default_target=False) + + +def test_fetch_openml_cpu(monkeypatch): + # regression dataset with numeric and categorical columns + data_id = 561 + data_name = 'cpu' + data_version = 1 + target_column = 'class' + expected_observations = 209 + expected_features = 7 + expected_missing = 0 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, + expected_observations, expected_features, + expected_missing, + object, np.float64, expect_sparse=False, + compare_default_target=True) + + +def test_decode_cpu(): + data_id = 561 + _test_features_list(data_id) + + +def test_fetch_openml_australian(monkeypatch): + # sparse dataset + # Australian is the only sparse dataset that is reasonably small + # as it is inactive, we need to catch the warning. Due to mocking + # framework, it is not deactivated in our tests + data_id = 292 + data_name = 'Australian' + data_version = 1 + target_column = 'Y' + # Not all original instances included for space reasons + expected_observations = 85 + expected_features = 14 + expected_missing = 0 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + assert_warns_message( + UserWarning, + "Version 1 of dataset Australian is inactive,", + _fetch_dataset_from_openml, + **{'data_id': data_id, 'data_name': data_name, + 'data_version': data_version, + 'target_column': target_column, + 'expected_observations': expected_observations, + 'expected_features': expected_features, + 'expected_missing': expected_missing, + 'expect_sparse': True, + 'expected_data_dtype': np.float64, + 'expected_target_dtype': object, + 'compare_default_target': False} # numpy specific check + ) + + +def test_fetch_openml_miceprotein(monkeypatch): + # JvR: very important check, as this dataset defined several row ids + # and ignore attributes. 
Note that data_features json has 82 attributes, + # and row id (1), ignore attributes (3) have been removed (and target is + # stored in data.target) + data_id = 40966 + data_name = 'MiceProtein' + data_version = 4 + target_column = 'class' + # Not all original instances included for space reasons + expected_observations = 7 + expected_features = 77 + expected_missing = 7 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, + expected_observations, expected_features, + expected_missing, + np.float64, object, expect_sparse=False, + compare_default_target=True) + + +def test_fetch_openml_emotions(monkeypatch): + # classification dataset with multiple targets (natively) + data_id = 40589 + data_name = 'emotions' + data_version = 3 + target_column = ['amazed.suprised', 'happy.pleased', 'relaxing.calm', + 'quiet.still', 'sad.lonely', 'angry.aggresive'] + expected_observations = 13 + expected_features = 72 + expected_missing = 0 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + + _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, + expected_observations, expected_features, + expected_missing, + np.float64, object, expect_sparse=False, + compare_default_target=True) + + +def test_decode_emotions(): + data_id = 40589 + _test_features_list(data_id) + + +def test_open_openml_url_cache(monkeypatch): + data_id = 61 + + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id) + test_directory = os.path.join(os.path.expanduser('~'), 'scikit_learn_data') + # first fill the cache + response1 = _open_openml_url(openml_path, test_directory) + # assert file exists + location = os.path.join(test_directory, 'openml.org', openml_path + '.gz') + assert os.path.isfile(location) + # redownload, to utilize cache + response2 = _open_openml_url(openml_path, test_directory) + assert response1.read() == response2.read() + + +def test_fetch_openml_notarget(monkeypatch): + data_id = 61 + target_column = None + expected_observations = 150 + expected_features = 5 + + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + data = fetch_openml(data_id=data_id, target_column=target_column, + cache=False) + assert data.data.shape == (expected_observations, expected_features) + assert data.target is None + + +def test_fetch_openml_inactive(monkeypatch): + # fetch inactive dataset by id + data_id = 40675 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + glas2 = assert_warns_message( + UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, + data_id=data_id, cache=False) + # fetch inactive dataset by name and version + assert glas2.data.shape == (163, 9) + glas2_by_version = assert_warns_message( + UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, + data_id=None, name="glass2", version=1, cache=False) + assert int(glas2_by_version.details['id']) == data_id + + +def test_fetch_nonexiting(monkeypatch): + # there is no active version of glass2 + data_id = 40675 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + # Note that we only want to search by name (not data id) + assert_raise_message(ValueError, "No active dataset glass2 found", + fetch_openml, name='glass2', cache=False) + + +def test_raises_illegal_multitarget(monkeypatch): + data_id = 61 + targets = ['sepalwidth', 'class'] + _monkey_patch_webbased_functions(monkeypatch, 
data_id, test_gzip) + # Note that we only want to search by name (not data id) + assert_raise_message(ValueError, + "Can only handle homogeneous multi-target datasets,", + fetch_openml, data_id=data_id, + target_column=targets, cache=False) + + +def test_warn_ignore_attribute(monkeypatch): + data_id = 40966 + expected_row_id_msg = "target_column={} has flag is_row_identifier." + expected_ignore_msg = "target_column={} has flag is_ignore." + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + # single column test + assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), + fetch_openml, data_id=data_id, + target_column='MouseID', + cache=False) + assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), + fetch_openml, data_id=data_id, + target_column='Genotype', + cache=False) + # multi column test + assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), + fetch_openml, data_id=data_id, + target_column=['MouseID', 'class'], + cache=False) + assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), + fetch_openml, data_id=data_id, + target_column=['Genotype', 'class'], + cache=False) + + +def test_string_attribute(monkeypatch): + data_id = 40945 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + # single column test + assert_raise_message(ValueError, + 'STRING attributes are not yet supported', + fetch_openml, data_id=data_id, cache=False) + + +def test_illegal_column(monkeypatch): + data_id = 61 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + assert_raise_message(KeyError, "Could not find target_column=", + fetch_openml, data_id=data_id, + target_column='undefined', cache=False) + + assert_raise_message(KeyError, "Could not find target_column=", + fetch_openml, data_id=data_id, + target_column=['undefined', 'class'], + cache=False) + + +def test_fetch_openml_raises_missing_values_target(monkeypatch): + data_id = 2 + _monkey_patch_webbased_functions(monkeypatch, data_id, test_gzip) + assert_raise_message(ValueError, "Target column ", + fetch_openml, data_id=data_id, target_column='family') + + +def test_fetch_openml_raises_illegal_argument(): + assert_raise_message(ValueError, "Dataset data_id=", + fetch_openml, data_id=-1, name="name") + + assert_raise_message(ValueError, "Dataset data_id=", + fetch_openml, data_id=-1, name=None, + version="version") + + assert_raise_message(ValueError, "Dataset data_id=", + fetch_openml, data_id=-1, name="name", + version="version") + + assert_raise_message(ValueError, "Neither name nor data_id are provided. 
" + "Please provide name or data_id.", fetch_openml) diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py new file mode 100644 index 0000000000000..6225dfc3691c3 --- /dev/null +++ b/sklearn/externals/_arff.py @@ -0,0 +1,1059 @@ +# -*- coding: utf-8 -*- +# ============================================================================= +# Federal University of Rio Grande do Sul (UFRGS) +# Connectionist Artificial Intelligence Laboratory (LIAC) +# Renato de Pontes Pereira - rppereira@inf.ufrgs.br +# ============================================================================= +# Copyright (c) 2011 Renato de Pontes Pereira, renato.ppontes at gmail dot com +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================= + +''' +The liac-arff module implements functions to read and write ARFF files in +Python. It was created in the Connectionist Artificial Intelligence Laboratory +(LIAC), which takes place at the Federal University of Rio Grande do Sul +(UFRGS), in Brazil. + +ARFF (Attribute-Relation File Format) is an file format specially created for +describe datasets which are commonly used for machine learning experiments and +softwares. This file format was created to be used in Weka, the best +representative software for machine learning automated experiments. + +An ARFF file can be divided into two sections: header and data. The Header +describes the metadata of the dataset, including a general description of the +dataset, its name and its attributes. The source below is an example of a +header section in a XOR dataset:: + + % + % XOR Dataset + % + % Created by Renato Pereira + % rppereira@inf.ufrgs.br + % http://inf.ufrgs.br/~rppereira + % + % + @RELATION XOR + + @ATTRIBUTE input1 REAL + @ATTRIBUTE input2 REAL + @ATTRIBUTE y REAL + +The Data section of an ARFF file describes the observations of the dataset, in +the case of XOR dataset:: + + @DATA + 0.0,0.0,0.0 + 0.0,1.0,1.0 + 1.0,0.0,1.0 + 1.0,1.0,0.0 + % + % + % + +Notice that several lines are starting with an ``%`` symbol, denoting a +comment, thus, lines with ``%`` at the beginning will be ignored, except by the +description part at the beginning of the file. The declarations ``@RELATION``, +``@ATTRIBUTE``, and ``@DATA`` are all case insensitive and obligatory. 
+ +For more information and details about the ARFF file description, consult +http://www.cs.waikato.ac.nz/~ml/weka/arff.html + + +ARFF Files in Python +~~~~~~~~~~~~~~~~~~~~ + +This module uses built-ins python objects to represent a deserialized ARFF +file. A dictionary is used as the container of the data and metadata of ARFF, +and have the following keys: + +- **description**: (OPTIONAL) a string with the description of the dataset. +- **relation**: (OBLIGATORY) a string with the name of the dataset. +- **attributes**: (OBLIGATORY) a list of attributes with the following + template:: + + (attribute_name, attribute_type) + + the attribute_name is a string, and attribute_type must be an string + or a list of strings. +- **data**: (OBLIGATORY) a list of data instances. Each data instance must be + a list with values, depending on the attributes. + +The above keys must follow the case which were described, i.e., the keys are +case sensitive. The attribute type ``attribute_type`` must be one of these +strings (they are not case sensitive): ``NUMERIC``, ``INTEGER``, ``REAL`` or +``STRING``. For nominal attributes, the ``atribute_type`` must be a list of +strings. + +In this format, the XOR dataset presented above can be represented as a python +object as:: + + xor_dataset = { + 'description': 'XOR Dataset', + 'relation': 'XOR', + 'attributes': [ + ('input1', 'REAL'), + ('input2', 'REAL'), + ('y', 'REAL'), + ], + 'data': [ + [0.0, 0.0, 0.0], + [0.0, 1.0, 1.0], + [1.0, 0.0, 1.0], + [1.0, 1.0, 0.0] + ] + } + + +Features +~~~~~~~~ + +This module provides several features, including: + +- Read and write ARFF files using python built-in structures, such dictionaries + and lists; +- Supports `scipy.sparse.coo `_ + and lists of dictionaries as used by SVMLight +- Supports the following attribute types: NUMERIC, REAL, INTEGER, STRING, and + NOMINAL; +- Has an interface similar to other built-in modules such as ``json``, or + ``zipfile``; +- Supports read and write the descriptions of files; +- Supports missing values and names with spaces; +- Supports unicode values and names; +- Fully compatible with Python 2.7+, Python 3.3+, pypy and pypy3; +- Under `MIT License `_ + +''' +__author__ = 'Renato de Pontes Pereira, Matthias Feurer, Joel Nothman' +__author_email__ = ('renato.ppontes@gmail.com, ' + 'feurerm@informatik.uni-freiburg.de, ' + 'joel.nothman@gmail.com') +__version__ = '2.3' + +import re +import sys +import csv + +# CONSTANTS =================================================================== +_SIMPLE_TYPES = ['NUMERIC', 'REAL', 'INTEGER', 'STRING'] + +_TK_DESCRIPTION = '%' +_TK_COMMENT = '%' +_TK_RELATION = '@RELATION' +_TK_ATTRIBUTE = '@ATTRIBUTE' +_TK_DATA = '@DATA' + +_RE_RELATION = re.compile(r'^([^\{\}%,\s]*|\".*\"|\'.*\')$', re.UNICODE) +_RE_ATTRIBUTE = re.compile(r'^(\".*\"|\'.*\'|[^\{\}%,\s]*)\s+(.+)$', re.UNICODE) +_RE_TYPE_NOMINAL = re.compile(r'^\{\s*((\".*\"|\'.*\'|\S*)\s*,\s*)*(\".*\"|\'.*\'|\S*)\s*\}$', re.UNICODE) +_RE_QUOTE_CHARS = re.compile(r'["\'\\ \t%,]') +_RE_ESCAPE_CHARS = re.compile(r'(?=["\'\\%])') # don't need to capture anything +_RE_SPARSE_LINE = re.compile(r'^\{.*\}$') +_RE_NONTRIVIAL_DATA = re.compile('["\'{}\\s]') + + +def _build_re_values(): + quoted_re = r'''(?x) + " # open quote followed by zero or more of: + (?: + (?= len(conversors): + raise BadDataFormat(s) + # XXX: int 0 is used for implicit values, not '0' + values = [values[i] if i in values else 0 for i in + xrange(len(conversors))] + else: + if len(values) != len(conversors): + raise BadDataFormat(s) + + 
self.data.append(self._decode_values(values, conversors)) + + @staticmethod + def _decode_values(values, conversors): + try: + values = [None if value is None else conversor(value) + for conversor, value + in zip(conversors, values)] + except ValueError as exc: + if 'float: ' in str(exc): + raise BadNumericalValue() + return values + + def _tuplify_sparse_data(self, x): + if len(x) != 2: + raise BadDataFormat(x) + return (int(x[0].strip('"').strip("'")), x[1]) + + def encode_data(self, data, attributes): + '''(INTERNAL) Encodes a line of data. + + Data instances follow the csv format, i.e, attribute values are + delimited by commas. After converted from csv. + + :param data: a list of values. + :param attributes: a list of attributes. Used to check if data is valid. + :return: a string with the encoded data line. + ''' + current_row = 0 + + for inst in data: + if len(inst) != len(attributes): + raise BadObject( + 'Instance %d has %d attributes, expected %d' % + (current_row, len(inst), len(attributes)) + ) + + new_data = [] + for value in inst: + if value is None or value == u'' or value != value: + s = '?' + else: + s = encode_string(unicode(value)) + new_data.append(s) + + current_row += 1 + yield u','.join(new_data) + +class COOData(Data): + def __init__(self): + self.data = ([], [], []) + self._current_num_data_points = 0 + + def decode_data(self, s, conversors): + values = _parse_values(s) + + if not isinstance(values, dict): + raise BadLayout() + if not values: + self._current_num_data_points += 1 + return + col, values = zip(*sorted(values.items())) + try: + values = [value if value is None else conversors[key](value) + for key, value in zip(col, values)] + except ValueError as exc: + if 'float: ' in str(exc): + raise BadNumericalValue() + raise + except IndexError: + # conversor out of range + raise BadDataFormat(s) + self.data[0].extend(values) + self.data[1].extend([self._current_num_data_points] * len(values)) + self.data[2].extend(col) + + self._current_num_data_points += 1 + + def encode_data(self, data, attributes): + num_attributes = len(attributes) + new_data = [] + current_row = 0 + + row = data.row + col = data.col + data = data.data + + # Check if the rows are sorted + if not all(row[i] <= row[i + 1] for i in xrange(len(row) - 1)): + raise ValueError("liac-arff can only output COO matrices with " + "sorted rows.") + + for v, col, row in zip(data, col, row): + if row > current_row: + # Add empty rows if necessary + while current_row < row: + yield " ".join([u"{", u','.join(new_data), u"}"]) + new_data = [] + current_row += 1 + + if col >= num_attributes: + raise BadObject( + 'Instance %d has at least %d attributes, expected %d' % + (current_row, col + 1, num_attributes) + ) + + if v is None or v == u'' or v != v: + s = '?' 
+ else: + s = encode_string(unicode(v)) + new_data.append("%d %s" % (col, s)) + + yield " ".join([u"{", u','.join(new_data), u"}"]) + +class LODData(Data): + def __init__(self): + self.data = [] + + def decode_data(self, s, conversors): + values = _parse_values(s) + n_conversors = len(conversors) + + if not isinstance(values, dict): + raise BadLayout() + try: + self.data.append({key: None if value is None else conversors[key](value) + for key, value in values.items()}) + except ValueError as exc: + if 'float: ' in str(exc): + raise BadNumericalValue() + raise + except IndexError: + # conversor out of range + raise BadDataFormat(s) + + def encode_data(self, data, attributes): + current_row = 0 + + num_attributes = len(attributes) + for row in data: + new_data = [] + + if len(row) > 0 and max(row) >= num_attributes: + raise BadObject( + 'Instance %d has %d attributes, expected %d' % + (current_row, max(row) + 1, num_attributes) + ) + + for col in sorted(row): + v = row[col] + if v is None or v == u'' or v != v: + s = '?' + else: + s = encode_string(unicode(v)) + new_data.append("%d %s" % (col, s)) + + current_row += 1 + yield " ".join([u"{", u','.join(new_data), u"}"]) + +def _get_data_object_for_decoding(matrix_type): + if matrix_type == DENSE: + return Data() + elif matrix_type == COO: + return COOData() + elif matrix_type == LOD: + return LODData() + else: + raise ValueError("Matrix type %s not supported." % str(matrix_type)) + +def _get_data_object_for_encoding(matrix): + # Probably a scipy.sparse + if hasattr(matrix, 'format'): + if matrix.format == 'coo': + return COOData() + else: + raise ValueError('Cannot guess matrix format!') + elif isinstance(matrix[0], dict): + return LODData() + else: + return Data() + +# ============================================================================= + +# ADVANCED INTERFACE ========================================================== +class ArffDecoder(object): + '''An ARFF decoder.''' + + def __init__(self): + '''Constructor.''' + self._conversors = [] + self._current_line = 0 + + def _decode_comment(self, s): + '''(INTERNAL) Decodes a comment line. + + Comments are single line strings starting, obligatorily, with the ``%`` + character, and can have any symbol, including whitespaces or special + characters. + + This method must receive a normalized string, i.e., a string without + padding, including the "\r\n" characters. + + :param s: a normalized string. + :return: a string with the decoded comment. + ''' + res = re.sub('^\%( )?', '', s) + return res + + def _decode_relation(self, s): + '''(INTERNAL) Decodes a relation line. + + The relation declaration is a line with the format ``@RELATION + ``, where ``relation-name`` is a string. The string must + start with alphabetic character and must be quoted if the name includes + spaces, otherwise this method will raise a `BadRelationFormat` exception. + + This method must receive a normalized string, i.e., a string without + padding, including the "\r\n" characters. + + :param s: a normalized string. + :return: a string with the decoded relation name. + ''' + _, v = s.split(' ', 1) + v = v.strip() + + if not _RE_RELATION.match(v): + raise BadRelationFormat() + + res = unicode(v.strip('"\'')) + return res + + def _decode_attribute(self, s): + '''(INTERNAL) Decodes an attribute line. + + The attribute is the most complex declaration in an arff file. 
All + attributes must follow the template:: + + @attribute + + where ``attribute-name`` is a string, quoted if the name contains any + whitespace, and ``datatype`` can be: + + - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``. + - Strings as ``STRING``. + - Dates (NOT IMPLEMENTED). + - Nominal attributes with format: + + {, , , ...} + + The nominal names follow the rules for the attribute names, i.e., they + must be quoted if the name contains whitespaces. + + This method must receive a normalized string, i.e., a string without + padding, including the "\r\n" characters. + + :param s: a normalized string. + :return: a tuple (ATTRIBUTE_NAME, TYPE_OR_VALUES). + ''' + _, v = s.split(' ', 1) + v = v.strip() + + # Verify the general structure of declaration + m = _RE_ATTRIBUTE.match(v) + if not m: + raise BadAttributeFormat() + + # Extracts the raw name and type + name, type_ = m.groups() + + # Extracts the final name + name = unicode(name.strip('"\'')) + + # Extracts the final type + if _RE_TYPE_NOMINAL.match(type_): + try: + type_ = _parse_values(type_.strip('{} ')) + except Exception: + raise BadAttributeType() + if isinstance(type_, dict): + raise BadAttributeType() + + else: + # If not nominal, verify the type name + type_ = unicode(type_).upper() + if type_ not in ['NUMERIC', 'REAL', 'INTEGER', 'STRING']: + raise BadAttributeType() + + return (name, type_) + + def _decode(self, s, encode_nominal=False, matrix_type=DENSE): + '''Do the job the ``encode``.''' + + # Make sure this method is idempotent + self._current_line = 0 + + # If string, convert to a list of lines + if isinstance(s, basestring): + s = s.strip('\r\n ').replace('\r\n', '\n').split('\n') + + # Create the return object + obj = { + u'description': u'', + u'relation': u'', + u'attributes': [], + u'data': [] + } + attribute_names = {} + + # Create the data helper object + data = _get_data_object_for_decoding(matrix_type) + + # Read all lines + STATE = _TK_DESCRIPTION + for row in s: + self._current_line += 1 + # Ignore empty lines + row = row.strip(' \r\n') + if not row: continue + + u_row = row.upper() + + # DESCRIPTION ----------------------------------------------------- + if u_row.startswith(_TK_DESCRIPTION) and STATE == _TK_DESCRIPTION: + obj['description'] += self._decode_comment(row) + '\n' + # ----------------------------------------------------------------- + + # RELATION -------------------------------------------------------- + elif u_row.startswith(_TK_RELATION): + if STATE != _TK_DESCRIPTION: + raise BadLayout() + + STATE = _TK_RELATION + obj['relation'] = self._decode_relation(row) + # ----------------------------------------------------------------- + + # ATTRIBUTE ------------------------------------------------------- + elif u_row.startswith(_TK_ATTRIBUTE): + if STATE != _TK_RELATION and STATE != _TK_ATTRIBUTE: + raise BadLayout() + + STATE = _TK_ATTRIBUTE + + attr = self._decode_attribute(row) + if attr[0] in attribute_names: + raise BadAttributeName(attr[0], attribute_names[attr[0]]) + else: + attribute_names[attr[0]] = self._current_line + obj['attributes'].append(attr) + + if isinstance(attr[1], (list, tuple)): + if encode_nominal: + conversor = EncodedNominalConversor(attr[1]) + else: + conversor = NominalConversor(attr[1]) + else: + CONVERSOR_MAP = {'STRING': unicode, + 'INTEGER': lambda x: int(float(x)), + 'NUMERIC': float, + 'REAL': float} + conversor = CONVERSOR_MAP[attr[1]] + + self._conversors.append(conversor) + # ----------------------------------------------------------------- + + # 
DATA ------------------------------------------------------------ + elif u_row.startswith(_TK_DATA): + if STATE != _TK_ATTRIBUTE: + raise BadLayout() + + STATE = _TK_DATA + # ----------------------------------------------------------------- + + # COMMENT --------------------------------------------------------- + elif u_row.startswith(_TK_COMMENT): + pass + # ----------------------------------------------------------------- + + # DATA INSTANCES -------------------------------------------------- + elif STATE == _TK_DATA: + data.decode_data(row, self._conversors) + # ----------------------------------------------------------------- + + # UNKNOWN INFORMATION --------------------------------------------- + else: + raise BadLayout() + # ----------------------------------------------------------------- + + # Alter the data object + obj['data'] = data.data + if obj['description'].endswith('\n'): + obj['description'] = obj['description'][:-1] + + return obj + + def decode(self, s, encode_nominal=False, return_type=DENSE): + '''Returns the Python representation of a given ARFF file. + + When a file object is passed as an argument, this method reads lines + iteratively, avoiding to load unnecessary information to the memory. + + :param s: a string or file object with the ARFF file. + :param encode_nominal: boolean, if True perform a label encoding + while reading the .arff file. + :param return_type: determines the data structure used to store the + dataset. Can be one of `arff.DENSE`, `arff.COO` and `arff.LOD`. + Consult the section on `working with sparse data`_ + ''' + try: + return self._decode(s, encode_nominal=encode_nominal, + matrix_type=return_type) + except ArffException as e: + e.line = self._current_line + raise e + + +class ArffEncoder(object): + '''An ARFF encoder.''' + + def _encode_comment(self, s=''): + '''(INTERNAL) Encodes a comment line. + + Comments are single line strings starting, obligatorily, with the ``%`` + character, and can have any symbol, including whitespaces or special + characters. + + If ``s`` is None, this method will simply return an empty comment. + + :param s: (OPTIONAL) string. + :return: a string with the encoded comment line. + ''' + if s: + return u'%s %s'%(_TK_COMMENT, s) + else: + return u'%s' % _TK_COMMENT + + def _encode_relation(self, name): + '''(INTERNAL) Decodes a relation line. + + The relation declaration is a line with the format ``@RELATION + ``, where ``relation-name`` is a string. + + :param name: a string. + :return: a string with the encoded relation declaration. + ''' + for char in ' %{},': + if char in name: + name = '"%s"'%name + break + + return u'%s %s'%(_TK_RELATION, name) + + def _encode_attribute(self, name, type_): + '''(INTERNAL) Encodes an attribute line. + + The attribute follow the template:: + + @attribute + + where ``attribute-name`` is a string, and ``datatype`` can be: + + - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``. + - Strings as ``STRING``. + - Dates (NOT IMPLEMENTED). + - Nominal attributes with format: + + {, , , ...} + + This method must receive a the name of the attribute and its type, if + the attribute type is nominal, ``type`` must be a list of values. + + :param name: a string. + :param type_: a string or a list of string. + :return: a string with the encoded attribute declaration. 
+ ''' + for char in ' %{},': + if char in name: + name = '"%s"'%name + break + + if isinstance(type_, (tuple, list)): + type_tmp = [] + for i in range(len(type_)): + type_tmp.append(u'%s' % encode_string(type_[i])) + type_ = u'{%s}'%(u', '.join(type_tmp)) + + return u'%s %s %s'%(_TK_ATTRIBUTE, name, type_) + + def encode(self, obj): + '''Encodes a given object to an ARFF file. + + :param obj: the object containing the ARFF information. + :return: the ARFF file as an unicode string. + ''' + data = [row for row in self.iter_encode(obj)] + + return u'\n'.join(data) + + def iter_encode(self, obj): + '''The iterative version of `arff.ArffEncoder.encode`. + + This encodes iteratively a given object and return, one-by-one, the + lines of the ARFF file. + + :param obj: the object containing the ARFF information. + :return: (yields) the ARFF file as unicode strings. + ''' + # DESCRIPTION + if obj.get('description', None): + for row in obj['description'].split('\n'): + yield self._encode_comment(row) + + # RELATION + if not obj.get('relation'): + raise BadObject('Relation name not found or with invalid value.') + + yield self._encode_relation(obj['relation']) + yield u'' + + # ATTRIBUTES + if not obj.get('attributes'): + raise BadObject('Attributes not found.') + + attribute_names = set() + for attr in obj['attributes']: + # Verify for bad object format + if not isinstance(attr, (tuple, list)) or \ + len(attr) != 2 or \ + not isinstance(attr[0], basestring): + raise BadObject('Invalid attribute declaration "%s"'%str(attr)) + + if isinstance(attr[1], basestring): + # Verify for invalid types + if attr[1] not in _SIMPLE_TYPES: + raise BadObject('Invalid attribute type "%s"'%str(attr)) + + # Verify for bad object format + elif not isinstance(attr[1], (tuple, list)): + raise BadObject('Invalid attribute type "%s"'%str(attr)) + + # Verify attribute name is not used twice + if attr[0] in attribute_names: + raise BadObject('Trying to use attribute name "%s" for the ' + 'second time.' % str(attr[0])) + else: + attribute_names.add(attr[0]) + + yield self._encode_attribute(attr[0], attr[1]) + yield u'' + attributes = obj['attributes'] + + # DATA + yield _TK_DATA + if 'data' in obj: + data = _get_data_object_for_encoding(obj.get('data')) + for line in data.encode_data(obj.get('data'), attributes): + yield line + + yield u'' + +# ============================================================================= + +# BASIC INTERFACE ============================================================= +def load(fp, encode_nominal=False, return_type=DENSE): + '''Load a file-like object containing the ARFF document and convert it into + a Python object. + + :param fp: a file-like object. + :param encode_nominal: boolean, if True perform a label encoding + while reading the .arff file. + :param return_type: determines the data structure used to store the + dataset. Can be one of `arff.DENSE`, `arff.COO` and `arff.LOD`. + Consult the section on `working with sparse data`_ + :return: a dictionary. + ''' + decoder = ArffDecoder() + return decoder.decode(fp, encode_nominal=encode_nominal, + return_type=return_type) + +def loads(s, encode_nominal=False, return_type=DENSE): + '''Convert a string instance containing the ARFF document into a Python + object. + + :param s: a string object. + :param encode_nominal: boolean, if True perform a label encoding + while reading the .arff file. + :param return_type: determines the data structure used to store the + dataset. Can be one of `arff.DENSE`, `arff.COO` and `arff.LOD`. 
+ Consult the section on `working with sparse data`_ + :return: a dictionary. + ''' + decoder = ArffDecoder() + return decoder.decode(s, encode_nominal=encode_nominal, + return_type=return_type) + +def dump(obj, fp): + '''Serialize an object representing the ARFF document to a given file-like + object. + + :param obj: a dictionary. + :param fp: a file-like object. + ''' + encoder = ArffEncoder() + generator = encoder.iter_encode(obj) + + last_row = next(generator) + for row in generator: + fp.write(last_row + u'\n') + last_row = row + fp.write(last_row) + + return fp + +def dumps(obj): + '''Serialize an object representing the ARFF document, returning a string. + + :param obj: a dictionary. + :return: a string with the ARFF document. + ''' + encoder = ArffEncoder() + return encoder.encode(obj) +# =============================================================================
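
As a quick orientation to the vendored module above: it exposes a ``json``-like basic interface (``load``/``loads``/``dump``/``dumps``) on top of the ``ArffDecoder``/``ArffEncoder`` classes. The sketch below, adapted from the XOR example in the module docstring, shows a dense decode/encode round trip. It is an illustration only, not part of the patch, and it assumes the module's Python 2/3 compatibility aliases (``unicode``, ``basestring``, ``xrange``), which are defined in a portion of the file not shown here::

    import sklearn.externals._arff as arff

    # The XOR dataset from the module docstring, as a single ARFF document.
    xor_arff = '\n'.join([
        '% XOR Dataset',
        '@RELATION XOR',
        '',
        '@ATTRIBUTE input1 REAL',
        '@ATTRIBUTE input2 REAL',
        '@ATTRIBUTE y REAL',
        '',
        '@DATA',
        '0.0,0.0,0.0',
        '0.0,1.0,1.0',
        '1.0,0.0,1.0',
        '1.0,1.0,0.0',
    ])

    # Decode into the dictionary layout documented above
    # (description / relation / attributes / data).
    xor = arff.loads(xor_arff)
    assert xor['relation'] == 'XOR'
    assert xor['attributes'][0] == ('input1', 'REAL')
    assert xor['data'][1] == [0.0, 1.0, 1.0]

    # Encode back to an ARFF string; load()/dump() do the same for
    # file-like objects.
    roundtrip = arff.dumps(xor)
    assert arff.loads(roundtrip)['data'] == xor['data']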
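
Beyond the dense representation, ``ArffDecoder.decode`` also accepts ``return_type=arff.COO`` (data returned as a ``(values, rows, columns)`` triplet built by ``COOData``) and ``return_type=arff.LOD`` (a list of ``{column: value}`` dictionaries built by ``LODData``). A minimal sketch of the COO path follows; it is likewise illustrative only, the tiny dataset and explicit shape are made up for the example, and it assumes that ``_parse_values`` (defined in a part of the file not shown here) returns a ``{column_index: raw_value}`` dict for sparse ``{...}`` data lines::

    import scipy.sparse

    import sklearn.externals._arff as arff

    sparse_arff = '\n'.join([
        '@RELATION sparse_example',
        '',
        '@ATTRIBUTE a NUMERIC',
        '@ATTRIBUTE b NUMERIC',
        '@ATTRIBUTE c NUMERIC',
        '',
        '@DATA',
        '{0 1.0, 2 3.0}',
        '{1 2.0}',
    ])

    decoded = arff.loads(sparse_arff, return_type=arff.COO)
    values, rows, columns = decoded['data']

    # The triplet does not carry the matrix shape, so it is passed explicitly
    # when rebuilding a scipy COO matrix from the decoded data.
    X = scipy.sparse.coo_matrix((values, (rows, columns)), shape=(2, 3))
    assert X.toarray().tolist() == [[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]]

``_get_data_object_for_decoding`` dispatches between these three containers, while ``_get_data_object_for_encoding`` guesses the container from the object handed to ``dump``/``dumps`` (a ``scipy.sparse`` COO matrix, a list of dictionaries, or plain nested lists).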