[MRG+1] added return_X_y option to toy datasets in sklearn.datasets (scikit-learn#7154)

manu-chroma · agramfort · commit 8994d0ef61f6 · 2016-08-07T15:07:07.000+02:00
* added return_X_y support to more dataset loaders

* fix typo

* updated whats_new.rst

* fix indentation for version added tag

* call astype before the branching

* better formatting in whats_new.rst

* better formatting

* updated what's new
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -230,11 +230,16 @@ Enhancements
      (`#6846 <https://github.com/scikit-learn/scikit-learn/pull/6846>`_)
      By `Sebastian Säger`_ and `YenChen Lin`_.
 
-   - Added new return type ``(data, target)`` : tuple option to
-     :func:`load_iris` dataset, 
-     (`#7049 <https://github.com/scikit-learn/scikit-learn/pull/7049>`_)
+   - Added the ``return_X_y`` parameter, which returns a ``(data, target)`` tuple
+     instead of a Bunch object, to the :func:`load_iris` dataset
+     `#7049 <https://github.com/scikit-learn/scikit-learn/pull/7049>`_,
      :func:`load_breast_cancer` dataset
-     (`#7152 <https://github.com/scikit-learn/scikit-learn/pull/7152>`_) by
+     `#7152 <https://github.com/scikit-learn/scikit-learn/pull/7152>`_,
+     :func:`load_digits` dataset,
+     :func:`load_diabetes` dataset,
+     :func:`load_linnerud` dataset,
+     :func:`load_boston` dataset
+     `#7154 <https://github.com/scikit-learn/scikit-learn/pull/7154>`_ by
      `Manvendra Singh`_.
 
 Bug fixes
diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
@@ -264,7 +264,7 @@ def load_iris(return_X_y=False):
         If True, returns ``(data, target)`` instead of a Bunch object.
         See below for more information about the `data` and `target` object.
 
-    .. versionadded:: 0.18
+        .. versionadded:: 0.18
 
     Returns
     -------
@@ -277,7 +277,7 @@ def load_iris(return_X_y=False):
 
     (data, target) : tuple if ``return_X_y`` is True
 
-    .. versionadded:: 0.18
+        .. versionadded:: 0.18
 
     Examples
     --------
@@ -338,7 +338,7 @@ def load_breast_cancer(return_X_y=False):
         If True, returns ``(data, target)`` instead of a Bunch object.
         See below for more information about the `data` and `target` object.
 
-    .. versionadded:: 0.18
+        .. versionadded:: 0.18
 
     Returns
     -------
@@ -351,7 +351,7 @@ def load_breast_cancer(return_X_y=False):
 
     (data, target) : tuple if ``return_X_y`` is True
 
-    .. versionadded:: 0.18
+        .. versionadded:: 0.18
 
     The copy of UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is
     downloaded from:
@@ -411,7 +411,7 @@ def load_breast_cancer(return_X_y=False):
                  feature_names=feature_names)
 
 
-def load_digits(n_class=10):
+def load_digits(n_class=10, return_X_y=False):
     """Load and return the digits dataset (classification).
 
     Each datapoint is a 8x8 image of a digit.
@@ -431,6 +431,12 @@ def load_digits(n_class=10):
     n_class : integer, between 0 and 10, optional (default=10)
         The number of classes to return.
 
+    return_X_y : boolean, default=False.
+        If True, returns ``(data, target)`` instead of a Bunch object.
+        See below for more information about the `data` and `target` object.
+
+        .. versionadded:: 0.18
+
     Returns
     -------
     data : Bunch
@@ -440,6 +446,10 @@ def load_digits(n_class=10):
         sample, 'target_names', the meaning of the labels, and 'DESCR',
         the full description of the dataset.
 
+    (data, target) : tuple if ``return_X_y`` is True
+
+        .. versionadded:: 0.18
+
     Examples
     --------
     To load the data and visualize the images::
@@ -458,7 +468,7 @@ def load_digits(n_class=10):
                       delimiter=',')
     with open(join(module_path, 'descr', 'digits.rst')) as f:
         descr = f.read()
-    target = data[:, -1]
+    target = data[:, -1].astype(np.int)
     flat_data = data[:, :-1]
     images = flat_data.view()
     images.shape = (-1, 8, 8)
@@ -468,14 +478,17 @@ def load_digits(n_class=10):
         flat_data, target = flat_data[idx], target[idx]
         images = images[idx]
 
+    if return_X_y:
+        return flat_data, target
+
     return Bunch(data=flat_data,
-                 target=target.astype(np.int),
+                 target=target,
                  target_names=np.arange(10),
                  images=images,
                  DESCR=descr)
 
 
-def load_diabetes():
+def load_diabetes(return_X_y=False):
     """Load and return the diabetes dataset (regression).
 
     ==============      ==================
@@ -487,34 +500,62 @@ def load_diabetes():
 
     Read more in the :ref:`User Guide <datasets>`.
 
+    Parameters
+    ----------
+    return_X_y : boolean, default=False.
+        If True, returns ``(data, target)`` instead of a Bunch object.
+        See below for more information about the `data` and `target` object.
+
+        .. versionadded:: 0.18
+
     Returns
     -------
     data : Bunch
         Dictionary-like object, the interesting attributes are:
         'data', the data to learn and 'target', the regression target for each
         sample.
+
+    (data, target) : tuple if ``return_X_y`` is True
+
+        .. versionadded:: 0.18
     """
     base_dir = join(dirname(__file__), 'data')
     data = np.loadtxt(join(base_dir, 'diabetes_data.csv.gz'))
     target = np.loadtxt(join(base_dir, 'diabetes_target.csv.gz'))
+
+    if return_X_y:
+        return data, target
+
     return Bunch(data=data, target=target)
 
 
-def load_linnerud():
+def load_linnerud(return_X_y=False):
     """Load and return the linnerud dataset (multivariate regression).
 
     Samples total: 20
     Dimensionality: 3 for both data and targets
     Features: integer
     Targets: integer
 
+    Parameters
+    ----------
+    return_X_y : boolean, default=False.
+        If True, returns ``(data, target)`` instead of a Bunch object.
+        See below for more information about the `data` and `target` object.
+
+        .. versionadded:: 0.18
+
     Returns
     -------
     data : Bunch
         Dictionary-like object, the interesting attributes are: 'data' and
         'targets', the two multivariate datasets, with 'data' corresponding to
         the exercise and 'targets' corresponding to the physiological
         measurements, as well as 'feature_names' and 'target_names'.
+
+    (data, target) : tuple if ``return_X_y`` is True
+
+        .. versionadded:: 0.18
     """
     base_dir = join(dirname(__file__), 'data/')
     # Read data
@@ -529,13 +570,16 @@ def load_linnerud():
     with open(dirname(__file__) + '/descr/linnerud.rst') as f:
         descr = f.read()
 
+    if return_X_y:
+        return data_exercise, data_physiological
+
     return Bunch(data=data_exercise, feature_names=header_exercise,
                  target=data_physiological,
                  target_names=header_physiological,
                  DESCR=descr)
 
 
-def load_boston():
+def load_boston(return_X_y=False):
     """Load and return the boston house-prices dataset (regression).
 
     ==============     ==============
@@ -545,13 +589,25 @@ def load_boston():
     Targets             real 5. - 50.
     ==============     ==============
 
+    Parameters
+    ----------
+    return_X_y : boolean, default=False.
+        If True, returns ``(data, target)`` instead of a Bunch object.
+        See below for more information about the `data` and `target` object.
+
+        .. versionadded:: 0.18
+
     Returns
     -------
     data : Bunch
         Dictionary-like object, the interesting attributes are:
         'data', the data to learn, 'target', the regression targets,
         and 'DESCR', the full description of the dataset.
 
+    (data, target) : tuple if ``return_X_y`` is True
+
+        .. versionadded:: 0.18
+
     Examples
     --------
     >>> from sklearn.datasets import load_boston
@@ -580,6 +636,9 @@ def load_boston():
             data[i] = np.asarray(d[:-1], dtype=np.float64)
             target[i] = np.asarray(d[-1], dtype=np.float64)
 
+    if return_X_y:
+        return data, target
+
     return Bunch(data=data,
                  target=target,
                  # last column is target value
diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py
@@ -128,6 +128,13 @@ def test_load_digits():
     assert_equal(digits.data.shape, (1797, 64))
     assert_equal(numpy.unique(digits.target).size, 10)
 
+    # test return_X_y option
+    X_y_tuple = load_digits(return_X_y=True)
+    bunch = load_digits()
+    assert_true(isinstance(X_y_tuple, tuple))
+    assert_array_equal(X_y_tuple[0], bunch.data)
+    assert_array_equal(X_y_tuple[1], bunch.target)
+
 
 def test_load_digits_n_class_lt_10():
     digits = load_digits(9)
@@ -165,6 +172,13 @@ def test_load_diabetes():
     assert_equal(res.data.shape, (442, 10))
     assert_true(res.target.size, 442)
 
+    # test return_X_y option
+    X_y_tuple = load_diabetes(return_X_y=True)
+    bunch = load_diabetes()
+    assert_true(isinstance(X_y_tuple, tuple))
+    assert_array_equal(X_y_tuple[0], bunch.data)
+    assert_array_equal(X_y_tuple[1], bunch.target)
+
 
 def test_load_linnerud():
     res = load_linnerud()
@@ -173,6 +187,12 @@ def test_load_linnerud():
     assert_equal(len(res.target_names), 3)
     assert_true(res.DESCR)
 
+    # test return_X_y option
+    X_y_tuple = load_linnerud(return_X_y=True)
+    bunch = load_linnerud()
+    assert_true(isinstance(X_y_tuple, tuple))
+    assert_array_equal(X_y_tuple[0], bunch.data)
+    assert_array_equal(X_y_tuple[1], bunch.target)
 
 def test_load_iris():
     res = load_iris()
@@ -211,6 +231,12 @@ def test_load_boston():
     assert_equal(res.feature_names.size, 13)
     assert_true(res.DESCR)
 
+    # test return_X_y option
+    X_y_tuple = load_boston(return_X_y=True)
+    bunch = load_boston()
+    assert_true(isinstance(X_y_tuple, tuple))
+    assert_array_equal(X_y_tuple[0], bunch.data)
+    assert_array_equal(X_y_tuple[1], bunch.target)
 
 def test_loads_dumps_bunch():
     bunch = Bunch(x="x")