
Commit

Merge branch 'master' of https://github.com/pandas-dev/pandas into date_range_overflow
jbrockmendel committed Oct 26, 2018
2 parents 4618398 + 437f31c commit 4c4694d
Showing 98 changed files with 2,743 additions and 1,020 deletions.
28 changes: 20 additions & 8 deletions asv_bench/benchmarks/indexing_engines.py
@@ -1,18 +1,30 @@
import numpy as np

from pandas._libs.index import (Int64Engine, UInt64Engine, Float64Engine,
ObjectEngine)
from pandas._libs import index as libindex


def _get_numeric_engines():
engine_names = [
('Int64Engine', np.int64), ('Int32Engine', np.int32),
('Int16Engine', np.int16), ('Int8Engine', np.int8),
('UInt64Engine', np.uint64), ('UInt32Engine', np.uint32),
('UInt16Engine', np.uint16), ('UInt8Engine', np.uint8),
('Float64Engine', np.float64), ('Float32Engine', np.float32),
]
return [(getattr(libindex, engine_name), dtype)
for engine_name, dtype in engine_names
if hasattr(libindex, engine_name)]


class NumericEngineIndexing(object):

params = [[Int64Engine, UInt64Engine, Float64Engine],
[np.int64, np.uint64, np.float64],
params = [_get_numeric_engines(),
['monotonic_incr', 'monotonic_decr', 'non_monotonic'],
]
param_names = ['engine', 'dtype', 'index_type']
param_names = ['engine_and_dtype', 'index_type']

def setup(self, engine, dtype, index_type):
def setup(self, engine_and_dtype, index_type):
engine, dtype = engine_and_dtype
N = 10**5
values = list([1] * N + [2] * N + [3] * N)
arr = {
@@ -26,7 +38,7 @@ def setup(self, engine, dtype, index_type):
# code below avoids populating the mapping etc. while timing.
self.data.get_loc(2)

def time_get_loc(self, engine, dtype, index_type):
def time_get_loc(self, engine_and_dtype, index_type):
self.data.get_loc(2)
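
The tuple-valued ``params`` above are unpacked inside ``setup``, so each benchmark case receives a concrete ``(engine, dtype)`` pair. A minimal sketch of the same ``hasattr``-guarded discovery, runnable outside asv (engine names and the vgetter/length constructor signature are taken from this file; which engines exist depends on the pandas build):

import numpy as np
from pandas._libs import index as libindex

# Probe for the engines this commit adds; the hasattr guard keeps the
# script working on builds that lack them.
for name, dtype in [('Int8Engine', np.int8), ('UInt16Engine', np.uint16),
                    ('Float32Engine', np.float32)]:
    if hasattr(libindex, name):
        arr = np.arange(10, dtype=dtype)
        engine = getattr(libindex, name)(lambda arr=arr: arr, len(arr))
        print(name, engine.get_loc(3))  # expect 3 for a monotonic range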


@@ -44,7 +56,7 @@ def setup(self, index_type):
'non_monotonic': np.array(list('abc') * N, dtype=object),
}[index_type]

self.data = ObjectEngine(lambda: arr, len(arr))
self.data = libindex.ObjectEngine(lambda: arr, len(arr))
# code below avoids populating the mapping etc. while timing.
self.data.get_loc('b')

16 changes: 16 additions & 0 deletions doc/source/api.rst
@@ -851,6 +851,22 @@ Sparse
SparseSeries.to_coo
SparseSeries.from_coo

.. autosummary::
:toctree: generated/
:template: autosummary/accessor_attribute.rst

Series.sparse.npoints
Series.sparse.density
Series.sparse.fill_value
Series.sparse.sp_values


.. autosummary::
:toctree: generated/

Series.sparse.from_coo
Series.sparse.to_coo
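
As a quick illustration of the accessor attributes and methods listed here (a sketch; the ``Sparse[int]`` dtype string follows the sparse docs, and the outputs noted in comments are indicative):

.. ipython:: python

s = pd.Series([0, 0, 1, 2], dtype="Sparse[int]")
s.sparse.npoints  # entries not equal to fill_value -> 2
s.sparse.density  # npoints / len(s) -> 0.5
s.sparse.sp_values
s.sparse.fill_value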

.. _api.dataframe:

DataFrame
10 changes: 5 additions & 5 deletions doc/source/contributing.rst
@@ -657,12 +657,12 @@ Testing With Continuous Integration
-----------------------------------

The *pandas* test suite will run automatically on `Travis-CI <https://travis-ci.org/>`__,
`Appveyor <https://www.appveyor.com/>`__, and `Circle CI <https://circleci.com/>`__ continuous integration
services, once your pull request is submitted.
`Azure Pipelines <https://azure.microsoft.com/en-us/services/devops/pipelines/>`__,
and `Circle CI <https://circleci.com/>`__ continuous integration services, once your pull request is submitted.
However, if you wish to run the test suite on a branch prior to submitting the pull request,
then the continuous integration services need to be hooked to your GitHub repository. Instructions are here
for `Travis-CI <http://about.travis-ci.org/docs/user/getting-started/>`__,
`Appveyor <https://www.appveyor.com/docs/>`__ , and `CircleCI <https://circleci.com/>`__.
`Azure Pipelines <https://docs.microsoft.com/en-us/azure/devops/pipelines/>`__, and `CircleCI <https://circleci.com/>`__.

A pull-request will be considered for merging when you have an all 'green' build. If any tests are failing,
then you will get a red 'X', where you can click through to see the individual failed tests.
@@ -672,8 +672,8 @@ This is an example of a green build.

.. note::

Each time you push to *your* fork, a *new* run of the tests will be triggered on the CI. Appveyor will auto-cancel
any non-currently-running tests for that same pull-request. You can enable the auto-cancel feature for
Each time you push to *your* fork, a *new* run of the tests will be triggered on the CI.
You can enable the auto-cancel feature, which cancels any queued (not-yet-running) runs for that same pull-request, for
`Travis-CI here <https://docs.travis-ci.com/user/customizing-the-build/#Building-only-the-latest-commit>`__ and
for `CircleCI here <https://circleci.com/changelog-legacy/#option-to-auto-cancel-redundant-builds>`__.

20 changes: 20 additions & 0 deletions doc/source/sparse.rst
@@ -62,6 +62,26 @@ Any sparse object can be converted back to the standard dense form by calling
sts.to_dense()
.. _sparse.accessor:

Sparse Accessor
---------------

.. versionadded:: 0.24.0

Pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat``
for categorical data, and ``.dt`` for datetime-like data. This namespace provides
attributes and methods that are specific to sparse data.

.. ipython:: python

s = pd.Series([0, 0, 1, 2], dtype="Sparse[int]")
s.sparse.density
s.sparse.fill_value

This accessor is available only on data with ``SparseDtype``, and on the :class:`Series`
class itself for creating a Series with sparse data from a scipy COO matrix.
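
A hedged sketch of the COO construction mentioned above (assumes scipy is installed; data and shape are illustrative):

.. ipython:: python

from scipy import sparse
A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4))
ss = pd.Series.sparse.from_coo(A)  # MultiIndex of (row, column)
ss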

.. _sparse.array:

SparseArray
46 changes: 32 additions & 14 deletions doc/source/whatsnew/v0.24.0.txt
@@ -145,33 +145,41 @@ Current Behavior:

.. _whatsnew_0240.enhancements.interval:

Storing Interval Data in Series and DataFrame
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Storing Interval and Period Data in Series and DataFrame
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Interval data may now be stored in a ``Series`` or ``DataFrame``, in addition to an
:class:`IntervalIndex` like previously (:issue:`19453`).
Interval and Period data may now be stored in a ``Series`` or ``DataFrame``, in addition to an
:class:`IntervalIndex` and :class:`PeriodIndex` like previously (:issue:`19453`, :issue:`22862`).

.. ipython:: python

ser = pd.Series(pd.interval_range(0, 5))
ser
ser.dtype

Previously, these would be cast to a NumPy array of ``Interval`` objects. In general,
this should result in better performance when storing an array of intervals in
a :class:`Series`.
And for periods:

.. ipython:: python

pser = pd.Series(pd.period_range("2000", freq="D", periods=5))
pser
pser.dtype

Previously, these would be cast to a NumPy array with object dtype. In general,
this should result in better performance when storing an array of intervals or periods
in a :class:`Series` or column of a :class:`DataFrame`.

Note that the ``.values`` of a ``Series`` containing intervals is no longer a NumPy
Note that the ``.values`` of a ``Series`` containing one of these types is no longer a NumPy
array, but rather an ``ExtensionArray``:

.. ipython:: python

ser.values
pser.values

This is the same behavior as ``Series.values`` for categorical data. See
:ref:`whatsnew_0240.api_breaking.interval_values` for more.
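
Where a plain NumPy array is needed, ``np.asarray`` converts these extension arrays to object dtype (continuing the examples above; a sketch):

.. ipython:: python

np.asarray(ser.values)  # object-dtype ndarray of Interval objects
np.asarray(pser.values)  # object-dtype ndarray of Period objects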


.. _whatsnew_0240.enhancements.other:

Other Enhancements
@@ -360,7 +368,7 @@ New Behavior:
This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``.

For situations where you need an ``ndarray`` of ``Interval`` objects, use
:meth:`numpy.asarray` or ``idx.astype(object)``.
:meth:`numpy.asarray`.

.. ipython:: python

@@ -524,14 +532,20 @@ changes were made:
- ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer support combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray.
- Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed.


Some new warnings are issued for operations that require or are likely to materialize a large dense array:

- A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array.
- A :class:`errors.PerformanceWarning` is now issued when concatenating sparse Series with differing fill values. The fill value from the first sparse array continues to be used.
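
A sketch of the first warning above (the ``Sparse[float]`` dtype string and the exact warning site are assumptions based on this release's sparse changes):

.. ipython:: python

s = pd.Series([0.0, np.nan, 1.0, np.nan], dtype="Sparse[float]")
s.fillna(method="ffill")  # builds a dense array, so expect a PerformanceWarning
s.fillna(value=0.0)  # filling with a value stays efficient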

In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made <whatsnew_0240.bug_fixes.sparse>`.

Finally, a ``Series.sparse`` accessor was added to provide sparse-specific methods like :meth:`Series.sparse.from_coo`.

.. ipython:: python

s = pd.Series([0, 0, 1, 1, 1], dtype='Sparse[int]')
s.sparse.density

.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient:

Raise ValueError in ``DataFrame.to_dict(orient='index')``
@@ -810,6 +824,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`).
- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`)
- :meth:`Series.unstack` no longer converts extension arrays to object-dtype ndarrays. The output ``DataFrame`` will now have the same dtype as the input. This changes behavior for Categorical and Sparse data (:issue:`23077`).

.. _whatsnew_0240.api.incompatibilities:

@@ -939,9 +954,11 @@ Removal of prior version deprecations/changes
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`,
both when indexing by label (using .loc) and position(.iloc).
Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
- Slicing Series and DataFrames with a monotonically increasing :class:`CategoricalIndex`
is now very fast, with speed comparable to slicing with an ``Int64Index``.
The speed increase applies both when indexing by label (using ``.loc``) and by position (``.iloc``) (:issue:`20395`).
Slicing a monotonically increasing :class:`CategoricalIndex` itself (i.e. ``ci[1000:2000]``)
shows similar speed improvements (:issue:`21659`)
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)
@@ -1003,6 +1020,7 @@ Datetimelike
- Bug in :func:`to_datetime` with an :class:`Index` argument that would drop the ``name`` from the result (:issue:`21697`)
- Bug in :class:`PeriodIndex` where adding or subtracting a :class:`timedelta` or :class:`Tick` object produced incorrect results (:issue:`22988`)
- Bug in :func:`date_range` when decrementing a start date to a past end date by a negative frequency (:issue:`23270`)
- Bug in :func:`DataFrame.combine` with datetimelike values raising a TypeError (:issue:`23079`)
- Bug in :func:`date_range` with frequency of ``Day`` or higher where dates sufficiently far in the future could wrap around to the past instead of raising ``OutOfBoundsDatetime`` (:issue:`19740`)

Timedelta
24 changes: 22 additions & 2 deletions pandas/_libs/algos.pyx
@@ -10,7 +10,8 @@ from libc.math cimport fabs, sqrt
import numpy as np
cimport numpy as cnp
from numpy cimport (ndarray,
NPY_INT64, NPY_UINT64, NPY_INT32, NPY_INT16, NPY_INT8,
NPY_INT64, NPY_INT32, NPY_INT16, NPY_INT8,
NPY_UINT64, NPY_UINT32, NPY_UINT16, NPY_UINT8,
NPY_FLOAT32, NPY_FLOAT64,
NPY_OBJECT,
int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
@@ -359,9 +360,13 @@ ctypedef fused algos_t:
float64_t
float32_t
object
int32_t
int64_t
int32_t
int16_t
int8_t
uint64_t
uint32_t
uint16_t
uint8_t


@@ -459,7 +464,12 @@ pad_float32 = pad["float32_t"]
pad_object = pad["object"]
pad_int64 = pad["int64_t"]
pad_int32 = pad["int32_t"]
pad_int16 = pad["int16_t"]
pad_int8 = pad["int8_t"]
pad_uint64 = pad["uint64_t"]
pad_uint32 = pad["uint32_t"]
pad_uint16 = pad["uint16_t"]
pad_uint8 = pad["uint8_t"]
pad_bool = pad["uint8_t"]
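
These dtype-specialized aliases behave like the fused base function. A sketch of the new uint16 variant, assuming ``pad`` keeps its usual reindexing contract (both inputs sorted; returns forward-fill positions into ``old``, with -1 before the first match):

import numpy as np
from pandas._libs import algos

old = np.array([1, 5, 10], dtype=np.uint16)
new = np.array([0, 1, 2, 5, 7], dtype=np.uint16)
# position in `old` supplying each `new` label
algos.pad_uint16(old, new)  # expected: array([-1, 0, 0, 1, 1])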


@@ -653,7 +663,12 @@ backfill_float32 = backfill["float32_t"]
backfill_object = backfill["object"]
backfill_int64 = backfill["int64_t"]
backfill_int32 = backfill["int32_t"]
backfill_int16 = backfill["int16_t"]
backfill_int8 = backfill["int8_t"]
backfill_uint64 = backfill["uint64_t"]
backfill_uint32 = backfill["uint32_t"]
backfill_uint16 = backfill["uint16_t"]
backfill_uint8 = backfill["uint8_t"]
backfill_bool = backfill["uint8_t"]


@@ -866,7 +881,12 @@ is_monotonic_float32 = is_monotonic["float32_t"]
is_monotonic_object = is_monotonic["object"]
is_monotonic_int64 = is_monotonic["int64_t"]
is_monotonic_int32 = is_monotonic["int32_t"]
is_monotonic_int16 = is_monotonic["int16_t"]
is_monotonic_int8 = is_monotonic["int8_t"]
is_monotonic_uint64 = is_monotonic["uint64_t"]
is_monotonic_uint32 = is_monotonic["uint32_t"]
is_monotonic_uint16 = is_monotonic["uint16_t"]
is_monotonic_uint8 = is_monotonic["uint8_t"]
is_monotonic_bool = is_monotonic["uint8_t"]
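
Likewise for the new ``is_monotonic`` specializations; a sketch assuming the fused function's (increasing, decreasing, strictly-monotonic) three-tuple return and its ``timelike`` flag:

import numpy as np
from pandas._libs import algos

arr = np.array([1, 2, 2, 3], dtype=np.uint8)
# assumed return: (is_increasing, is_decreasing, is_strictly_monotonic)
algos.is_monotonic_uint8(arr, False)  # expected: (True, False, False)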


3 changes: 3 additions & 0 deletions pandas/_libs/algos_common_helper.pxi.in
@@ -133,6 +133,9 @@ dtypes = [('float64', 'FLOAT64', 'float64'),
('int16', 'INT16', 'int16'),
('int32', 'INT32', 'int32'),
('int64', 'INT64', 'int64'),
('uint8', 'UINT8', 'uint8'),
('uint16', 'UINT16', 'uint16'),
('uint32', 'UINT32', 'uint32'),
('uint64', 'UINT64', 'uint64'),
# ('platform_int', 'INT', 'int_'),
# ('object', 'OBJECT', 'object_'),
6 changes: 4 additions & 2 deletions pandas/_libs/index.pyx
@@ -5,8 +5,10 @@ import cython

import numpy as np
cimport numpy as cnp
from numpy cimport (ndarray, float64_t, int32_t,
int64_t, uint8_t, uint64_t, intp_t,
from numpy cimport (ndarray, intp_t,
float64_t, float32_t,
int64_t, int32_t, int16_t, int8_t,
uint64_t, uint32_t, uint16_t, uint8_t,
# Note: NPY_DATETIME, NPY_TIMEDELTA are only available
# for cimport in cython>=0.27.3
NPY_DATETIME, NPY_TIMEDELTA)
35 changes: 22 additions & 13 deletions pandas/_libs/index_class_helper.pxi.in
@@ -10,14 +10,22 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in

{{py:

# name, dtype, ctype
dtypes = [('Float64', 'float64', 'float64_t'),
('UInt64', 'uint64', 'uint64_t'),
('Int64', 'int64', 'int64_t'),
('Object', 'object', 'object')]
# name, dtype, ctype, hashtable_name, hashtable_dtype
dtypes = [('Float64', 'float64', 'float64_t', 'Float64', 'float64'),
('Float32', 'float32', 'float32_t', 'Float64', 'float64'),
('Int64', 'int64', 'int64_t', 'Int64', 'int64'),
('Int32', 'int32', 'int32_t', 'Int64', 'int64'),
('Int16', 'int16', 'int16_t', 'Int64', 'int64'),
('Int8', 'int8', 'int8_t', 'Int64', 'int64'),
('UInt64', 'uint64', 'uint64_t', 'UInt64', 'uint64'),
('UInt32', 'uint32', 'uint32_t', 'UInt64', 'uint64'),
('UInt16', 'uint16', 'uint16_t', 'UInt64', 'uint64'),
('UInt8', 'uint8', 'uint8_t', 'UInt64', 'uint64'),
('Object', 'object', 'object', 'PyObject', 'object'),
]
}}

{{for name, dtype, ctype in dtypes}}
{{for name, dtype, ctype, hashtable_name, hashtable_dtype in dtypes}}


cdef class {{name}}Engine(IndexEngine):
@@ -34,13 +42,9 @@ cdef class {{name}}Engine(IndexEngine):
other, limit=limit)

cdef _make_hash_table(self, n):
{{if name == 'Object'}}
return _hash.PyObjectHashTable(n)
{{else}}
return _hash.{{name}}HashTable(n)
{{endif}}
return _hash.{{hashtable_name}}HashTable(n)

{{if name != 'Float64' and name != 'Object'}}
{{if name not in {'Float64', 'Float32', 'Object'} }}
cdef _check_type(self, object val):
hash(val)
if util.is_bool_object(val):
@@ -50,6 +54,11 @@ cdef class {{name}}Engine(IndexEngine):
{{endif}}

{{if name != 'Object'}}
cpdef _call_map_locations(self, values):
# self.mapping is of type {{hashtable_name}}HashTable,
# so convert dtype of values
self.mapping.map_locations(algos.ensure_{{hashtable_dtype}}(values))

cdef _get_index_values(self):
return algos.ensure_{{dtype}}(self.vgetter())

@@ -60,7 +69,7 @@ cdef class {{name}}Engine(IndexEngine):
ndarray[{{ctype}}] values
int count = 0

{{if name != 'Float64'}}
{{if name not in {'Float64', 'Float32'} }}
if not util.is_integer_object(val):
raise KeyError(val)
{{endif}}
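
The net effect of the template: each narrow-dtype engine keeps its native values but borrows the corresponding 64-bit (or PyObject) hash table, upcasting through ``algos.ensure_*`` in ``_call_map_locations``. A sketch of the resulting behavior (constructor signature as used in the benchmark file above):

import numpy as np
from pandas._libs import index as libindex

arr = np.array([10, 20, 30], dtype=np.int8)
# Int8Engine hashes through an Int64HashTable after ensure_int64 upcasting
engine = libindex.Int8Engine(lambda: arr, len(arr))
engine.get_loc(20)  # expected: 1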