Skip to content

Commit

Permalink
merging
Browse files Browse the repository at this point in the history
  • Loading branch information
makbigc committed Jun 6, 2019
2 parents cf96e22 + 891a419 commit 8418a07
Show file tree
Hide file tree
Showing 95 changed files with 2,443 additions and 1,609 deletions.
2 changes: 2 additions & 0 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ def time_dict_rename_both_axes(self):


class Iteration:
# mem_itertuples_* benchmarks are slow
timeout = 120

def setup(self):
N = 1000
Expand Down
7 changes: 1 addition & 6 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from functools import partial
from itertools import product
from string import ascii_letters
import warnings

import numpy as np

from pandas import (
Categorical, DataFrame, MultiIndex, Series, TimeGrouper, Timestamp,
Categorical, DataFrame, MultiIndex, Series, Timestamp,
date_range, period_range)
import pandas.util.testing as tm

Expand Down Expand Up @@ -301,10 +300,6 @@ def setup(self):
def time_multi_size(self):
self.df.groupby(['key1', 'key2']).size()

def time_dt_timegrouper_size(self):
with warnings.catch_warnings(record=True):
self.df.groupby(TimeGrouper(key='dates', freq='M')).size()

def time_category_size(self):
self.draws.groupby(self.cats).size()

Expand Down
7 changes: 6 additions & 1 deletion asv_bench/benchmarks/index_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ def time_is_dates_only(self):

class Ops:

sample_time = 0.2
params = ['float', 'int']
param_names = ['dtype']

Expand Down Expand Up @@ -95,6 +94,12 @@ def time_min(self):
def time_min_trivial(self):
self.idx_inc.min()

def time_get_loc_inc(self):
self.idx_inc.get_loc(900000)

def time_get_loc_dec(self):
self.idx_dec.get_loc(100000)


class IndexAppend:

Expand Down
8 changes: 6 additions & 2 deletions asv_bench/benchmarks/io/parsers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import numpy as np

from pandas._libs.tslibs.parsing import (
_concat_date_cols, _does_string_look_like_datetime)
try:
from pandas._libs.tslibs.parsing import (
_concat_date_cols, _does_string_look_like_datetime)
except ImportError:
# Avoid whole benchmark suite import failure on asv (currently 0.4)
pass


class DoesStringLookLikeDatetime(object):
Expand Down
6 changes: 0 additions & 6 deletions asv_bench/benchmarks/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

class Methods:

sample_time = 0.2
params = (['DataFrame', 'Series'],
[10, 1000],
['int', 'float'],
Expand All @@ -23,7 +22,6 @@ def time_rolling(self, constructor, window, dtype, method):

class ExpandingMethods:

sample_time = 0.2
params = (['DataFrame', 'Series'],
['int', 'float'],
['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
Expand All @@ -41,7 +39,6 @@ def time_expanding(self, constructor, dtype, method):

class EWMMethods:

sample_time = 0.2
params = (['DataFrame', 'Series'],
[10, 1000],
['int', 'float'],
Expand All @@ -58,7 +55,6 @@ def time_ewm(self, constructor, window, dtype, method):


class VariableWindowMethods(Methods):
sample_time = 0.2
params = (['DataFrame', 'Series'],
['50s', '1h', '1d'],
['int', 'float'],
Expand All @@ -75,7 +71,6 @@ def setup(self, constructor, window, dtype, method):

class Pairwise:

sample_time = 0.2
params = ([10, 1000, None],
['corr', 'cov'],
[True, False])
Expand All @@ -95,7 +90,6 @@ def time_pairwise(self, window, method, pairwise):


class Quantile:
sample_time = 0.2
params = (['DataFrame', 'Series'],
[10, 1000],
['int', 'float'],
Expand Down
5 changes: 3 additions & 2 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
name: Windows
vmImage: vs2017-win2016

- job: 'Checks_and_doc'
- job: 'Checks'
pool:
vmImage: ubuntu-16.04
timeoutInMinutes: 90
Expand Down Expand Up @@ -97,10 +97,11 @@ jobs:
- script: |
export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
cd asv_bench
asv check -E existing
git remote add upstream https://github.com/pandas-dev/pandas.git
git fetch upstream
if git diff upstream/master --name-only | grep -q "^asv_bench/"; then
cd asv_bench
asv machine --yes
ASV_OUTPUT="$(asv dev)"
if [[ $(echo "$ASV_OUTPUT" | grep "failed") ]]; then
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-35-compat.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,5 @@ dependencies:
- pip
- pip:
# for python 3.5, pytest>=4.0.2 is not available in conda
- pytest>=4.0.2
- pytest==4.5.0
- html5lib==1.0b2
2 changes: 1 addition & 1 deletion ci/deps/azure-macos-35.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ dependencies:
- pip:
- python-dateutil==2.5.3
# universal
- pytest>=4.0.2
- pytest==4.5.0
- pytest-xdist
- pytest-mock
- hypothesis>=3.58.0
4 changes: 2 additions & 2 deletions ci/setup_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,12 @@ echo "conda list"
conda list

# Install DB for Linux
if [ ${TRAVIS_OS_NAME} == "linux" ]; then
if [ "${TRAVIS_OS_NAME}" == "linux" ]; then
echo "installing dbs"
mysql -e 'create database pandas_nosetest;'
psql -c 'create database pandas_nosetest;' -U postgres
else
echo "not using dbs on non-linux"
echo "not using dbs on non-linux Travis builds or Azure Pipelines"
fi

echo "done"
3 changes: 2 additions & 1 deletion doc/source/ecosystem.rst
Original file line number Diff line number Diff line change
Expand Up @@ -363,4 +363,5 @@ Library Accessor Classes
============== ========== =========================

.. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest
.. _pdvega: https://jakevdp.github.io/pdvega/
.. _pdvega: https://altair-viz.github.io/pdvega/

11 changes: 5 additions & 6 deletions doc/source/getting_started/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1455,9 +1455,8 @@ Iteration

The behavior of basic iteration over pandas objects depends on the type.
When iterating over a Series, it is regarded as array-like, and basic iteration
produces the values. Other data structures, like DataFrame,
follow the dict-like convention of iterating over the "keys" of the
objects.
produces the values. DataFrames follow the dict-like convention of iterating
over the "keys" of the objects.

In short, basic iteration (``for i in object``) produces:

Expand Down Expand Up @@ -1537,9 +1536,9 @@ For example:

.. ipython:: python
for item, frame in df.iteritems():
print(item)
print(frame)
for label, ser in df.iteritems():
print(label)
print(ser)
.. _basics.iterrows:

Expand Down
1 change: 0 additions & 1 deletion doc/source/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,6 @@ Optional Dependencies
`qtpy <https://github.com/spyder-ide/qtpy>`__ (requires PyQt or PySide),
`PyQt5 <https://www.riverbankcomputing.com/software/pyqt/download5>`__,
`PyQt4 <http://www.riverbankcomputing.com/software/pyqt/download>`__,
`pygtk <http://www.pygtk.org/>`__,
`xsel <http://www.vergenet.net/~conrad/software/xsel/>`__, or
`xclip <https://github.com/astrand/xclip/>`__: necessary to use
:func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation.
Expand Down
2 changes: 0 additions & 2 deletions doc/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ Conversion
:toctree: api/

DataFrame.astype
DataFrame.convert_objects
DataFrame.infer_objects
DataFrame.copy
DataFrame.isna
Expand Down Expand Up @@ -205,7 +204,6 @@ Reindexing / Selection / Label manipulation
DataFrame.rename_axis
DataFrame.reset_index
DataFrame.sample
DataFrame.select
DataFrame.set_axis
DataFrame.set_index
DataFrame.tail
Expand Down
2 changes: 0 additions & 2 deletions doc/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ Conversion

Series.astype
Series.infer_objects
Series.convert_objects
Series.copy
Series.bool
Series.to_numpy
Expand Down Expand Up @@ -212,7 +211,6 @@ Reindexing / Selection / Label manipulation
Series.rename_axis
Series.reset_index
Series.sample
Series.select
Series.set_axis
Series.take
Series.tail
Expand Down
74 changes: 61 additions & 13 deletions doc/source/user_guide/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,67 @@ For a grouped ``DataFrame``, you can rename in a similar manner:
'mean': 'bar',
'std': 'baz'}))
.. _groupby.aggregate.named:

Named Aggregation
~~~~~~~~~~~~~~~~~

.. versionadded:: 0.25.0

To support column-specific aggregation *with control over the output column names*, pandas
accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", where

- The keywords are the *output* column names
- The values are tuples whose first element is the column to select
and the second element is the aggregation to apply to that column. Pandas
provides the ``pandas.NamedAgg`` namedtuple with the fields ``['column', 'aggfunc']``
to make it clearer what the arguments are. As usual, the aggregation can
be a callable or a string alias.

.. ipython:: python
animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'],
'height': [9.1, 6.0, 9.5, 34.0],
'weight': [7.9, 7.5, 9.9, 198.0]})
animals
animals.groupby("kind").agg(
min_height=pd.NamedAgg(column='height', aggfunc='min'),
max_height=pd.NamedAgg(column='height', aggfunc='max'),
average_weight=pd.NamedAgg(column='height', aggfunc=np.mean),
)
``pandas.NamedAgg`` is just a ``namedtuple``. Plain tuples are allowed as well.

.. ipython:: python
animals.groupby("kind").agg(
min_height=('height', 'min'),
max_height=('height', 'max'),
average_weight=('height', np.mean),
)
If your desired output column names are not valid python keywords, construct a dictionary
and unpack the keyword arguments

.. ipython:: python
animals.groupby("kind").agg(**{
'total weight': pd.NamedAgg(column='weight', aggfunc=sum),
})
Additional keyword arguments are not passed through to the aggregation functions. Only pairs
of ``(column, aggfunc)`` should be passed as ``**kwargs``. If your aggregation function
requires additional arguments, partially apply them with :meth:`functools.partial`.

.. note::

   For Python 3.5 and earlier, the order of ``**kwargs`` in a function was not
   preserved. This means that the output column ordering would not be
   consistent. To ensure consistent ordering, the keys (and so output columns)
   will always be sorted for Python 3.5.

Applying different functions to DataFrame columns
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand All @@ -588,19 +649,6 @@ must be either implemented on GroupBy or available via :ref:`dispatching
grouped.agg({'C': 'sum', 'D': 'std'})
.. note::

If you pass a dict to ``aggregate``, the ordering of the output columns is
non-deterministic. If you want to be sure the output columns will be in a specific
order, you can use an ``OrderedDict``. Compare the output of the following two commands:

.. ipython:: python
from collections import OrderedDict
grouped.agg({'D': 'std', 'C': 'mean'})
grouped.agg(OrderedDict([('D', 'std'), ('C', 'mean')]))
.. _groupby.aggregate.cython:

Cython-optimized aggregation functions
Expand Down
2 changes: 1 addition & 1 deletion doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3272,7 +3272,7 @@ We can see that we got the same content back, which we had earlier written to th

.. note::

You may need to install xclip or xsel (with gtk, PyQt5, PyQt4 or qtpy) on Linux to use these methods.
You may need to install xclip or xsel (with PyQt5, PyQt4 or qtpy) on Linux to use these methods.

.. _io.pickle:

Expand Down
2 changes: 1 addition & 1 deletion doc/source/user_guide/sparse.rst
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ have no replacement.
Interaction with scipy.sparse
-----------------------------

Use :meth:`DataFrame.sparse.from_coo` to create a ``DataFrame`` with sparse values from a sparse matrix.
Use :meth:`DataFrame.sparse.from_spmatrix` to create a ``DataFrame`` with sparse values from a sparse matrix.

.. versionadded:: 0.25.0

Expand Down
10 changes: 10 additions & 0 deletions doc/source/user_guide/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,16 @@ and replacing any remaining whitespaces with underscores:
``.str`` methods which operate on elements of type ``list`` are not available on such a
``Series``.

.. _text.warn_types:

.. warning::

Before v.0.25.0, the ``.str``-accessor did only the most rudimentary type checks. Starting with
v.0.25.0, the type of the Series is inferred and the allowed types (i.e. strings) are enforced more rigorously.

Generally speaking, the ``.str`` accessor is intended to work only on strings. With very few
exceptions, other uses are not supported, and may be disabled at a later point.


Splitting and Replacing Strings
-------------------------------
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.16.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ Interaction with scipy.sparse
Added :meth:`SparseSeries.to_coo` and :meth:`SparseSeries.from_coo` methods (:issue:`8048`) for converting to and from ``scipy.sparse.coo_matrix`` instances (see :ref:`here <sparse.scipysparse>`). For example, given a SparseSeries with MultiIndex we can convert to a `scipy.sparse.coo_matrix` by specifying the row and column labels as index levels:

.. ipython:: python
:okwarning:
s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0),
Expand Down Expand Up @@ -121,6 +122,7 @@ The from_coo method is a convenience method for creating a ``SparseSeries``
from a ``scipy.sparse.coo_matrix``:

.. ipython:: python
:okwarning:
from scipy import sparse
A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.18.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,7 @@ used in the ``pandas`` implementation (:issue:`12644`, :issue:`12638`, :issue:`1
An example of this signature augmentation is illustrated below:

.. ipython:: python
:okwarning:
sp = pd.SparseDataFrame([1, 2, 3])
sp
Expand All @@ -409,6 +410,7 @@ Previous behaviour:
New behaviour:

.. ipython:: python
:okwarning:
np.cumsum(sp, axis=0)
Expand Down
Loading

0 comments on commit 8418a07

Please sign in to comment.