ENH: add fsspec support #34266
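
For context, a minimal sketch of the behaviour this pull request enables (not part of the diff shown on this page): paths carrying a protocol prefix are handed to fsspec, so any filesystem with an installed driver (s3fs, gcsfs, ...) can be read from and written to. Bucket and object names are placeholders, and the storage handling noted in the comments follows the commit messages below rather than a documented public signature.

# Illustrative sketch only; assumes fsspec plus s3fs/gcsfs are installed,
# and that the bucket/object names are placeholders.
import pandas as pd

# Reading: the s3:// URL is opened through fsspec rather than a hard-coded
# S3 code path, so other fsspec protocols (gcs://, abfs://, ...) work too.
df = pd.read_csv("s3://example-bucket/input.csv")

# Writing follows the same route; filesystem-specific configuration is
# carried internally as an explicit storage_options dict (see the
# "Make storage_options a dict rather than swallowing kwargs" commit).
df.to_parquet("s3://example-bucket/output.parquet")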

Merged
Changes from 1 commit
39 commits
94e717f
Add remote file io using fsspec.
Apr 14, 2020
fd7e072
Attempt refactor and clean
May 19, 2020
302ba13
Merge branch 'master' into feature/add-fsspec-support
May 20, 2020
9e6d3b2
readd and adapt s3/gcs tests
May 21, 2020
4564c8d
remove gc from test
May 21, 2020
0654537
Simpler is_fsspec
May 21, 2020
8d45cbb
add test
May 21, 2020
006e736
Answered most points
May 28, 2020
724ebd8
Implemented suggestions
May 28, 2020
9da1689
lint
May 28, 2020
a595411
Add versions info
May 29, 2020
6dd1e92
Update some deps
May 29, 2020
6e13df7
issue link syntax
May 29, 2020
3262063
More specific test versions
Jun 2, 2020
4bc2411
Account for alternate S3 protocols, and ignore type error
Jun 2, 2020
68644ab
Add comment to mypy ignore instruction
Jun 2, 2020
32bc586
more mypy
Jun 2, 2020
037ef2c
more black
Jun 2, 2020
c3c3075
Make storage_options a dict rather than swallowing kwargs
Jun 3, 2020
85d6452
More requested changes
Jun 5, 2020
263dd3b
Remove fsspec from locale tests
Jun 10, 2020
d0afbc3
tweak
Jun 10, 2020
6a587a5
Merge branch 'master' into feature/add-fsspec-support
Jun 10, 2020
b2992c1
Merge branch 'master' into feature/add-fsspec-support
Jun 11, 2020
9c03745
requested changes
Jun 11, 2020
7982e7b
add gcsfs to environment.yml
Jun 12, 2020
946297b
rerun deps script
Jun 12, 2020
145306e
Merge branch 'master' into feature/add-fsspec-support
Jun 12, 2020
06e5a3a
account for passed filesystem again
Jun 12, 2020
8f3854c
specify should_close
Jun 12, 2020
50c08c8
lint
Jun 12, 2020
9b20dc6
Except http passed to fsspec in parquet
Jun 12, 2020
eb90fe8
lint
Jun 12, 2020
b3e2cd2
Merge branch 'master' into feature/add-fsspec-support
Jun 16, 2020
4977a00
redo whatsnew
Jun 16, 2020
29a9785
simplify parquet write
Jun 18, 2020
565031b
Retry S3 file probe with timeout, in test_to_s3
Jun 18, 2020
606ce11
expand user in non-fsspec paths for parquet; add test for this
Jun 19, 2020
60b80a6
reorder imports!
Jun 19, 2020
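
Several of the commits above ("Simpler is_fsspec", "Account for alternate S3 protocols, and ignore type error", "Except http passed to fsspec in parquet") concern how pandas decides whether a path should be routed through fsspec. The following is a hedged sketch of that decision using a hypothetical name, not the PR's actual helper:

# Hypothetical helper name; the real check lives in the PR's io code.
def _looks_like_fsspec_url(url) -> bool:
    """Return True when a path should be handed to fsspec.

    Plain local paths and HTTP(S) URLs keep their existing code paths;
    anything else with a protocol prefix (s3://, gcs://, s3a://, ...) is
    treated as an fsspec URL.
    """
    return (
        isinstance(url, str)
        and "://" in url
        and not url.startswith(("http://", "https://"))
    )

assert _looks_like_fsspec_url("s3://bucket/key.parquet")
assert _looks_like_fsspec_url("s3a://bucket/key.parquet")  # alternate S3 protocol
assert not _looks_like_fsspec_url("https://example.com/data.csv")
assert not _looks_like_fsspec_url("/local/path/data.csv")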
Merge branch 'master' into feature/add-fsspec-support
Martin Durant committed May 20, 2020
commit 302ba1307c193aeccd1d1787b5725609fabb9d9c
24 changes: 21 additions & 3 deletions .travis.yml
@@ -14,6 +14,8 @@ cache:

env:
global:
# Variable for test workers
- PYTEST_WORKERS="auto"
# create a github personal access token
# cd pandas-dev/pandas
# travis encrypt 'PANDAS_GH_TOKEN=personal_access_token' -r pandas-dev/pandas
@@ -27,12 +29,21 @@ matrix:
fast_finish: true

include:
# In allowed failures
- dist: bionic
python: 3.9-dev
env:
- JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)"
- env:
- JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network and not clipboard)"

- env:
- JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network and not clipboard)"

- arch: arm64
env:
- JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)"

- env:
- JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1"
services:
@@ -53,11 +64,18 @@ matrix:
services:
- mysql
- postgresql
allow_failures:
- arch: arm64
env:
- JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)"
- dist: bionic
python: 3.9-dev
env:
- JOB="3.9-dev" PATTERN="(not slow and not network)"

before_install:
- echo "before_install"
# set non-blocking IO on travis
# https://github.com/travis-ci/travis-ci/issues/8920#issuecomment-352661024
# Use blocking IO on travis. Ref: https://github.com/travis-ci/travis-ci/issues/8920#issuecomment-352661024
- python -c 'import os,sys,fcntl; flags = fcntl.fcntl(sys.stdout, fcntl.F_GETFL); fcntl.fcntl(sys.stdout, fcntl.F_SETFL, flags&~os.O_NONBLOCK);'
- source ci/travis_process_gbq_encryption.sh
- export PATH="$HOME/miniconda3/bin:$PATH"
@@ -83,7 +101,7 @@ install:
script:
- echo "script start"
- echo "$JOB"
- source activate pandas-dev
- if [ "$JOB" != "3.9-dev" ]; then source activate pandas-dev; fi
- ci/run_tests.sh

after_script:
1 change: 1 addition & 0 deletions README.md
@@ -16,6 +16,7 @@
[![Downloads](https://anaconda.org/conda-forge/pandas/badges/downloads.svg)](https://pandas.pydata.org)
[![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/pydata/pandas)
[![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://numfocus.org)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

## What is it?

17 changes: 14 additions & 3 deletions asv_bench/benchmarks/algorithms.py
@@ -34,7 +34,16 @@ class Factorize:
params = [
[True, False],
[True, False],
["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
[
"int",
"uint",
"float",
"string",
"datetime64[ns]",
"datetime64[ns, tz]",
"Int64",
"boolean",
],
]
param_names = ["unique", "sort", "dtype"]

@@ -49,13 +58,15 @@ def setup(self, unique, sort, dtype):
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
"Int64": pd.array(np.arange(N), dtype="Int64"),
"boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
}[dtype]
if not unique:
data = data.repeat(5)
self.idx = data
self.data = data

def time_factorize(self, unique, sort, dtype):
self.idx.factorize(sort=sort)
pd.factorize(self.data, sort=sort)


class Duplicated:
63 changes: 60 additions & 3 deletions asv_bench/benchmarks/arithmetic.py
@@ -67,7 +67,7 @@ def time_series_op_with_fill_value_no_nas(self):
self.ser.add(self.ser, fill_value=4)


class MixedFrameWithSeriesAxis0:
class MixedFrameWithSeriesAxis:
params = [
[
"eq",
@@ -78,7 +78,7 @@ class MixedFrameWithSeriesAxis0:
"gt",
"add",
"sub",
"div",
"truediv",
"floordiv",
"mul",
"pow",
@@ -87,15 +87,72 @@
param_names = ["opname"]

def setup(self, opname):
arr = np.arange(10 ** 6).reshape(100, -1)
arr = np.arange(10 ** 6).reshape(1000, -1)
df = DataFrame(arr)
df["C"] = 1.0
self.df = df
self.ser = df[0]
self.row = df.iloc[0]

def time_frame_op_with_series_axis0(self, opname):
getattr(self.df, opname)(self.ser, axis=0)

def time_frame_op_with_series_axis1(self, opname):
getattr(operator, opname)(self.df, self.ser)


class FrameWithFrameWide:
# Many-columns, mixed dtypes

params = [
[
# GH#32779 has discussion of which operators are included here
operator.add,
operator.floordiv,
operator.gt,
]
]
param_names = ["op"]

def setup(self, op):
# we choose dtypes so as to make the blocks
# a) not perfectly match between right and left
# b) appreciably bigger than single columns
n_cols = 2000
n_rows = 500

# construct dataframe with 2 blocks
arr1 = np.random.randn(n_rows, int(n_cols / 2)).astype("f8")
arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("f4")
df = pd.concat(
[pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True,
)
# should already be the case, but just to be sure
df._consolidate_inplace()

# TODO: GH#33198 the setting here shouldn't need two steps
arr1 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8")
arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("i8")
arr3 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8")
df2 = pd.concat(
[pd.DataFrame(arr1), pd.DataFrame(arr2), pd.DataFrame(arr3)],
axis=1,
ignore_index=True,
)
# should already be the case, but just to be sure
df2._consolidate_inplace()

self.left = df
self.right = df2

def time_op_different_blocks(self, op):
# blocks (and dtypes) are not aligned
op(self.left, self.right)

def time_op_same_blocks(self, op):
# blocks (and dtypes) are aligned
op(self.left, self.left)


class Ops:

2 changes: 1 addition & 1 deletion asv_bench/benchmarks/frame_methods.py
@@ -564,7 +564,7 @@ def setup(self):

def time_frame_get_dtype_counts(self):
with warnings.catch_warnings(record=True):
self.df._data.get_dtype_counts()
self.df.dtypes.value_counts()

def time_info(self):
self.df.info()
92 changes: 92 additions & 0 deletions asv_bench/benchmarks/groupby.py
@@ -626,4 +626,96 @@ def time_first(self):
self.df_nans.groupby("key").transform("first")


class TransformEngine:
def setup(self):
N = 10 ** 3
data = DataFrame(
{0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N},
columns=[0, 1],
)
self.grouper = data.groupby(0)

def time_series_numba(self):
def function(values, index):
return values * 5

self.grouper[1].transform(function, engine="numba")

def time_series_cython(self):
def function(values):
return values * 5

self.grouper[1].transform(function, engine="cython")

def time_dataframe_numba(self):
def function(values, index):
return values * 5

self.grouper.transform(function, engine="numba")

def time_dataframe_cython(self):
def function(values):
return values * 5

self.grouper.transform(function, engine="cython")


class AggEngine:
def setup(self):
N = 10 ** 3
data = DataFrame(
{0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N},
columns=[0, 1],
)
self.grouper = data.groupby(0)

def time_series_numba(self):
def function(values, index):
total = 0
for i, value in enumerate(values):
if i % 2:
total += value + 5
else:
total += value * 2
return total

self.grouper[1].agg(function, engine="numba")

def time_series_cython(self):
def function(values):
total = 0
for i, value in enumerate(values):
if i % 2:
total += value + 5
else:
total += value * 2
return total

self.grouper[1].agg(function, engine="cython")

def time_dataframe_numba(self):
def function(values, index):
total = 0
for i, value in enumerate(values):
if i % 2:
total += value + 5
else:
total += value * 2
return total

self.grouper.agg(function, engine="numba")

def time_dataframe_cython(self):
def function(values):
total = 0
for i, value in enumerate(values):
if i % 2:
total += value + 5
else:
total += value * 2
return total

self.grouper.agg(function, engine="cython")


from .pandas_vb_common import setup # noqa: F401 isort:skip
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/io/parsers.py
@@ -2,7 +2,7 @@

try:
from pandas._libs.tslibs.parsing import (
_concat_date_cols,
concat_date_cols,
_does_string_look_like_datetime,
)
except ImportError:
@@ -39,4 +39,4 @@ def setup(self, value, dim):
)

def time_check_concat(self, value, dim):
_concat_date_cols(self.object)
concat_date_cols(self.object)
25 changes: 12 additions & 13 deletions asv_bench/benchmarks/rolling.py
@@ -150,19 +150,18 @@ def time_quantile(self, constructor, window, dtype, percentile, interpolation):
self.roll.quantile(percentile, interpolation=interpolation)


class PeakMemFixed:
def setup(self):
N = 10
arr = 100 * np.random.random(N)
self.roll = pd.Series(arr).rolling(10)

def peakmem_fixed(self):
# GH 25926
# This is to detect memory leaks in rolling operations.
# To save time this is only ran on one method.
# 6000 iterations is enough for most types of leaks to be detected
for x in range(6000):
self.roll.max()
class PeakMemFixedWindowMinMax:

params = ["min", "max"]

def setup(self, operation):
N = int(1e6)
arr = np.random.random(N)
self.roll = pd.Series(arr).rolling(2)

def peakmem_fixed(self, operation):
for x in range(5):
getattr(self.roll, operation)()


class ForwardWindowMethods:
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/stat_ops.py
@@ -11,8 +11,8 @@ class FrameOps:
param_names = ["op", "dtype", "axis"]

def setup(self, op, dtype, axis):
if op == "mad" and dtype == "Int64" and axis == 1:
# GH-33036
if op == "mad" and dtype == "Int64":
# GH-33036, GH#33600
raise NotImplementedError
values = np.random.randn(100000, 4)
if dtype == "Int64":
3 changes: 3 additions & 0 deletions azure-pipelines.yml
@@ -5,6 +5,9 @@ trigger:
pr:
- master

variables:
PYTEST_WORKERS: auto

jobs:
# Mac and Linux use the same template
- template: ci/azure/posix.yml
21 changes: 21 additions & 0 deletions ci/build39.sh
@@ -0,0 +1,21 @@
#!/bin/bash -e
# Special build for python3.9 until numpy puts its own wheels up

sudo apt-get install build-essential gcc xvfb
pip install --no-deps -U pip wheel setuptools
pip install python-dateutil pytz pytest pytest-xdist hypothesis
pip install cython --pre # https://github.com/cython/cython/issues/3395

git clone https://github.com/numpy/numpy
cd numpy
python setup.py build_ext --inplace
python setup.py install
cd ..
rm -rf numpy

python setup.py build_ext -inplace
python -m pip install --no-build-isolation -e .

python -c "import sys; print(sys.version_info)"
python -c "import pandas as pd"
python -c "import hypothesis"