Commit

Merge remote-tracking branch 'upstream/main' into ci/windows/xdist
mroeschke committed Jul 7, 2023
2 parents 719df90 + d32d025 commit fecb99a
Showing 101 changed files with 1,333 additions and 608 deletions.
10 changes: 9 additions & 1 deletion .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
@@ -58,7 +58,15 @@ jobs:
# Also install zh_CN (its encoding is gb2312) but do not activate it.
# It will be temporarily activated during tests with locale.setlocale
extra_loc: "zh_CN"
- name: "Copy-on-Write"
- name: "Copy-on-Write 3.9"
env_file: actions-39.yaml
pattern: "not slow and not network and not single_cpu"
pandas_copy_on_write: "1"
- name: "Copy-on-Write 3.10"
env_file: actions-310.yaml
pattern: "not slow and not network and not single_cpu"
pandas_copy_on_write: "1"
- name: "Copy-on-Write 3.11"
env_file: actions-311.yaml
pattern: "not slow and not network and not single_cpu"
pandas_copy_on_write: "1"
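These new matrix entries run the suite with Copy-on-Write enabled via `PANDAS_COPY_ON_WRITE=1`. A minimal sketch of what that mode changes, assuming pandas >= 2.0 (on 3.x the option is gone and CoW is always on):

```python
import pandas as pd

try:
    # Opt in to Copy-on-Write (the behavior these CI jobs enable via
    # the PANDAS_COPY_ON_WRITE environment variable).
    pd.set_option("mode.copy_on_write", True)
except Exception:
    # On pandas 3.x the option no longer exists: CoW is the only behavior.
    pass

df = pd.DataFrame({"a": [1, 2, 3]})
view = df["a"]
view.iloc[0] = 99  # under CoW this modifies only `view`, never the parent `df`
```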
22 changes: 5 additions & 17 deletions .pre-commit-config.yaml
@@ -15,18 +15,11 @@ default_stages: [
ci:
autofix_prs: false
repos:
- repo: local
- repo: https://github.com/hauntsaninja/black-pre-commit-mirror
# black compiled with mypyc
rev: 23.3.0
hooks:
# NOTE: we make `black` a local hook because if it's installed from
# PyPI (rather than from source) then it'll run twice as fast thanks to mypyc
- id: black
name: black
description: "Black: The uncompromising Python code formatter"
entry: black
language: python
require_serial: true
types_or: [python, pyi]
additional_dependencies: [black==23.3.0]
- id: black
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.0.270
hooks:
@@ -74,7 +74,7 @@ repos:
--linelength=88,
'--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size'
]
- repo: https://github.com/pycqa/pylint
- repo: https://github.com/pylint-dev/pylint
rev: v3.0.0a6
hooks:
- id: pylint
@@ -93,11 +86,6 @@
|^pandas/conftest\.py # keep excluded
args: [--disable=all, --enable=redefined-outer-name]
stages: [manual]
- id: pylint
alias: unspecified-encoding
name: Using open without explicitly specifying an encoding
args: [--disable=all, --enable=unspecified-encoding]
stages: [manual]
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
hooks:
25 changes: 0 additions & 25 deletions ci/code_checks.sh
@@ -110,31 +110,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
pandas_object \
pandas.api.interchange.from_dataframe \
pandas.DatetimeIndex.snap \
pandas.core.window.rolling.Rolling.max \
pandas.core.window.rolling.Rolling.cov \
pandas.core.window.rolling.Rolling.skew \
pandas.core.window.rolling.Rolling.apply \
pandas.core.window.rolling.Window.mean \
pandas.core.window.rolling.Window.sum \
pandas.core.window.rolling.Window.var \
pandas.core.window.rolling.Window.std \
pandas.core.window.expanding.Expanding.count \
pandas.core.window.expanding.Expanding.sum \
pandas.core.window.expanding.Expanding.mean \
pandas.core.window.expanding.Expanding.median \
pandas.core.window.expanding.Expanding.min \
pandas.core.window.expanding.Expanding.max \
pandas.core.window.expanding.Expanding.corr \
pandas.core.window.expanding.Expanding.cov \
pandas.core.window.expanding.Expanding.skew \
pandas.core.window.expanding.Expanding.apply \
pandas.core.window.expanding.Expanding.quantile \
pandas.core.window.ewm.ExponentialMovingWindow.mean \
pandas.core.window.ewm.ExponentialMovingWindow.sum \
pandas.core.window.ewm.ExponentialMovingWindow.std \
pandas.core.window.ewm.ExponentialMovingWindow.var \
pandas.core.window.ewm.ExponentialMovingWindow.corr \
pandas.core.window.ewm.ExponentialMovingWindow.cov \
pandas.api.indexers.BaseIndexer \
pandas.api.indexers.VariableOffsetWindowIndexer \
pandas.io.formats.style.Styler \
2 changes: 1 addition & 1 deletion doc/source/conf.py
@@ -240,7 +240,7 @@
"footer_start": ["pandas_footer", "sphinx-version"],
"github_url": "https://github.com/pandas-dev/pandas",
"twitter_url": "https://twitter.com/pandas_dev",
"analytics": {"google_analytics_id": "UA-27880019-2"},
"analytics": {"google_analytics_id": "G-5RE31C1RNW"},
"logo": {"image_dark": "https://pandas.pydata.org/static/img/pandas_white.svg"},
"navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"],
"switcher": {
2 changes: 1 addition & 1 deletion doc/source/development/contributing_codebase.rst
@@ -764,7 +764,7 @@ install pandas) by typing::
your installation is probably fine and you can start contributing!

Often it is worth running only a subset of tests first around your changes before running the
entire suite (tip: you can use the [pandas-coverage app](https://pandas-coverage.herokuapp.com/))
entire suite (tip: you can use the [pandas-coverage app](https://pandas-coverage-12d2130077bc.herokuapp.com/))
to find out which tests hit the lines of code you've modified, and then run only those).

The easiest way to do this is with::
6 changes: 3 additions & 3 deletions doc/source/getting_started/comparison/comparison_with_r.rst
@@ -246,7 +246,7 @@ In pandas we may use :meth:`~pandas.pivot_table` method to handle this:
}
)
baseball.pivot_table(values="batting avg", columns="team", aggfunc=np.max)
baseball.pivot_table(values="batting avg", columns="team", aggfunc="max")
For more details and examples see :ref:`the reshaping documentation
<reshaping.pivot>`.
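The R-comparison page now passes the string alias `"max"` instead of `np.max`. A minimal runnable sketch of the same call, with invented data:

```python
import pandas as pd

baseball = pd.DataFrame(
    {
        "team": ["A", "A", "B", "B"],
        "batting avg": [0.250, 0.300, 0.275, 0.320],
    }
)
# String alias instead of np.max, as in the updated docs
table = baseball.pivot_table(values="batting avg", columns="team", aggfunc="max")
```

With no `index` argument, the result is a one-row DataFrame indexed by the values column, one column per team.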
@@ -359,7 +359,7 @@ In pandas the equivalent expression, using the
)
grouped = df.groupby(["month", "week"])
grouped["x"].agg([np.mean, np.std])
grouped["x"].agg(["mean", "std"])
For more details and examples see :ref:`the groupby documentation
@@ -482,7 +482,7 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`:
values="value",
index=["variable", "week"],
columns=["month"],
aggfunc=np.mean,
aggfunc="mean",
)
Similarly for ``dcast`` which uses a data.frame called ``df`` in R to
4 changes: 2 additions & 2 deletions doc/source/getting_started/comparison/comparison_with_sql.rst
@@ -198,7 +198,7 @@ to your grouped DataFrame, indicating which functions to apply to specific colum
.. ipython:: python
tips.groupby("day").agg({"tip": np.mean, "day": np.size})
tips.groupby("day").agg({"tip": "mean", "day": "size"})
Grouping by more than one column is done by passing a list of columns to the
:meth:`~pandas.DataFrame.groupby` method.
@@ -222,7 +222,7 @@ Grouping by more than one column is done by passing a list of columns to the
.. ipython:: python
tips.groupby(["smoker", "day"]).agg({"tip": [np.size, np.mean]})
tips.groupby(["smoker", "day"]).agg({"tip": ["size", "mean"]})
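The SQL-comparison page follows the same pattern: string aliases replace `np.size`/`np.mean` inside the `agg` dict. A small sketch with invented data:

```python
import pandas as pd

tips = pd.DataFrame(
    {
        "day": ["Sun", "Sun", "Mon"],
        "smoker": ["No", "Yes", "No"],
        "tip": [1.0, 2.0, 3.0],
    }
)
# String aliases instead of np.size / np.mean; the result gets
# MultiIndex columns like ("tip", "size") and ("tip", "mean").
out = tips.groupby("day").agg({"tip": ["size", "mean"]})
```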
.. _compare_with_sql.join:

10 changes: 10 additions & 0 deletions doc/source/user_guide/10min.rst
@@ -16,6 +16,16 @@ Customarily, we import as follows:
import numpy as np
import pandas as pd
Basic data structures in pandas
-------------------------------

Pandas provides two types of classes for handling data:

1. :class:`Series`: a one-dimensional labeled array holding data of any type
such as integers, strings, Python objects etc.
2. :class:`DataFrame`: a two-dimensional data structure that holds data like
a two-dimension array or a table with rows and columns.
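The two classes the new section introduces can be sketched as follows (values invented for illustration):

```python
import pandas as pd

# Series: a one-dimensional labeled array holding data of any type
s = pd.Series([1, 3, 5], name="numbers")

# DataFrame: a two-dimensional table with rows and columns
df = pd.DataFrame({"numbers": [1, 3, 5], "letters": ["a", "b", "c"]})
```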

Object creation
---------------

6 changes: 3 additions & 3 deletions doc/source/user_guide/basics.rst
@@ -881,8 +881,8 @@ statistics methods, takes an optional ``axis`` argument:

.. ipython:: python
df.apply(np.mean)
df.apply(np.mean, axis=1)
df.apply(lambda x: np.mean(x))
df.apply(lambda x: np.mean(x), axis=1)
df.apply(lambda x: x.max() - x.min())
df.apply(np.cumsum)
df.apply(np.exp)
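The updated examples wrap `np.mean` in a lambda instead of passing the NumPy function to `apply` directly. A self-contained sketch of the idioms above, with invented data:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 5.0]})

col_means = df.apply(lambda x: np.mean(x))          # one value per column
row_means = df.apply(lambda x: np.mean(x), axis=1)  # one value per row
spread = df.apply(lambda x: x.max() - x.min())      # column-wise range
```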
@@ -986,7 +986,7 @@ output:

.. ipython:: python
tsdf.agg(np.sum)
tsdf.agg(lambda x: np.sum(x))
tsdf.agg("sum")
6 changes: 3 additions & 3 deletions doc/source/user_guide/cookbook.rst
@@ -530,7 +530,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
code_groups = df.groupby("code")
agg_n_sort_order = code_groups[["data"]].transform(sum).sort_values(by="data")
agg_n_sort_order = code_groups[["data"]].transform("sum").sort_values(by="data")
sorted_df = df.loc[agg_n_sort_order.index]
@@ -549,7 +549,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
return x.iloc[1] * 1.234
return pd.NaT
mhc = {"Mean": np.mean, "Max": np.max, "Custom": MyCust}
mhc = {"Mean": "mean", "Max": "max", "Custom": MyCust}
ts.resample("5min").apply(mhc)
ts
@@ -685,7 +685,7 @@ The :ref:`Pivot <reshaping.pivot>` docs.
values=["Sales"],
index=["Province"],
columns=["City"],
aggfunc=np.sum,
aggfunc="sum",
margins=True,
)
table.stack("City")
6 changes: 3 additions & 3 deletions doc/source/user_guide/groupby.rst
@@ -878,7 +878,7 @@ will be broadcast across the group.
grouped.transform("sum")
In addition to string aliases, the :meth:`~.DataFrameGroupBy.transform` method can
also except User-Defined functions (UDFs). The UDF must:
also accept User-Defined Functions (UDFs). The UDF must:

* Return a result that is either the same size as the group chunk or
broadcastable to the size of the group chunk (e.g., a scalar,
@@ -1363,7 +1363,7 @@ implementation headache).
Grouping with ordered factors
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Categorical variables represented as instance of pandas's ``Categorical`` class
Categorical variables represented as instances of pandas's ``Categorical`` class
can be used as group keys. If so, the order of the levels will be preserved:

.. ipython:: python
@@ -1496,7 +1496,7 @@ You can also select multiple rows from each group by specifying multiple nth values
# get the first, 4th, and last date index for each month
df.groupby([df.index.year, df.index.month]).nth([0, 3, -1])
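A small runnable sketch of `nth` with a list, using invented data (behavior as in pandas 2.x, where `nth` acts as a filter and keeps the original index):

```python
import pandas as pd

df = pd.DataFrame({"g": ["x", "x", "x", "y", "y"], "v": [1, 2, 3, 4, 5]})

# First and last row of each group
out = df.groupby("g").nth([0, -1])
```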
You may also use a slices or lists of slices.
You may also use slices or lists of slices.

.. ipython:: python
15 changes: 7 additions & 8 deletions doc/source/user_guide/io.rst
@@ -1568,8 +1568,7 @@ class of the csv module. For this, you have to specify ``sep=None``.
.. ipython:: python
df = pd.DataFrame(np.random.randn(10, 4))
df.to_csv("tmp.csv", sep="|")
df.to_csv("tmp2.csv", sep=":")
df.to_csv("tmp2.csv", sep=":", index=False)
pd.read_csv("tmp2.csv", sep=None, engine="python")
.. ipython:: python
@@ -1597,8 +1596,8 @@ rather than reading the entire file into memory, such as the following:
.. ipython:: python
df = pd.DataFrame(np.random.randn(10, 4))
df.to_csv("tmp.csv", sep="|")
table = pd.read_csv("tmp.csv", sep="|")
df.to_csv("tmp.csv", index=False)
table = pd.read_csv("tmp.csv")
table
@@ -1607,8 +1606,8 @@ value will be an iterable object of type ``TextFileReader``:

.. ipython:: python
with pd.read_csv("tmp.csv", sep="|", chunksize=4) as reader:
reader
with pd.read_csv("tmp.csv", chunksize=4) as reader:
print(reader)
for chunk in reader:
print(chunk)
@@ -1620,8 +1619,8 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object:

.. ipython:: python
with pd.read_csv("tmp.csv", sep="|", iterator=True) as reader:
reader.get_chunk(5)
with pd.read_csv("tmp.csv", iterator=True) as reader:
print(reader.get_chunk(5))
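A self-contained sketch of the `chunksize`/`iterator` patterns shown above, using an in-memory buffer instead of `tmp.csv`:

```python
import io
import pandas as pd

csv_data = "a,b\n1,2\n3,4\n5,6\n7,8\n9,10\n"

# chunksize returns a TextFileReader, usable as a context manager;
# iterating yields DataFrames of at most `chunksize` rows each.
with pd.read_csv(io.StringIO(csv_data), chunksize=2) as reader:
    chunks = list(reader)

# iterator=True gives the same object; get_chunk pulls n rows on demand
with pd.read_csv(io.StringIO(csv_data), iterator=True) as reader:
    first = reader.get_chunk(3)
```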
.. ipython:: python
:suppress:
10 changes: 5 additions & 5 deletions doc/source/user_guide/reshaping.rst
@@ -402,12 +402,12 @@ We can produce pivot tables from this data very easily:
.. ipython:: python
pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])
pd.pivot_table(df, values="D", index=["B"], columns=["A", "C"], aggfunc=np.sum)
pd.pivot_table(df, values="D", index=["B"], columns=["A", "C"], aggfunc="sum")
pd.pivot_table(
df, values=["D", "E"],
index=["B"],
columns=["A", "C"],
aggfunc=np.sum,
aggfunc="sum",
)
The result object is a :class:`DataFrame` having potentially hierarchical indexes on the
@@ -451,7 +451,7 @@ rows and columns:
columns="C",
values=["D", "E"],
margins=True,
aggfunc=np.std
aggfunc="std"
)
table
@@ -552,7 +552,7 @@ each group defined by the first two :class:`Series`:

.. ipython:: python
pd.crosstab(df["A"], df["B"], values=df["C"], aggfunc=np.sum)
pd.crosstab(df["A"], df["B"], values=df["C"], aggfunc="sum")
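A minimal sketch of `crosstab` with the string aggfunc, using invented data:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "two"],
        "B": ["x", "y", "x", "y"],
        "C": [1.0, 2.0, 3.0, 4.0],
    }
)
# Sum C within each (A, B) cell; "sum" replaces np.sum as in the updated docs
out = pd.crosstab(df["A"], df["B"], values=df["C"], aggfunc="sum")
```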
Adding margins
~~~~~~~~~~~~~~
@@ -562,7 +562,7 @@ Finally, one can also add margins or normalize this output.
.. ipython:: python
pd.crosstab(
df["A"], df["B"], values=df["C"], aggfunc=np.sum, normalize=True, margins=True
df["A"], df["B"], values=df["C"], aggfunc="sum", normalize=True, margins=True
)
.. _reshaping.tile:
6 changes: 3 additions & 3 deletions doc/source/user_guide/timeseries.rst
@@ -1801,22 +1801,22 @@ You can pass a list or dict of functions to do aggregation with, outputting a ``

.. ipython:: python
r["A"].agg([np.sum, np.mean, np.std])
r["A"].agg(["sum", "mean", "std"])
On a resampled ``DataFrame``, you can pass a list of functions to apply to each
column, which produces an aggregated result with a hierarchical index:

.. ipython:: python
r.agg([np.sum, np.mean])
r.agg(["sum", "mean"])
By passing a dict to ``aggregate`` you can apply a different aggregation to the
columns of a ``DataFrame``:

.. ipython:: python
:okexcept:
r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)})
r.agg({"A": "sum", "B": lambda x: np.std(x, ddof=1)})
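A self-contained sketch of the dict form above, with invented data (a daily index and `2D` bins standing in for the resampler `r` used in the docs):

```python
import numpy as np
import pandas as pd

idx = pd.date_range("2023-01-01", periods=6, freq="D")
df = pd.DataFrame({"A": range(6), "B": range(6)}, index=idx, dtype=float)

r = df.resample("2D")
# String alias for A, a custom callable for B, as in the updated example
out = r.agg({"A": "sum", "B": lambda x: np.std(x, ddof=1)})
```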
The function names can also be strings. In order for a string to be valid it
must be implemented on the resampled object:
2 changes: 1 addition & 1 deletion doc/source/user_guide/window.rst
@@ -140,7 +140,7 @@ of multiple aggregations applied to a window.
.. ipython:: python
df = pd.DataFrame({"A": range(5), "B": range(10, 15)})
df.expanding().agg([np.sum, np.mean, np.std])
df.expanding().agg(["sum", "mean", "std"])
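A runnable sketch of the expanding aggregation above (the result has MultiIndex columns, one per input column and aggregation):

```python
import pandas as pd

df = pd.DataFrame({"A": range(5), "B": range(10, 15)})
# String aliases replace np.sum / np.mean / np.std, as in the updated docs
out = df.expanding().agg(["sum", "mean", "std"])
```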
.. _window.generic:
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.14.0.rst
@@ -846,7 +846,7 @@ Enhancements
df.pivot_table(values='Quantity',
index=pd.Grouper(freq='M', key='Date'),
columns=pd.Grouper(freq='M', key='PayDay'),
aggfunc=np.sum)
aggfunc="sum")
- Arrays of strings can be wrapped to a specified width (``str.wrap``) (:issue:`6999`)
- Add :meth:`~Series.nsmallest` and :meth:`Series.nlargest` methods to Series, See :ref:`the docs <basics.nsorted>` (:issue:`3960`)