
Commit b314627

CLN: TODOs and FIXMEs (pandas-dev#44455)
1 parent a56f6ee commit b314627

19 files changed, 205 additions and 155 deletions


pandas/core/array_algos/quantile.py

Lines changed: 10 additions & 5 deletions
@@ -4,7 +4,10 @@
 
 import numpy as np
 
-from pandas._typing import ArrayLike
+from pandas._typing import (
+    ArrayLike,
+    npt,
+)
 
 from pandas.core.dtypes.common import is_sparse
 from pandas.core.dtypes.missing import (
@@ -18,7 +21,9 @@
 from pandas.core.arrays import ExtensionArray
 
 
-def quantile_compat(values: ArrayLike, qs: np.ndarray, interpolation: str) -> ArrayLike:
+def quantile_compat(
+    values: ArrayLike, qs: npt.NDArray[np.float64], interpolation: str
+) -> ArrayLike:
     """
     Compute the quantiles of the given values for each quantile in `qs`.
 
@@ -55,7 +60,7 @@ def _quantile_with_mask(
     values: np.ndarray,
     mask: np.ndarray,
     fill_value,
-    qs: np.ndarray,
+    qs: npt.NDArray[np.float64],
     interpolation: str,
 ) -> np.ndarray:
     """
@@ -112,7 +117,7 @@ def _quantile_with_mask(
 
 
 def _quantile_ea_compat(
-    values: ExtensionArray, qs: np.ndarray, interpolation: str
+    values: ExtensionArray, qs: npt.NDArray[np.float64], interpolation: str
 ) -> ExtensionArray:
     """
     ExtensionArray compatibility layer for _quantile_with_mask.
@@ -158,7 +163,7 @@ def _quantile_ea_compat(
 
 
 def _quantile_ea_fallback(
-    values: ExtensionArray, qs: np.ndarray, interpolation: str
+    values: ExtensionArray, qs: npt.NDArray[np.float64], interpolation: str
 ) -> ExtensionArray:
     """
     quantile compatibility for ExtensionArray subclasses that do not
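Illustration (not part of the commit): the tightened annotations above use the parameterized `NDArray` alias from `numpy.typing`, exposed by pandas as `npt` in `pandas._typing`, to state that `qs` is specifically a float64 array. A minimal sketch of the same style, with a made-up helper name:

# Sketch only; `describe_quantiles` is a hypothetical helper, not pandas API.
import numpy as np
import numpy.typing as npt


def describe_quantiles(qs: npt.NDArray[np.float64]) -> str:
    # A type checker can now flag callers that pass e.g. an object or int array.
    return f"{qs.size} quantile(s), dtype={qs.dtype}"


qs = np.asarray([0.25, 0.5, 0.75], dtype=np.float64)
print(describe_quantiles(qs))  # 3 quantile(s), dtype=float64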

pandas/core/dtypes/cast.py

Lines changed: 1 addition & 0 deletions
@@ -1584,6 +1584,7 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
             value = try_datetime(v)  # type: ignore[assignment]
 
     if value.dtype.kind in ["m", "M"] and seen_str:
+        # TODO(2.0): enforcing this deprecation should close GH#40111
        warnings.warn(
            f"Inferring {value.dtype} from data containing strings is deprecated "
            "and will be removed in a future version. To retain the old behavior "

pandas/core/indexing.py

Lines changed: 3 additions & 1 deletion
@@ -1902,11 +1902,13 @@ def _setitem_single_block(self, indexer, value, name: str):
         ):
             col = item_labels[indexer[info_axis]]
             if len(item_labels.get_indexer_for([col])) == 1:
+                # e.g. test_loc_setitem_empty_append_expands_rows
                 loc = item_labels.get_loc(col)
                 self.obj._iset_item(loc, value, inplace=True)
                 return
 
-        indexer = maybe_convert_ix(*indexer)
+        indexer = maybe_convert_ix(*indexer)  # e.g. test_setitem_frame_align
+
         if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict):
             # TODO(EA): ExtensionBlock.setitem this causes issues with
             # setting for extensionarrays that store dicts. Need to decide
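Illustration (not part of the commit): the comments added above point at tests that set values through a single block, including expansion on an initially empty frame. A loose sketch of that kind of `.loc` setitem:

# Sketch only; a simplified variant of the expanding-setitem scenario the
# referenced test name describes (the real test assigns a whole column at once).
import pandas as pd

df = pd.DataFrame(columns=["x", "y"])
df.loc[0] = [1, 2]  # assigning to a missing label appends a row
df.loc[1] = [3, 4]
print(df)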

pandas/core/internals/blocks.py

Lines changed: 2 additions & 7 deletions
@@ -930,10 +930,7 @@ def setitem(self, indexer, value):
             value = setitem_datetimelike_compat(values, len(values[indexer]), value)
         values[indexer] = value
 
-        if transpose:
-            values = values.T
-        block = type(self)(values, placement=self._mgr_locs, ndim=self.ndim)
-        return block
+        return self
 
     def putmask(self, mask, new) -> list[Block]:
         """
@@ -961,9 +958,7 @@ def putmask(self, mask, new) -> list[Block]:
             new = self.fill_value
 
         if self._can_hold_element(new):
-            # error: Argument 1 to "putmask_without_repeat" has incompatible type
-            # "Union[ndarray, ExtensionArray]"; expected "ndarray"
-            putmask_without_repeat(self.values.T, mask, new)  # type: ignore[arg-type]
+            putmask_without_repeat(values.T, mask, new)
             return [self]
 
         elif noop:
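Illustration (not part of the commit): `putmask_without_repeat` is a pandas-internal helper; the underlying idea is NumPy-style masked assignment, shown here with plain `np.putmask`:

# Sketch only; np.putmask assigns `new` wherever `mask` is True, in place.
import numpy as np

values = np.array([1, 2, 3, 4])
mask = np.array([True, False, True, False])
np.putmask(values, mask, 99)
print(values)  # [99  2 99  4]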

pandas/core/internals/managers.py

Lines changed: 2 additions & 2 deletions
@@ -861,9 +861,9 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
         """
         # We have 6 tests that get here with a slice
         indexer = (
-            np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64")
+            np.arange(indexer.start, indexer.stop, indexer.step, dtype=np.intp)
             if isinstance(indexer, slice)
-            else np.asanyarray(indexer, dtype="int64")
+            else np.asanyarray(indexer, dtype=np.intp)
         )
 
         n = self.shape[axis]
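Illustration (not part of the commit): `np.intp` is the pointer-sized integer type NumPy itself uses for indexing, which is what replaces the hard-coded "int64" string above:

# Sketch only; shows the two conversions the diff switches to np.intp.
import numpy as np

from_slice = np.arange(0, 10, 2, dtype=np.intp)
from_list = np.asanyarray([3, 1, 2], dtype=np.intp)
print(from_slice.dtype, from_list.dtype)  # intp on both; 64-bit on most platforms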

pandas/core/ops/missing.py

Lines changed: 6 additions & 9 deletions
@@ -34,7 +34,7 @@
 from pandas.core.ops import roperator
 
 
-def fill_zeros(result, x, y):
+def _fill_zeros(result, x, y):
     """
     If this is a reversed op, then flip x,y
 
@@ -102,9 +102,6 @@ def mask_zero_div_zero(x, y, result: np.ndarray) -> np.ndarray:
     >>> mask_zero_div_zero(x, y, result)
     array([ inf, nan, -inf])
     """
-    if not isinstance(result, np.ndarray):
-        # FIXME: SparseArray would raise TypeError with np.putmask
-        return result
 
     if is_scalar(y):
         y = np.array(y)
@@ -141,7 +138,7 @@ def mask_zero_div_zero(x, y, result: np.ndarray) -> np.ndarray:
 
 def dispatch_fill_zeros(op, left, right, result):
     """
-    Call fill_zeros with the appropriate fill value depending on the operation,
+    Call _fill_zeros with the appropriate fill value depending on the operation,
     with special logic for divmod and rdivmod.
 
     Parameters
@@ -163,12 +160,12 @@ def dispatch_fill_zeros(op, left, right, result):
     if op is divmod:
         result = (
             mask_zero_div_zero(left, right, result[0]),
-            fill_zeros(result[1], left, right),
+            _fill_zeros(result[1], left, right),
         )
     elif op is roperator.rdivmod:
         result = (
             mask_zero_div_zero(right, left, result[0]),
-            fill_zeros(result[1], right, left),
+            _fill_zeros(result[1], right, left),
         )
     elif op is operator.floordiv:
         # Note: no need to do this for truediv; in py3 numpy behaves the way
@@ -179,7 +176,7 @@ def dispatch_fill_zeros(op, left, right, result):
         # we want.
         result = mask_zero_div_zero(right, left, result)
     elif op is operator.mod:
-        result = fill_zeros(result, left, right)
+        result = _fill_zeros(result, left, right)
     elif op is roperator.rmod:
-        result = fill_zeros(result, right, left)
+        result = _fill_zeros(result, right, left)
     return result
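Illustration (not part of the commit): these helpers implement pandas' divide-by-zero semantics for integer operations. Seen through the public API, assuming default behavior:

# Sketch only; NumPy may also emit a RuntimeWarning for the zero division.
import pandas as pd

ser = pd.Series([1, 0, -1])
print(ser // 0)  # masked to [inf, nan, -inf], matching the docstring above
print(ser % 0)   # modulo by zero is filled with NaN via the _fill_zeros path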

pandas/tests/base/test_value_counts.py

Lines changed: 15 additions & 14 deletions
@@ -1,6 +1,5 @@
 import collections
 from datetime import timedelta
-from io import StringIO
 
 import numpy as np
 import pytest
@@ -190,19 +189,21 @@ def test_value_counts_datetime64(index_or_series):
 
     # GH 3002, datetime64[ns]
    # don't test names though
-    txt = "\n".join(
-        [
-            "xxyyzz20100101PIE",
-            "xxyyzz20100101GUM",
-            "xxyyzz20100101EGG",
-            "xxyyww20090101EGG",
-            "foofoo20080909PIE",
-            "foofoo20080909GUM",
-        ]
-    )
-    f = StringIO(txt)
-    df = pd.read_fwf(
-        f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"]
+    df = pd.DataFrame(
+        {
+            "person_id": ["xxyyzz", "xxyyzz", "xxyyzz", "xxyyww", "foofoo", "foofoo"],
+            "dt": pd.to_datetime(
+                [
+                    "2010-01-01",
+                    "2010-01-01",
+                    "2010-01-01",
+                    "2009-01-01",
+                    "2008-09-09",
+                    "2008-09-09",
+                ]
+            ),
+            "food": ["PIE", "GUM", "EGG", "EGG", "PIE", "GUM"],
+        }
     )
 
     s = klass(df["dt"].copy())
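Illustration (not part of the commit): the rewritten fixture feeds repeated datetime64 values into `value_counts`, roughly like this:

# Sketch only; same dates as the new fixture, counted directly.
import pandas as pd

dt = pd.to_datetime(
    ["2010-01-01", "2010-01-01", "2010-01-01", "2009-01-01", "2008-09-09", "2008-09-09"]
)
print(pd.Series(dt).value_counts())  # 2010-01-01: 3, 2008-09-09: 2, 2009-01-01: 1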

pandas/tests/frame/methods/test_combine_first.py

Lines changed: 3 additions & 3 deletions
@@ -209,15 +209,15 @@ def test_combine_first_align_nan(self):
         )
         tm.assert_frame_equal(res, exp)
         assert res["a"].dtype == "datetime64[ns]"
-        # ToDo: this must be int64
+        # TODO: this must be int64
         assert res["b"].dtype == "int64"
 
         res = dfa.iloc[:0].combine_first(dfb)
         exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"])
         tm.assert_frame_equal(res, exp)
-        # ToDo: this must be datetime64
+        # TODO: this must be datetime64
         assert res["a"].dtype == "float64"
-        # ToDo: this must be int64
+        # TODO: this must be int64
         assert res["b"].dtype == "int64"
 
     def test_combine_first_timezone(self):
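Illustration (not part of the commit): the TODO comments above track dtype preservation in `combine_first`, where columns that pick up missing values after alignment get upcast:

# Sketch only; made-up frames showing the int -> float upcast the TODOs track.
import pandas as pd

a = pd.DataFrame({"b": [1, 2]}, index=[0, 1])
b = pd.DataFrame({"c": [9, 9, 9]}, index=[0, 1, 2])
res = a.combine_first(b)
print(res.dtypes)  # "b" becomes float64 because index 2 has no value to take from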

pandas/tests/frame/methods/test_replace.py

Lines changed: 8 additions & 7 deletions
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 from datetime import datetime
-from io import StringIO
 import re
 
 import numpy as np
@@ -912,12 +911,14 @@ def test_replace_dict_tuple_list_ordering_remains_the_same(self):
         tm.assert_frame_equal(res3, expected)
 
     def test_replace_doesnt_replace_without_regex(self):
-        raw = """fol T_opp T_Dir T_Enh
-        0    1     0     0    vo
-        1    2    vr     0    0
-        2    2     0     0    0
-        3    3     0    bt    0"""
-        df = pd.read_csv(StringIO(raw), sep=r"\s+")
+        df = DataFrame(
+            {
+                "fol": [1, 2, 2, 3],
+                "T_opp": ["0", "vr", "0", "0"],
+                "T_Dir": ["0", "0", "0", "bt"],
+                "T_Enh": ["vo", "0", "0", "0"],
+            }
+        )
         res = df.replace({r"\D": 1})
         tm.assert_frame_equal(df, res)
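Illustration (not part of the commit): what the rewritten test asserts. Without `regex=True` the pattern is treated as a literal string, so nothing matches and the frame is unchanged:

# Sketch only; a trimmed-down version of the fixture above.
import pandas as pd

df = pd.DataFrame({"T_opp": ["0", "vr", "0", "0"]})
print(df.replace({r"\D": 1}))            # unchanged: "\D" is taken literally
print(df.replace(r"\D", 1, regex=True))  # cells containing non-digits become 1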

pandas/tests/frame/test_block_internals.py

Lines changed: 22 additions & 9 deletions
@@ -2,7 +2,6 @@
     datetime,
     timedelta,
 )
-from io import StringIO
 import itertools
 
 import numpy as np
@@ -289,15 +288,29 @@ def test_pickle(self, float_string_frame, timezone_frame):
     def test_consolidate_datetime64(self):
         # numpy vstack bug
 
-        data = (
-            "starting,ending,measure\n"
-            "2012-06-21 00:00,2012-06-23 07:00,77\n"
-            "2012-06-23 07:00,2012-06-23 16:30,65\n"
-            "2012-06-23 16:30,2012-06-25 08:00,77\n"
-            "2012-06-25 08:00,2012-06-26 12:00,0\n"
-            "2012-06-26 12:00,2012-06-27 08:00,77\n"
+        df = DataFrame(
+            {
+                "starting": pd.to_datetime(
+                    [
+                        "2012-06-21 00:00",
+                        "2012-06-23 07:00",
+                        "2012-06-23 16:30",
+                        "2012-06-25 08:00",
+                        "2012-06-26 12:00",
+                    ]
+                ),
+                "ending": pd.to_datetime(
+                    [
+                        "2012-06-23 07:00",
+                        "2012-06-23 16:30",
+                        "2012-06-25 08:00",
+                        "2012-06-26 12:00",
+                        "2012-06-27 08:00",
+                    ]
+                ),
+                "measure": [77, 65, 77, 0, 77],
+            }
         )
-        df = pd.read_csv(StringIO(data), parse_dates=[0, 1])
 
         ser_starting = df.starting
         ser_starting.index = ser_starting.values
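Illustration (not part of the commit): the pattern applied throughout this commit's test changes is to build fixture frames directly instead of parsing a CSV or fixed-width string, which keeps the tests independent of the IO code path:

# Sketch only; both routes should produce the same frame for this tiny input.
from io import StringIO

import pandas as pd

csv = "starting,measure\n2012-06-21 00:00,77\n2012-06-23 07:00,65\n"
via_csv = pd.read_csv(StringIO(csv), parse_dates=["starting"])
direct = pd.DataFrame(
    {
        "starting": pd.to_datetime(["2012-06-21 00:00", "2012-06-23 07:00"]),
        "measure": [77, 65],
    }
)
print(via_csv.equals(direct))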
