BUG: AttributeError: 'BooleanArray' object has no attribute 'sum' while infer types pandas-dev#44079 (pandas-dev#44442)

zhangxiaoxing · web-flow · commit c676844ff45f · 2021-11-20T10:58:54.000-05:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -646,6 +646,7 @@ I/O
 - Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`)
 - Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`)
 - Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`)
+- Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`)
 - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`)
 
 Period
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -705,7 +705,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):
             # error: Argument 2 to "isin" has incompatible type "List[Any]"; expected
             # "Union[Union[ExtensionArray, ndarray], Index, Series]"
             mask = algorithms.isin(values, list(na_values))  # type: ignore[arg-type]
-            na_count = mask.sum()
+            na_count = mask.astype("uint8", copy=False).sum()
             if na_count > 0:
                 if is_integer_dtype(values):
                     values = values.astype(np.float64)
diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
@@ -297,3 +297,27 @@ def test_multiindex_columns_index_col_with_data(all_parsers):
         index=Index(["data"]),
     )
     tm.assert_frame_equal(result, expected)
+
+
+@skip_pyarrow
+def test_infer_types_boolean_sum(all_parsers):
+    # GH#44079
+    parser = all_parsers
+    result = parser.read_csv(
+        StringIO("0,1"),
+        names=["a", "b"],
+        index_col=["a"],
+        dtype={"a": "UInt8"},
+    )
+    expected = DataFrame(
+        data={
+            "a": [
+                0,
+            ],
+            "b": [1],
+        }
+    ).set_index("a")
+    # Not checking index type now, because the C parser will return a
+    # index column of dtype 'object', and the Python parser will return a
+    # index column of dtype 'int64'.
+    tm.assert_frame_equal(result, expected, check_index_type=False)