Skip to content

Commit 3313f23

Browse files
committed
API: Infer extension types in array
* string * integer
1 parent 816f3df commit 3313f23

File tree

6 files changed

+108
-24
lines changed

6 files changed

+108
-24
lines changed

doc/source/user_guide/integer_na.rst

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,7 @@ numbers.
2525

2626
Pandas can represent integer data with possibly missing values using
2727
:class:`arrays.IntegerArray`. This is an :ref:`extension type <extending.extension-types>`
28-
implemented within pandas. It is not the default dtype for integers, and will not be inferred;
29-
you must explicitly pass the dtype into :meth:`array` or :class:`Series`:
28+
implemented within pandas.
3029

3130
.. ipython:: python
3231
@@ -50,17 +49,34 @@ NumPy array.
5049
You can also pass the list-like object to the :class:`Series` constructor
5150
with the dtype.
5251

53-
.. ipython:: python
52+
.. warning::
5453

55-
s = pd.Series([1, 2, np.nan], dtype="Int64")
56-
s
54+
Currently :meth:`pandas.array` and :class:`pandas.Series` use different
55+
rules for dtype inference. :meth:`pandas.array` will infer a nullable-
56+
integer dtype:
5757

58-
By default (if you don't specify ``dtype``), NumPy is used, and you'll end
59-
up with a ``float64`` dtype Series:
58+
.. ipython:: python
6059
61-
.. ipython:: python
60+
pd.array([1, None])
61+
pd.array([1, 2])
62+
63+
For backwards-compatibility, :class:`Series` infers these as either
64+
integer or float dtype:
65+
66+
.. ipython:: python
67+
68+
pd.Series([1, None])
69+
pd.Series([1, 2])
70+
71+
We recommend explicitly providing the dtype to avoid confusion.
72+
73+
.. ipython:: python
74+
75+
pd.array([1, None], dtype="Int64")
76+
pd.Series([1, None], dtype="Int64")
6277
63-
pd.Series([1, 2, np.nan])
78+
In the future, we may provide an option for :class:`Series` to infer a
79+
nullable-integer dtype.
6480

6581
Operations involving an integer array will behave similarly to NumPy arrays.
6682
Missing values will be propagated, and the data will be coerced to another

doc/source/whatsnew/v1.0.0.rst

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,37 @@ The following methods now also correctly output values for unobserved categories
234234
235235
df.groupby(["cat_1", "cat_2"], observed=False)["value"].count()
236236
237+
:meth:`pandas.array` inference changes
238+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
239+
240+
:meth:`pandas.array` now infers pandas' new extension types in several cases:
241+
242+
1. String data (including missing values) now returns a :class:`arrays.StringArray`.
243+
2. Integer data (including missing values) now returns a :class:`arrays.IntegerArray`.
244+
245+
*pandas 0.25.x*
246+
247+
.. code-block:: python
248+
249+
>>> pd.array(["a", None])
250+
<PandasArray>
251+
['a', None]
252+
Length: 2, dtype: object
253+
254+
>>> pd.array([1, None])
255+
<PandasArray>
256+
[1, None]
257+
Length: 2, dtype: object
258+
259+
260+
*pandas 1.0.0*
261+
262+
.. ipython:: python
263+
264+
pd.array(["a", None])
265+
pd.array([1, None])
266+
267+
As a reminder, you can specify the ``dtype`` to disable all inference.
237268

238269
.. _whatsnew_1000.api_breaking.deps:
239270

pandas/_libs/lib.pyx

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1113,6 +1113,7 @@ def infer_dtype(value: object, skipna: object=None) -> str:
11131113
Results can include:
11141114

11151115
- string
1116+
- mixed-string
11161117
- unicode
11171118
- bytes
11181119
- floating
@@ -1319,8 +1320,11 @@ def infer_dtype(value: object, skipna: object=None) -> str:
13191320
return 'boolean'
13201321

13211322
elif isinstance(val, str):
1322-
if is_string_array(values, skipna=skipna):
1323-
return 'string'
1323+
if is_string_array(values, skipna=True):
1324+
if isnaobj(values).any():
1325+
return "mixed-string"
1326+
else:
1327+
return "string"
13241328

13251329
elif isinstance(val, bytes):
13261330
if is_bytes_array(values, skipna=skipna):

pandas/core/construction.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,11 +94,18 @@ def array(
9494
:class:`pandas.Period` :class:`pandas.arrays.PeriodArray`
9595
:class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray`
9696
:class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray`
97+
:class:`int` :class:`pandas.arrays.IntegerArray`
98+
:class:`str` :class:`pandas.arrays.StringArray`
9799
============================== =====================================
98100
99101
For all other cases, NumPy's usual inference rules will be used.
100102
101-
copy : bool, default True
103+
.. versionchanged:: 1.0.0
104+
105+
Pandas infers nullable-integer dtype for integer data and
106+
string dtype for string data.
107+
108+
copy : bool, default True
102109
Whether to copy the data, even if not necessary. Depending
103110
on the type of `data`, creating the new array may require
104111
copying data, even if ``copy=False``.
@@ -246,21 +253,25 @@ def array(
246253
"""
247254
from pandas.core.arrays import (
248255
period_array,
256+
IntegerArray,
249257
IntervalArray,
250258
PandasArray,
251259
DatetimeArray,
252260
TimedeltaArray,
261+
StringArray,
253262
)
254263

255264
if lib.is_scalar(data):
256265
msg = "Cannot pass scalar '{}' to 'pandas.array'."
257266
raise ValueError(msg.format(data))
258267

259-
data = extract_array(data, extract_numpy=True)
260-
261-
if dtype is None and isinstance(data, ABCExtensionArray):
268+
if dtype is None and isinstance(
269+
data, (ABCSeries, ABCIndexClass, ABCExtensionArray)
270+
):
262271
dtype = data.dtype
263272

273+
data = extract_array(data, extract_numpy=True)
274+
264275
# this returns None for not-found dtypes.
265276
if isinstance(dtype, str):
266277
dtype = registry.find(dtype) or dtype
@@ -298,6 +309,12 @@ def array(
298309
# timedelta, timedelta64
299310
return TimedeltaArray._from_sequence(data, copy=copy)
300311

312+
elif inferred_dtype in {"string", "mixed-string"}:
313+
return StringArray._from_sequence(data, copy=copy)
314+
315+
elif inferred_dtype in {"integer", "mixed-integer"}:
316+
return IntegerArray._from_sequence(data, copy=copy)
317+
301318
# TODO(BooleanArray): handle this type
302319

303320
# Pandas overrides NumPy for

pandas/tests/arrays/test_array.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,14 @@
1919
"data, dtype, expected",
2020
[
2121
# Basic NumPy defaults.
22-
([1, 2], None, PandasArray(np.array([1, 2]))),
22+
([1, 2], None, pd.arrays.IntegerArray._from_sequence([1, 2])),
2323
([1, 2], object, PandasArray(np.array([1, 2], dtype=object))),
2424
(
2525
[1, 2],
2626
np.dtype("float32"),
2727
PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
2828
),
29-
(np.array([1, 2]), None, PandasArray(np.array([1, 2]))),
29+
(np.array([1, 2]), None, pd.arrays.IntegerArray._from_sequence([1, 2])),
3030
# String alias passes through to NumPy
3131
([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))),
3232
# Period alias
@@ -113,6 +113,13 @@
113113
# IntegerNA
114114
([1, None], "Int16", integer_array([1, None], dtype="Int16")),
115115
(pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
116+
# String
117+
(["a", None], "string", pd.arrays.StringArray._from_sequence(["a", None])),
118+
(
119+
["a", None],
120+
pd.StringDtype(),
121+
pd.arrays.StringArray._from_sequence(["a", None]),
122+
),
116123
# Index
117124
(pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
118125
# Series[EA] returns the EA
@@ -139,15 +146,15 @@ def test_array(data, dtype, expected):
139146
def test_array_copy():
140147
a = np.array([1, 2])
141148
# default is to copy
142-
b = pd.array(a)
149+
b = pd.array(a, dtype=a.dtype)
143150
assert np.shares_memory(a, b._ndarray) is False
144151

145152
# copy=True
146-
b = pd.array(a, copy=True)
153+
b = pd.array(a, dtype=a.dtype, copy=True)
147154
assert np.shares_memory(a, b._ndarray) is False
148155

149156
# copy=False
150-
b = pd.array(a, copy=False)
157+
b = pd.array(a, dtype=a.dtype, copy=False)
151158
assert np.shares_memory(a, b._ndarray) is True
152159

153160

@@ -211,6 +218,12 @@ def test_array_copy():
211218
np.array([1, 2], dtype="m8[us]"),
212219
pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")),
213220
),
221+
# integer
222+
([1, 2], pd.arrays.IntegerArray._from_sequence([1, 2])),
223+
([1, None], pd.arrays.IntegerArray._from_sequence([1, None])),
224+
# string
225+
(["a", "b"], pd.arrays.StringArray._from_sequence(["a", "b"])),
226+
(["a", None], pd.arrays.StringArray._from_sequence(["a", None])),
214227
],
215228
)
216229
def test_array_inference(data, expected):
@@ -241,7 +254,7 @@ def test_array_inference_fails(data):
241254
@pytest.mark.parametrize("data", [np.array([[1, 2], [3, 4]]), [[1, 2], [3, 4]]])
242255
def test_nd_raises(data):
243256
with pytest.raises(ValueError, match="PandasArray must be 1-dimensional"):
244-
pd.array(data)
257+
pd.array(data, dtype="int64")
245258

246259

247260
def test_scalar_raises():

pandas/tests/dtypes/test_inference.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -732,12 +732,15 @@ def test_string(self):
732732
def test_unicode(self):
733733
arr = ["a", np.nan, "c"]
734734
result = lib.infer_dtype(arr, skipna=False)
735-
assert result == "mixed"
735+
assert result == "mixed-string"
736736

737737
arr = ["a", np.nan, "c"]
738738
result = lib.infer_dtype(arr, skipna=True)
739-
expected = "string"
740-
assert result == expected
739+
assert result == "string"
740+
741+
arr = ["a", "c"]
742+
result = lib.infer_dtype(arr, skipna=False)
743+
assert result == "string"
741744

742745
@pytest.mark.parametrize(
743746
"dtype, missing, skipna, expected",

0 commit comments

Comments
 (0)