Skip to content

REF: Separate out DataFrame/Series Construction Helpers #24100

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Dec 5, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
update imports, re-privatize
  • Loading branch information
jbrockmendel committed Dec 4, 2018
commit d9a1dc3a2eea74b0030a8f5a10e3b54a594e2269
4 changes: 2 additions & 2 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
values = maybe_infer_to_datetimelike(values, convert_dates=True)
if not isinstance(values, np.ndarray):
values = _convert_to_list_like(values)
from pandas.core.series import sanitize_array
from pandas.core.internals.construction import sanitize_array
# By convention, empty lists result in object dtype:
if len(values) == 0:
sanitize_dtype = 'object'
Expand Down Expand Up @@ -2442,7 +2442,7 @@ def isin(self, values):
>>> s.isin(['lama'])
array([ True, False, True, False, True, False])
"""
from pandas.core.series import sanitize_array
from pandas.core.internals.construction import sanitize_array
if not is_list_like(values):
raise TypeError("only list-like objects are allowed to be passed"
" to isin(), you passed a [{values_type}]"
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -631,7 +631,7 @@ def __init__(self, data, sparse_index=None, index=None, fill_value=None,
if not is_array_like(data):
try:
# probably shared code in sanitize_series
from pandas.core.series import sanitize_array
from pandas.core.internals.construction import sanitize_array
data = sanitize_array(data, index=None)
except ValueError:
# NumPy may raise a ValueError on data like [1, []]
Expand Down
26 changes: 13 additions & 13 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
dtype=dtype, copy=copy)
elif isinstance(data, dict):
mgr = self._init_dict(data, index, columns, dtype=dtype)
mgr = init_dict(data, index, columns, dtype=dtype)
elif isinstance(data, ma.MaskedArray):
import numpy.ma.mrecords as mrecords
# masked recarray
Expand All @@ -400,22 +400,22 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
data[mask] = fill_value
else:
data = data.copy()
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
copy=copy)
mgr = init_ndarray(data, index, columns, dtype=dtype,
copy=copy)

elif isinstance(data, (np.ndarray, Series, Index)):
if data.dtype.names:
data_columns = list(data.dtype.names)
data = {k: data[k] for k in data_columns}
if columns is None:
columns = data_columns
mgr = self._init_dict(data, index, columns, dtype=dtype)
mgr = init_dict(data, index, columns, dtype=dtype)
elif getattr(data, 'name', None) is not None:
mgr = self._init_dict({data.name: data}, index, columns,
dtype=dtype)
mgr = init_dict({data.name: data}, index, columns,
dtype=dtype)
else:
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
copy=copy)
mgr = init_ndarray(data, index, columns, dtype=dtype,
copy=copy)

# For data is list-like, or Iterable (will consume into list)
elif (isinstance(data, compat.Iterable)
Expand All @@ -441,10 +441,10 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
mgr = arrays_to_mgr(arrays, columns, index, columns,
dtype=dtype)
else:
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
copy=copy)
mgr = init_ndarray(data, index, columns, dtype=dtype,
copy=copy)
else:
mgr = self._init_dict({}, index, columns, dtype=dtype)
mgr = init_dict({}, index, columns, dtype=dtype)
else:
try:
arr = np.array(data, dtype=dtype, copy=copy)
Expand All @@ -456,8 +456,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
if arr.ndim == 0 and index is not None and columns is not None:
values = cast_scalar_to_array((len(index), len(columns)),
data, dtype=dtype)
mgr = self._init_ndarray(values, index, columns,
dtype=values.dtype, copy=False)
mgr = init_ndarray(values, index, columns,
dtype=values.dtype, copy=False)
else:
raise ValueError('DataFrame constructor not properly called!')

Expand Down
171 changes: 88 additions & 83 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
# ---------------------------------------------------------------------
# BlockManager Interface


def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
"""
Segregate Series based on type and coerce into matrices.
Expand All @@ -50,7 +51,7 @@ def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
index = ensure_index(index)

# don't force copy because getting jammed in an ndarray anyway
arrays = homogenize(arrays, index, dtype)
arrays = _homogenize(arrays, index, dtype)

# from BlockManager perspective
axes = [ensure_index(columns), index]
Expand Down Expand Up @@ -116,21 +117,6 @@ def init_ndarray(values, index, columns, dtype=None, copy=False):
if not len(values) and columns is not None and len(columns):
values = np.empty((0, 1), dtype=object)

# helper to create the axes as indexes
def _get_axes(N, K, index=index, columns=columns):
# return axes or defaults

if index is None:
index = ibase.default_index(N)
else:
index = ensure_index(index)

if columns is None:
columns = ibase.default_index(K)
else:
columns = ensure_index(columns)
return index, columns

# we could have a categorical type passed or coerced to 'category'
# recast this to an arrays_to_mgr
if (is_categorical_dtype(getattr(values, 'dtype', None)) or
Expand All @@ -142,7 +128,7 @@ def _get_axes(N, K, index=index, columns=columns):
elif copy:
values = values.copy()

index, columns = _get_axes(len(values), 1)
index, columns = _get_axes(len(values), 1, index, columns)
return arrays_to_mgr([values], columns, index, columns,
dtype=dtype)
elif (is_datetime64tz_dtype(values) or
Expand All @@ -167,7 +153,7 @@ def _get_axes(N, K, index=index, columns=columns):
orig=orig))
raise_with_traceback(e)

index, columns = _get_axes(*values.shape)
index, columns = _get_axes(*values.shape, index=index, columns=columns)
values = values.T

# if we don't have a dtype specified, then try to convert objects
Expand Down Expand Up @@ -255,7 +241,7 @@ def convert(v):
return values


def homogenize(data, index, dtype=None):
def _homogenize(data, index, dtype=None):
oindex = None
homogenized = []

Expand Down Expand Up @@ -363,6 +349,22 @@ def get_names_from_index(data):
return index


def _get_axes(N, K, index, columns):
    """Build the (index, columns) axes for an (N, K) frame.

    Either axis may be None, in which case a default index of the
    appropriate length is created; otherwise the supplied labels are
    coerced to an Index via ensure_index.

    Parameters
    ----------
    N : int
        Number of rows (length of the index axis).
    K : int
        Number of columns (length of the columns axis).
    index : sequence or None
    columns : sequence or None

    Returns
    -------
    tuple of (Index, Index)
    """
    index = ibase.default_index(N) if index is None else ensure_index(index)
    columns = (ibase.default_index(K) if columns is None
               else ensure_index(columns))
    return index, columns


# ---------------------------------------------------------------------
# Conversion of Inputs to Arrays

Expand All @@ -387,15 +389,15 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
return [[]] * len(columns), columns
return [], [] # columns if columns is not None else []
if isinstance(data[0], (list, tuple)):
return list_to_arrays(data, columns, coerce_float=coerce_float,
dtype=dtype)
return _list_to_arrays(data, columns, coerce_float=coerce_float,
dtype=dtype)
elif isinstance(data[0], compat.Mapping):
return list_of_dict_to_arrays(data, columns,
coerce_float=coerce_float, dtype=dtype)
return _list_of_dict_to_arrays(data, columns,
coerce_float=coerce_float, dtype=dtype)
elif isinstance(data[0], ABCSeries):
return list_of_series_to_arrays(data, columns,
coerce_float=coerce_float,
dtype=dtype)
return _list_of_series_to_arrays(data, columns,
coerce_float=coerce_float,
dtype=dtype)
elif isinstance(data[0], Categorical):
if columns is None:
columns = ibase.default_index(len(data))
Expand All @@ -409,21 +411,21 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
else:
# last ditch effort
data = lmap(tuple, data)
return list_to_arrays(data, columns, coerce_float=coerce_float,
dtype=dtype)
return _list_to_arrays(data, columns, coerce_float=coerce_float,
dtype=dtype)


def list_to_arrays(data, columns, coerce_float=False, dtype=None):
def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
if len(data) > 0 and isinstance(data[0], tuple):
content = list(lib.to_object_array_tuples(data).T)
else:
# list of lists
content = list(lib.to_object_array(data).T)
return convert_object_array(content, columns, dtype=dtype,
coerce_float=coerce_float)
return _convert_object_array(content, columns, dtype=dtype,
coerce_float=coerce_float)


def list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
if columns is None:
columns = _get_objs_combined_axis(data, sort=False)

Expand All @@ -447,13 +449,13 @@ def list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):

if values.dtype == np.object_:
content = list(values.T)
return convert_object_array(content, columns, dtype=dtype,
coerce_float=coerce_float)
return _convert_object_array(content, columns, dtype=dtype,
coerce_float=coerce_float)
else:
return values.T, columns


def list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
if columns is None:
gen = (list(x.keys()) for x in data)
sort = not any(isinstance(d, OrderedDict) for d in data)
Expand All @@ -464,11 +466,11 @@ def list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
data = [(type(d) is dict) and d or dict(d) for d in data]

content = list(lib.dicts_to_array(data, list(columns)).T)
return convert_object_array(content, columns, dtype=dtype,
coerce_float=coerce_float)
return _convert_object_array(content, columns, dtype=dtype,
coerce_float=coerce_float)


def convert_object_array(content, columns, coerce_float=False, dtype=None):
def _convert_object_array(content, columns, coerce_float=False, dtype=None):
if columns is None:
columns = ibase.default_index(len(content))
else:
Expand Down Expand Up @@ -539,45 +541,6 @@ def sanitize_array(data, index, dtype=None, copy=False,
else:
data = data.copy()

def _try_cast(arr, take_fast_path):

# perf shortcut as this is the most common case
if take_fast_path:
if maybe_castable(arr) and not copy and dtype is None:
return arr

try:
# GH#15832: Check if we are requesting a numeric dype and
# that we can convert the data to the requested dtype.
if is_integer_dtype(dtype):
subarr = maybe_cast_to_integer_array(arr, dtype)

subarr = maybe_cast_to_datetime(arr, dtype)
# Take care in creating object arrays (but iterators are not
# supported):
if is_object_dtype(dtype) and (is_list_like(subarr) and
not (is_iterator(subarr) or
isinstance(subarr, np.ndarray))):
subarr = construct_1d_object_array_from_listlike(subarr)
elif not is_extension_type(subarr):
subarr = construct_1d_ndarray_preserving_na(subarr, dtype,
copy=copy)
except (ValueError, TypeError):
if is_categorical_dtype(dtype):
# We *do* allow casting to categorical, since we know
# that Categorical is the only array type for 'category'.
subarr = Categorical(arr, dtype.categories,
ordered=dtype.ordered)
elif is_extension_array_dtype(dtype):
# create an extension array from its dtype
array_type = dtype.construct_array_type()._from_sequence
subarr = array_type(arr, dtype=dtype, copy=copy)
elif dtype is not None and raise_cast_failure:
raise
else:
subarr = np.array(arr, dtype=object, copy=copy)
return subarr

# GH#846
if isinstance(data, (np.ndarray, Index, ABCSeries)):

Expand All @@ -587,11 +550,12 @@ def _try_cast(arr, take_fast_path):
# possibility of nan -> garbage
if is_float_dtype(data.dtype) and is_integer_dtype(dtype):
if not isna(data).any():
subarr = _try_cast(data, True)
subarr = _try_cast(data, True, dtype, copy,
raise_cast_failure)
elif copy:
subarr = data.copy()
else:
subarr = _try_cast(data, True)
subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)
elif isinstance(data, Index):
# don't coerce Index types
# e.g. indexes can have different conversions (so don't fast path
Expand All @@ -601,7 +565,7 @@ def _try_cast(arr, take_fast_path):
else:

# we will try to copy be-definition here
subarr = _try_cast(data, True)
subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)

elif isinstance(data, ExtensionArray):
subarr = data
Expand All @@ -616,7 +580,8 @@ def _try_cast(arr, take_fast_path):
elif isinstance(data, (list, tuple)) and len(data) > 0:
if dtype is not None:
try:
subarr = _try_cast(data, False)
subarr = _try_cast(data, False, dtype, copy,
raise_cast_failure)
except Exception:
if raise_cast_failure: # pragma: no cover
raise
Expand All @@ -632,9 +597,9 @@ def _try_cast(arr, take_fast_path):
# GH#16804
start, stop, step = get_range_parameters(data)
arr = np.arange(start, stop, step, dtype='int64')
subarr = _try_cast(arr, False)
subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure)
else:
subarr = _try_cast(data, False)
subarr = _try_cast(data, False, dtype, copy, raise_cast_failure)

# scalar like, GH
if getattr(subarr, 'ndim', 0) == 0:
Expand Down Expand Up @@ -691,3 +656,43 @@ def _try_cast(arr, take_fast_path):
pass

return subarr


def _try_cast(arr, take_fast_path, dtype, copy, raise_cast_failure):
    """Coerce ``arr`` to ``dtype``, with controlled fallback on failure.

    Parameters
    ----------
    arr : list-like or ndarray
    take_fast_path : bool
        If True and ``arr`` is already castable, with no copy requested and
        no target dtype, return ``arr`` unchanged.
    dtype : dtype, ExtensionDtype, or None
        Requested dtype; None means keep the inferred dtype.
    copy : bool
        Whether to copy the data when constructing the result.
    raise_cast_failure : bool
        If True, re-raise a casting error when ``dtype`` was explicitly
        requested instead of falling back to an object-dtype array.

    Returns
    -------
    subarr : ndarray, Categorical, or ExtensionArray
    """

    # perf shortcut as this is the most common case
    if take_fast_path:
        if maybe_castable(arr) and not copy and dtype is None:
            return arr

    try:
        # GH#15832: Check if we are requesting a numeric dtype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            subarr = maybe_cast_to_integer_array(arr, dtype)

        # NOTE(review): the integer-cast result above is immediately
        # overwritten here; the call appears to serve only as a validity
        # check that raises on unsafe casts — confirm against GH#15832.
        subarr = maybe_cast_to_datetime(arr, dtype)
        # Take care in creating object arrays (but iterators are not
        # supported):
        if is_object_dtype(dtype) and (is_list_like(subarr) and
                                       not (is_iterator(subarr) or
                                            isinstance(subarr, np.ndarray))):
            subarr = construct_1d_object_array_from_listlike(subarr)
        elif not is_extension_type(subarr):
            subarr = construct_1d_ndarray_preserving_na(subarr, dtype,
                                                        copy=copy)
    except (ValueError, TypeError):
        if is_categorical_dtype(dtype):
            # We *do* allow casting to categorical, since we know
            # that Categorical is the only array type for 'category'.
            subarr = Categorical(arr, dtype.categories,
                                 ordered=dtype.ordered)
        elif is_extension_array_dtype(dtype):
            # create an extension array from its dtype
            array_type = dtype.construct_array_type()._from_sequence
            subarr = array_type(arr, dtype=dtype, copy=copy)
        elif dtype is not None and raise_cast_failure:
            # the caller explicitly requested this dtype; surface the error
            raise
        else:
            # best-effort fallback: keep the data as an object array
            subarr = np.array(arr, dtype=object, copy=copy)
    return subarr