Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 8b584c4

Browse files
committed
Fixes incorrect definition of layout for SeriesType
Details: the underlying data type of a Series was derived from the PyObject dtype only and did not take the layout of the original array into account. As a result, 'C' layout was always inferred, even when the original array had a different layout, which broke iteration over such Series (DF columns). Fixes #996.
1 parent 1ebf55c commit 8b584c4

File tree

3 files changed

+95
-15
lines changed

3 files changed

+95
-15
lines changed

sdc/hiframes/boxing.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@
4646
from sdc.datatypes.categorical.types import CategoricalDtypeType, Categorical
4747
from sdc.datatypes.categorical.boxing import unbox_Categorical, box_Categorical
4848
from sdc.hiframes.pd_series_ext import SeriesType
49-
from sdc.hiframes.pd_series_type import _get_series_array_type
5049
from sdc.hiframes.pd_dataframe_ext import get_structure_maps
5150
from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types
5251

@@ -70,7 +69,7 @@ def typeof_pd_dataframe(val, c):
7069

7170
col_names = tuple(val.columns.tolist())
7271
# TODO: support other types like string and timestamp
73-
col_types = get_hiframes_dtypes(val)
72+
col_types = _infer_df_col_types(val)
7473
index_type = _infer_index_type(val.index)
7574
column_loc, _, _ = get_structure_maps(col_types, col_names)
7675

@@ -82,8 +81,24 @@ def typeof_pd_dataframe(val, c):
8281
def typeof_pd_series(val, c):
8382
index_type = _infer_index_type(val.index)
8483
is_named = val.name is not None
84+
85+
# attempt to define numba Series data type via Series values,
86+
# if not successful, define it later via dtype in SeriesType init
87+
if isinstance(val.values, np.ndarray):
88+
try:
89+
underlying_type = numba.typeof(val.values)
90+
except ValueError:
91+
pass
92+
93+
if not (isinstance(underlying_type, types.Array)
94+
and not isinstance(underlying_type.dtype, types.PyObject)):
95+
underlying_type = None
96+
8597
return SeriesType(
86-
_infer_series_dtype(val), index=index_type, is_named=is_named)
98+
dtype=_infer_series_dtype(val),
99+
data=underlying_type,
100+
index=index_type,
101+
is_named=is_named)
87102

88103

89104
@unbox(DataFrameType)
@@ -140,13 +155,13 @@ def unbox_dataframe(typ, val, c):
140155
return NativeValue(dataframe._getvalue(), is_error=c.builder.load(errorptr))
141156

142157

143-
def get_hiframes_dtypes(df):
144-
"""get hiframe data types for a pandas dataframe
145-
"""
158+
def _infer_df_col_types(df):
159+
""" Infer column data types for a pandas DataFrame """
160+
146161
col_names = df.columns.tolist()
147-
hi_typs = [_get_series_array_type(_infer_series_dtype(df[cname]))
148-
for cname in col_names]
149-
return tuple(hi_typs)
162+
col_typs = [numba.typeof(df[cname]).data for cname in col_names]
163+
164+
return tuple(col_typs)
150165

151166

152167
def _infer_series_dtype(S):

sdc/tests/test_dataframe.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ def test_impl():
132132
self.assertEqual(hpat_func(), test_impl())
133133

134134
def test_create_with_series1(self):
135+
""" Create pandas DataFrame from Series of different dtypes """
135136
def test_impl(n):
136137
A = pd.Series(np.ones(n, dtype=np.int64))
137138
B = pd.Series(np.zeros(n, dtype=np.float64))
@@ -143,7 +144,7 @@ def test_impl(n):
143144
pd.testing.assert_frame_equal(hpat_func(n), test_impl(n))
144145

145146
def test_create_with_series2(self):
146-
# test creating dataframe from passed series
147+
""" Test creating pandas DataFrame from passed Series """
147148
def test_impl(A):
148149
df = pd.DataFrame({'A': A})
149150
return (df.A == 2).sum()
@@ -153,6 +154,18 @@ def test_impl(A):
153154
df = pd.DataFrame({'A': np.arange(n)})
154155
self.assertEqual(hpat_func(df.A), test_impl(df.A))
155156

157+
def test_create_with_series3(self):
158+
""" Test creating pandas DataFrame from Series of different layouts """
159+
def test_impl(A, B):
160+
df = pd.DataFrame({'A': A, 'B': B})
161+
return df.A.sum(), df.B.sum()
162+
sdc_func = self.jit(test_impl)
163+
164+
n = 11
165+
A = pd.Series(np.arange(n))
166+
B = pd.Series(np.arange(2 * n)[::2])
167+
self.assertEqual(sdc_func(A, B), test_impl(A, B))
168+
156169
def test_df_create_param_index_default(self):
157170
def test_impl():
158171
data = {'A': ['a', 'b'], 'B': [2, 3]}
@@ -219,6 +232,8 @@ def test_impl():
219232
pd.testing.assert_frame_equal(hpat_func(), test_impl())
220233

221234
def test_pass_df1(self):
235+
""" Test passing df with contiguous data layout """
236+
222237
def test_impl(df):
223238
return (df.A == 2).sum()
224239
hpat_func = self.jit(test_impl)
@@ -227,6 +242,18 @@ def test_impl(df):
227242
df = pd.DataFrame({'A': np.arange(n)})
228243
self.assertEqual(hpat_func(df), test_impl(df))
229244

245+
def test_pass_df_2(self):
246+
""" Test passing df with non-contiguous data layout """
247+
248+
def test_impl(df):
249+
return df.B.sum()
250+
sdc_func = self.jit(test_impl)
251+
252+
n_rows, n_cols = 4, 6
253+
col_names = list(string.ascii_uppercase[:n_cols])
254+
df = pd.DataFrame(np.random.rand(n_rows, n_cols), columns=col_names)
255+
self.assertAlmostEqual(sdc_func(df), test_impl(df))
256+
230257
def test_pass_df_str(self):
231258
def test_impl(df):
232259
return (df.A == 'a').sum()

sdc/tests/test_series.py

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
gen_strlist,
6464
_make_func_from_text)
6565
from sdc.utilities.sdc_typing_utils import SDCLimitation
66+
from sdc.hiframes.pd_series_type import SeriesType
6667

6768

6869
_cov_corr_series = [(pd.Series(x), pd.Series(y)) for x, y in [
@@ -339,25 +340,43 @@ def test_impl(name):
339340

340341
pd.testing.assert_series_equal(hpat_func('A'), test_impl('A'))
341342

342-
@skip_numba_jit
343+
def test_create_series_data_layouts(self):
344+
def test_impl(data):
345+
vals = pd.Series(data).values
346+
return vals[0], vals[-1]
347+
sdc_func = self.jit(test_impl)
348+
349+
n = 10
350+
arrays_to_test = [
351+
np.arange(n), # 'C' layout
352+
np.arange(2 * n)[::2], # 'A' layout
353+
# no 'F' layout for 1d arrays
354+
]
355+
356+
for data in arrays_to_test:
357+
with self.subTest(layout=numba.typeof(data).layout):
358+
result = sdc_func(data)
359+
result_ref = test_impl(data)
360+
self.assertEqual(result, result_ref)
361+
343362
def test_pass_series1(self):
344-
# TODO: check to make sure it is series type
345363
def test_impl(A):
346364
return (A == 2).sum()
347-
hpat_func = self.jit(test_impl)
365+
sdc_func = self.jit(test_impl)
348366

349367
n = 11
350368
S = pd.Series(np.arange(n), name='A')
351-
self.assertEqual(hpat_func(S), test_impl(S))
369+
self.assertEqual(sdc_func(S), test_impl(S))
370+
self.assertIsInstance(numba.typeof(S), SeriesType)
352371

353-
@skip_numba_jit
354372
def test_pass_series_str(self):
355373
def test_impl(A):
356374
return (A == 'a').sum()
357375
hpat_func = self.jit(test_impl)
358376

359377
S = pd.Series(['a', 'b', 'c'], name='A')
360378
self.assertEqual(hpat_func(S), test_impl(S))
379+
self.assertIsInstance(numba.typeof(S), SeriesType)
361380

362381
def test_pass_series_all_indexes(self):
363382
def test_impl(A):
@@ -378,6 +397,25 @@ def test_impl(A):
378397
S = pd.Series(np.arange(n), index, name='A')
379398
pd.testing.assert_series_equal(hpat_func(S), test_impl(S))
380399

400+
def test_pass_series_data_layouts(self):
401+
def test_impl(S):
402+
vals = S.values
403+
return vals[0], vals[-1]
404+
sdc_func = self.jit(test_impl)
405+
406+
n = 10
407+
series_to_test = [
408+
# no 'F' layout for Series
409+
pd.Series(np.arange(n)), # 'C' layout
410+
pd.Series(np.arange(n))[::2], # 'A' layout
411+
]
412+
413+
for s in series_to_test:
414+
with self.subTest(layout=numba.typeof(s).data.layout):
415+
result = sdc_func(s)
416+
result_ref = test_impl(s)
417+
self.assertEqual(result, result_ref)
418+
381419
def test_series_getattr_size(self):
382420
def test_impl(S):
383421
return S.size

0 commit comments

Comments
 (0)