13 changes: 9 additions & 4 deletions sdc/datatypes/common_functions.py
@@ -561,7 +561,7 @@ def _sdc_asarray(data):
pass


@sdc_overload(_sdc_asarray, jit_options={'parallel': True})
@sdc_overload(_sdc_asarray)
def _sdc_asarray_overload(data):

# TODO: extend with other types
@@ -673,14 +673,21 @@ def sdc_reindex_series(arr, index, name, by_index):
pass


@sdc_overload(sdc_reindex_series, jit_options={'parallel': True})
@sdc_overload(sdc_reindex_series)
def sdc_reindex_series_overload(arr, index, name, by_index):
""" Reindexes series data by new index following the logic of pandas.core.indexing.check_bool_indexer """

same_index_types = index is by_index
data_dtype, index_dtype = arr.dtype, index.dtype
data_is_str_arr = isinstance(arr.dtype, types.UnicodeType)

def sdc_reindex_series_impl(arr, index, name, by_index):

# if index types are the same, reindexing can be skipped when the indexes are the same object
if same_index_types == True: # noqa
if index is by_index:
return pandas.Series(data=arr, index=index, name=name)

if data_is_str_arr == True: # noqa
_res_data = [''] * len(by_index)
res_data_nan_mask = numpy.zeros(len(by_index), dtype=types.bool_)
@@ -722,5 +729,3 @@ def sdc_reindex_series_impl(arr, index, name, by_index):
return pandas.Series(data=res_data, index=by_index, name=name)

return sdc_reindex_series_impl

return None
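For orientation, a rough pure-pandas sketch of what the generated sdc_reindex_series_impl does after this change (a simplified illustration only: the real implementation builds the result arrays element by element and handles string data and NaN masks separately):

import pandas as pd

def reindex_series_sketch(arr, index, name, by_index):
    # fast path added in this change: the boolean indexer shares the very same
    # index object with the indexed series, so no realignment is needed
    if index is by_index:
        return pd.Series(data=arr, index=index, name=name)
    # otherwise align the indexer data onto the target index, following the
    # logic of pandas.core.indexing.check_bool_indexer
    return pd.Series(data=arr, index=index, name=name).reindex(by_index)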
13 changes: 8 additions & 5 deletions sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -29,7 +29,6 @@
| Also, it contains Numba internal operators which are required for DataFrame type handling
'''


import numba
import numpy
import operator
@@ -66,6 +65,7 @@
from sdc.datatypes.common_functions import _sdc_take, sdc_reindex_series
from sdc.utilities.prange_utils import parallel_chunks


@sdc_overload_attribute(DataFrameType, 'index')
def hpat_pandas_dataframe_index(df):
"""
Expand Down Expand Up @@ -105,6 +105,7 @@ def hpat_pandas_df_index_none_impl(df):

return hpat_pandas_df_index_none_impl
else:

def hpat_pandas_df_index_impl(df):
return df._index

@@ -404,7 +405,6 @@ def sdc_pandas_dataframe_append_impl(df, other, _func_name, ignore_index, indexes_comparable, args):

return sdc_pandas_dataframe_append_impl(df, other, _func_name, ignore_index, indexes_comparable, args)


# Example func_text for func_name='count' columns=('A', 'B'):
#
# def _df_count_impl(df, axis=0, level=None, numeric_only=False):
@@ -1534,9 +1534,9 @@ def df_getitem_bool_series_idx_main_codelines(self, idx):
else:
func_lines = [f' length = {df_length_expr(self)}',
f' self_index = self.index',
f' idx_reindexed = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)',
f' res_index = getitem_by_mask(self_index, idx_reindexed._data)',
f' selected_pos = getitem_by_mask(range(length), idx_reindexed._data)']
f' reindexed_idx = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)',
f' res_index = getitem_by_mask(self_index, reindexed_idx._data)',
f' selected_pos = getitem_by_mask(numpy.arange(length), reindexed_idx._data)']

results = []
for i, col in enumerate(self.columns):
@@ -1835,6 +1835,7 @@ def _df_getitem_str_literal_idx_impl(self, idx):
return _df_getitem_str_literal_idx_impl

if isinstance(idx, types.UnicodeType):

def _df_getitem_unicode_idx_impl(self, idx):
# http://numba.pydata.org/numba-doc/dev/developer/literal.html#specifying-for-literal-typing
# numba.literally raises a special exception to re-dispatch getitem with the literal idx value obtained from the unicode string
@@ -1886,6 +1887,7 @@ def sdc_pandas_dataframe_accessor_getitem(self, idx):
if isinstance(idx, types.Tuple) and isinstance(idx[1], types.Literal):
col = idx[1].literal_value
if -1 < col < len(self.dataframe.columns):

def df_getitem_iat_tuple_impl(self, idx):
row, _ = idx
if -1 < row < len(self._dataframe.index):
@@ -2335,6 +2337,7 @@ def df_set_column_overload(self, key, value):
return gen_df_replace_column_impl(self, key)

if isinstance(key, types.UnicodeType):

def _df_set_column_unicode_key_impl(self, key, value):
# http://numba.pydata.org/numba-doc/dev/developer/literal.html#specifying-for-literal-typing
# numba.literally raises a special exception to re-dispatch df._set_column with the literal value obtained from the unicode string
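The comments above refer to Numba's literal dispatch mechanism. A minimal standalone sketch of that pattern (hypothetical names, not SDC code), following the approach described on the linked Numba documentation page:

import numba
from numba import types
from numba.extending import overload

def by_name(x):
    pass

@overload(by_name)
def ov_by_name(x):
    if isinstance(x, types.StringLiteral):
        # the string value is known at compile time, so specialize on it
        n = len(x.literal_value)
        def impl(x):
            return n
        return impl
    if isinstance(x, types.UnicodeType):
        # not a literal yet: numba.literally raises a special exception that
        # makes the call be retyped with x treated as a literal string
        def impl(x):
            return numba.literally(x)
        return impl

@numba.njit
def use(name):
    return by_name(name)

print(use('abc'))  # prints 3: the overload was specialized on the literal 'abc'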
24 changes: 11 additions & 13 deletions sdc/datatypes/hpat_pandas_series_functions.py
@@ -420,29 +420,27 @@ def hpat_pandas_series_getitem_idx_list_impl(self, idx):
' Given: self.index={}, idx.index={}'
raise TypingError(msg.format(_func_name, self.index, idx.index))

def hpat_pandas_series_getitem_idx_bool_indexer_impl(self, idx):
def _series_getitem_idx_bool_indexer_impl(self, idx):

if none_indexes == True: # noqa
if len(self) > len(idx):
msg = "Unalignable boolean Series provided as indexer " + \
"(index of the boolean Series and of the indexed object do not match)."
raise IndexingError(msg)

return pandas.Series(
data=numpy_like.getitem_by_mask(self._data, idx._data),
index=numpy_like.getitem_by_mask(range(len(self)), idx._data),
name=self._name
)
self_index = range(len(self))
reindexed_idx = idx
else:
self_index = self.index
idx_reindexed = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)
return pandas.Series(
data=numpy_like.getitem_by_mask(self._data, idx_reindexed._data),
index=numpy_like.getitem_by_mask(self_index, idx_reindexed._data),
name=self._name
)
reindexed_idx = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)

return pandas.Series(
data=numpy_like.getitem_by_mask(self._data, reindexed_idx._data),
index=numpy_like.getitem_by_mask(self_index, reindexed_idx._data),
name=self._name
)

return hpat_pandas_series_getitem_idx_bool_indexer_impl
return _series_getitem_idx_bool_indexer_impl

# idx is a Series, its index is None and idx.dtype is not Boolean
if (isinstance(idx, SeriesType) and index_is_none
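For reference, plain pandas raises the same error when a boolean Series indexer cannot be aligned with a longer default index; a small illustration (the IndexingError class moved to pandas.errors only in recent pandas versions, hence the generic except):

import numpy as np
import pandas as pd

S = pd.Series(np.arange(5))            # default RangeIndex of length 5
mask = pd.Series([True, False, True])  # boolean indexer of length 3

try:
    S[mask]
except Exception as e:
    # "Unalignable boolean Series provided as indexer ..."
    print(type(e).__name__, e)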
11 changes: 11 additions & 0 deletions sdc/str_arr_ext.py
@@ -1543,3 +1543,14 @@ def _sdc_str_arr_operator_mul_impl(self, other):
return res_arr

return _sdc_str_arr_operator_mul_impl


@lower_builtin(operator.is_, StringArrayType, StringArrayType)
def sdc_str_arr_operator_is(context, builder, sig, args):
Review comment (Collaborator): Looks good.
We would need to implement such a method for each new series type (e.g. categorical, datetime, etc.)


# meminfo ptr uniquely identifies each StringArray allocation
a = context.make_helper(builder, string_array_type, args[0])
b = context.make_helper(builder, string_array_type, args[1])
ma = builder.ptrtoint(a.meminfo, cgutils.intp_t)
mb = builder.ptrtoint(b.meminfo, cgutils.intp_t)
return builder.icmp_signed('==', ma, mb)
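The lowering above compares meminfo pointers, i.e. it gives operator.is_ identity-of-allocation semantics for string arrays rather than elementwise comparison, which is exactly what the reindex fast path checks. A plain-pandas analogue of the situation it targets (illustrative only; the jitted behavior is what the new test below verifies):

import numpy as np
import pandas as pd

S = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
mask = pd.Series(S.values > 2, index=S.index)  # indexer built from the same index object

print(mask.index is S.index)  # True: same allocation, so reindexing can be skipped
print(S[mask].tolist())       # [3, 4]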
25 changes: 25 additions & 0 deletions sdc/tests/test_series.py
@@ -6984,6 +6984,31 @@ def test_impl(A, idx):
sdc_exception = context.exception
self.assertIn(str(sdc_exception), str(pandas_exception))

@skip_sdc_jit('Not implemented in old-pipeline')
def test_series_getitem_idx_bool_series3(self):
""" Verifies Series.getitem by mask indicated by a Boolean Series with the same object as index """
def test_impl(A, mask, index):
S = pd.Series(A, index)
idx = pd.Series(mask, S.index)
return S[idx]
hpat_func = self.jit(test_impl)

n = 11
np.random.seed(0)

idxs_to_test = [
np.arange(n),
np.arange(n, dtype='float'),
gen_strlist(n, 2, 'abcd123 ')
]
series_data = np.arange(n)
mask = np.random.choice([True, False], n)
for index in idxs_to_test:
with self.subTest(series_index=index):
result = hpat_func(series_data, mask, index)
result_ref = test_impl(series_data, mask, index)
pd.testing.assert_series_equal(result, result_ref)

@skip_sdc_jit('Not implemented in old-pipeline')
def test_series_getitem_idx_bool_series_reindex(self):
""" Verifies Series.getitem with reindexing by mask indicated by a Boolean Series
34 changes: 27 additions & 7 deletions sdc/tests/tests_perf/data_generator.py
@@ -28,9 +28,12 @@ def gen_series_fixed_str(data_num, data_length, input_data, data_width):
return results


def gen_arr_from_input(input_data, data_length, random=True):
def gen_arr_from_input(data_length, input_data, random=True, repeat=True, seed=None):
if seed is not None:
np.random.seed(seed)

if random:
return np.random.choice(input_data, data_length)
return np.random.choice(input_data, data_length, replace=repeat)
else:
return np.asarray(multiply_oneds_data(input_data, data_length))

@@ -50,7 +53,7 @@ def gen_arr_of_dtype(data_length, dtype='float', random=True, limits=None, nuniq

# prefer generation based on input data if it's provided
if input_data is not None:
return gen_arr_from_input(input_data, data_length, random=random)
return gen_arr_from_input(data_length, input_data, random=random)

if dtype == 'float':
return np.random.ranf(data_length)
@@ -67,6 +70,21 @@
return None


def gen_unique_values(data_length, dtype='int', seed=None):
"""
data_length: result length of array of unique values,
dtype: dtype of generated array,
seed: seed to initialize random state
"""

if dtype in ('float', 'int'):
values = np.arange(data_length, dtype=dtype)
if dtype == 'str':
values = gen_strlist(data_length)

return gen_arr_from_input(data_length, values, repeat=False, seed=seed)


def gen_series(data_length, dtype='float', random=True, limits=None, nunique=1000, input_data=None, seed=None):
"""
data_length: result series length,
@@ -82,7 +100,7 @@ def gen_series(data_length, dtype='float', random=True, limits=None, nunique=100

# prefer generation based on input data if it's provided
if input_data is not None:
series_data = gen_arr_from_input(input_data, data_length, random=random)
series_data = gen_arr_from_input(data_length, input_data, random=random)
else:
series_data = gen_arr_of_dtype(data_length, dtype=dtype, limits=limits, nunique=nunique)

@@ -98,13 +116,15 @@ def gen_df(data_length,
limits=None,
nunique=1000,
input_data=None,
index_gen=None,
seed=None):
"""
data_length: result series length,
dtype: dtype of generated series,
limits: a tuple of (min, max) limits for numeric series,
nunique: number of unique values in generated series,
input_data: 2D sequence of values used for generation of dataframe columns,
index_gen: callable that will generate index of needed size,
seed: seed to initialize random state
"""

@@ -116,10 +136,10 @@
for i in range(columns):
# prefer generation based on input data if it's provided
if (input_data is not None and i < len(input_data)):
col_data = gen_arr_from_input(input_data[i], data_length, random=random)
col_data = gen_arr_from_input(data_length, input_data[i], random=random)
else:
col_data = gen_arr_of_dtype(data_length, dtype=dtype, limits=limits, nunique=nunique)
all_data.append(col_data)

# TODO: support index generation
return pd.DataFrame(dict(zip(col_names, all_data)))
index_data = index_gen(data_length) if index_gen is not None else None
return pd.DataFrame(dict(zip(col_names, all_data)), index=index_data)
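A short usage sketch of the extended generator API (argument values are illustrative; everything not shown in this diff, such as the remaining gen_df defaults, is assumed):

from sdc.tests.tests_perf.data_generator import (
    gen_arr_from_input, gen_df, gen_unique_values)

# reproducible array of unique values, e.g. for an index without duplicates
index = gen_unique_values(10, dtype='int', seed=0)

# a DataFrame whose index is produced by a generator callable of matching size
df = gen_df(10 ** 5, index_gen=lambda n: gen_unique_values(n, dtype='int', seed=0))

# sampling from given input data without repetition
values = gen_arr_from_input(5, list(range(100)), random=True, repeat=False, seed=0)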
4 changes: 3 additions & 1 deletion sdc/tests/tests_perf/test_perf_df.py
@@ -36,7 +36,7 @@
from sdc.tests.test_utils import test_global_input_data_float64
from .generator import generate_test_cases
from .generator import TestCase as TC
from .data_generator import gen_df, gen_series, gen_arr_of_dtype
from .data_generator import gen_df, gen_series, gen_arr_of_dtype, gen_unique_values


# python -m sdc.runtests sdc.tests.tests_perf.test_perf_df.TestDataFrameMethods.test_df_{method_name}
@@ -88,6 +88,8 @@ def _test_case(self, pyfunc, name, total_data_length, input_data=None, data_num=
TC(name='getitem_idx_bool_array', size=[10 ** 7], call_expr='df[idx]', usecase_params='df, idx',
data_gens=(gen_df, partial(gen_arr_of_dtype, dtype='bool', random=False)),
input_data=[None, [True, False, False, True, False, True]]),
TC(name='getitem_filter_by_value', size=[10 ** 7], call_expr='df[df.A > 0]', usecase_params='df',
data_gens=(partial(gen_df, index_gen=gen_unique_values), )),
]

generate_test_cases(cases, TestDataFrameMethods, 'df')