Skip to content

Commit

Permalink
FEAT-modin-project#1598: Update iterator implementation to iloc (modin…
Browse files Browse the repository at this point in the history
…-project#1599)

Co-authored-by: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com>

Signed-off-by: Devin Petersohn <devin.petersohn@gmail.com>
  • Loading branch information
devin-petersohn authored and aregm committed Sep 16, 2020
1 parent 72b6ba1 commit 222b4a9
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 48 deletions.
29 changes: 9 additions & 20 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1180,14 +1180,11 @@ def iterrows(self):
Returns:
A generator that iterates over the rows of the frame.
"""
index_iter = iter(self.index)

def iterrow_builder(df):
df.columns = self.columns
df.index = [next(index_iter)]
return df.iterrows()
def iterrow_builder(s):
return s.name, s

partition_iterator = PartitionIterator(self._query_compiler, 0, iterrow_builder)
partition_iterator = PartitionIterator(self, 0, iterrow_builder)
for v in partition_iterator:
yield v

Expand All @@ -1202,14 +1199,11 @@ def items(self):
Returns:
A generator that iterates over the columns of the frame.
"""
col_iter = iter(self.columns)

def items_builder(df):
df.columns = [next(col_iter)]
df.index = self.index
return df.items()
def items_builder(s):
return s.name, s

partition_iterator = PartitionIterator(self._query_compiler, 1, items_builder)
partition_iterator = PartitionIterator(self, 1, items_builder)
for v in partition_iterator:
yield v

Expand Down Expand Up @@ -1240,16 +1234,11 @@ def itertuples(self, index=True, name="Pandas"):
Returns:
A tuple representing row data. See args for varying tuples.
"""
index_iter = iter(self.index)

def itertuples_builder(df):
df.columns = self.columns
df.index = [next(index_iter)]
return df.itertuples(index=index, name=name)
def itertuples_builder(s):
return next(s._to_pandas().to_frame().T.itertuples(index=index, name=name))

partition_iterator = PartitionIterator(
self._query_compiler, 0, itertuples_builder
)
partition_iterator = PartitionIterator(self, 0, itertuples_builder)
for v in partition_iterator:
yield v

Expand Down
23 changes: 13 additions & 10 deletions modin/pandas/iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,27 @@


class PartitionIterator(Iterator):
def __init__(self, query_compiler, axis, func):
def __init__(self, df, axis, func):
"""PartitionIterator class to define a generator on partitioned data
Args:
query_compiler: Data manager for the dataframe
df: The dataframe to iterate over
axis: axis to iterate over
func: The function to get inner iterables from
each partition
"""
self.query_compiler = query_compiler
self.df = df
self.axis = axis
self.index_iter = (
iter(self.query_compiler.columns)
zip(
iter(slice(None) for _ in range(len(self.df.columns))),
range(len(self.df.columns)),
)
if axis
else iter(range(len(self.query_compiler.index)))
else zip(
range(len(self.df.index)),
iter(slice(None) for _ in range(len(self.df.index))),
)
)
self.func = func

Expand All @@ -41,8 +47,5 @@ def __next__(self):

def next(self):
key = next(self.index_iter)
if self.axis:
df = self.query_compiler.getitem_column_array([key]).to_pandas()
else:
df = self.query_compiler.getitem_row_array([key]).to_pandas()
return next(self.func(df))
df = self.df.iloc[key]
return self.func(df)
11 changes: 3 additions & 8 deletions modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -934,15 +934,10 @@ def item(self):
return self[0]

def items(self):
index_iter = iter(self.index)
def item_builder(s):
return s.name, s.squeeze()

def item_builder(df):
s = df.iloc[:, 0]
s.index = [next(index_iter)]
s.name = self.name
return s.items()

partition_iterator = PartitionIterator(self._query_compiler, 0, item_builder)
partition_iterator = PartitionIterator(self.to_frame(), 0, item_builder)
for v in partition_iterator:
yield v

Expand Down
34 changes: 24 additions & 10 deletions modin/pandas/test/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5381,8 +5381,10 @@ def test_iterrows(self, data):
df_equals(pandas_series, modin_series)
assert pandas_index == modin_index

@pytest.mark.parametrize("name", [None, "NotPandas", "Pandas"])
@pytest.mark.parametrize("index", [True, False])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_itertuples(self, data):
def test_itertuples(self, name, index, data):
modin_df = pd.DataFrame(data)
pandas_df = pandas.DataFrame(data)

Expand All @@ -5392,16 +5394,28 @@ def test_itertuples(self, data):
for modin_row, pandas_row in zip(modin_it_default, pandas_it_default):
np.testing.assert_equal(modin_row, pandas_row)

# test all combinations of custom params
indices = [True, False]
names = [None, "NotPandas", "Pandas"]
modin_it_custom = modin_df.itertuples(index=index, name=name)
pandas_it_custom = pandas_df.itertuples(index=index, name=name)
for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom):
np.testing.assert_equal(modin_row, pandas_row)

for index in indices:
for name in names:
modin_it_custom = modin_df.itertuples(index=index, name=name)
pandas_it_custom = pandas_df.itertuples(index=index, name=name)
for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom):
np.testing.assert_equal(modin_row, pandas_row)
mi_index_modin = pd.MultiIndex.from_tuples(
[(i // 4, i // 2, i) for i in range(len(modin_df.columns))]
)
mi_index_pandas = pandas.MultiIndex.from_tuples(
[(i // 4, i // 2, i) for i in range(len(pandas_df.columns))]
)
modin_df.columns = mi_index_modin
pandas_df.columns = mi_index_pandas
modin_it_default = modin_df.itertuples()
pandas_it_default = pandas_df.itertuples()
for modin_row, pandas_row in zip(modin_it_default, pandas_it_default):
np.testing.assert_equal(modin_row, pandas_row)

modin_it_custom = modin_df.itertuples(index=index, name=name)
pandas_it_custom = pandas_df.itertuples(index=index, name=name)
for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom):
np.testing.assert_equal(modin_row, pandas_row)

@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___iter__(self, data):
Expand Down

0 comments on commit 222b4a9

Please sign in to comment.