Fix loc len #45

Closed · wants to merge 5 commits
7 changes: 6 additions & 1 deletion modin/pandas/dataframe.py
@@ -88,6 +88,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
 
         self._dtypes_cache = dtypes_cache
 
+        self._is_view = False
+
         # Check type of data and use appropriate constructor
         if data is not None or (col_partitions is None and
                                 row_partitions is None and
@@ -123,7 +125,10 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
         if block_partitions is not None:
             axis = 0
             # put in numpy array here to make accesses easier since it's 2D
-            self._block_partitions = np.array(block_partitions)
+            if not isinstance(block_partitions, np.ndarray):
Collaborator: Can this check be done in _fix_blocks_dimensions?

Collaborator (author): Yes!

+                self._block_partitions = np.array(block_partitions)
+            else:
+                self._block_partitions = block_partitions
             self._block_partitions = \
                 _fix_blocks_dimensions(self._block_partitions, axis)
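
Picking up the thread above: a minimal sketch of how the check could fold into _fix_blocks_dimensions, assuming the helper's only job is to coerce the blocks to a 2D ndarray (the real helper lives in modin/pandas/utils.py; the ndim handling below is a guess):

```python
import numpy as np

def _fix_blocks_dimensions(blocks, axis):
    """Coerce block partitions to a 2D np.ndarray of object IDs."""
    # Accept plain Python lists as well as prebuilt arrays, so the
    # DataFrame constructor no longer needs its own isinstance check.
    if not isinstance(blocks, np.ndarray):
        blocks = np.array(blocks)
    # Promote a flat row/column of blocks to 2D so that downstream
    # code can always index with blocks[i, j].
    if blocks.ndim == 1:
        blocks = np.expand_dims(blocks, axis=axis)
    return blocks
```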

65 changes: 27 additions & 38 deletions modin/pandas/indexing.py
@@ -12,7 +12,7 @@
 from warnings import warn
 
 from .utils import (_get_nan_block_id, extractor,
-                    _mask_block_partitions, writer, _blocks_to_series)
+                    writer, _blocks_to_series)
 from .index_metadata import _IndexMetadata
 from .dataframe import DataFrame

@@ -67,6 +67,15 @@ def is_integer_slice(x):
 INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types.
 """
 
+_SETTING_WITHOUT_COPYING_WARING = """
+SettingWithCopyWarning:
+A value is trying to be set on a copy of a slice from a DataFrame
+
+See the caveats in the documentation:
+http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
+self._setitem_with_indexer(indexer, value)
+"""
+
 
 def _parse_tuple(tup):
     """Unpack the user input for getitem and setitem and compute ndim
@@ -137,10 +146,7 @@ def __init__(self, ray_df):
         self.row_coord_df = ray_df._row_metadata._coord_df
         self.block_oids = ray_df._block_partitions
 
-        self.is_view = False
-        if isinstance(ray_df, DataFrameView):
-            self.block_oids = ray_df._block_partitions_data
-            self.is_view = True
+        self.is_view = self.df._is_view
 
     def __getitem__(self, row_lookup, col_lookup, ndim):
         """
@@ -200,19 +206,24 @@ def _generate_view(self, row_lookup, col_lookup):
         for i in col_lookup["partition"]:
             col_lengths[i] += 1
 
+        row_lengths_oid = ray.put(np.array(row_lengths))
+        col_lengths_oid = ray.put(np.array(col_lengths))
+
         row_metadata_view = _IndexMetadata(
-            coord_df_oid=row_lookup, lengths_oid=row_lengths)
+            coord_df_oid=row_lookup, lengths_oid=row_lengths_oid)
Collaborator: Can we modify the metadata class to take in an np.array here, as well as an OID, instead of using ray.put?

Collaborator (author): I think it should be just an OID here, because the keyword arg is _oid. When we refactor _IndexMetadata (which will be another PR, coming soon), we can make it take an np.array. I don't want this to be a refactoring PR.

Collaborator: Can we file an issue or mark a TODO in the code in that case?

Collaborator (author): Yep. I think @devin-petersohn will soon write up a design doc for the new index (using two B+Trees) that will basically rewrite _IndexMetadata.

pschafhalter (Jul 19, 2018): Yes, refactoring the index is WIP. The exact data structure is still an open discussion 😉
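
A minimal sketch of the suggestion under discussion, assuming a stripped-down _IndexMetadata that carries only the two fields used at these call sites (the real class holds more state): normalizing inside the constructor would let callers pass either a raw np.ndarray or an object ID.

```python
import numpy as np
import ray  # assumes ray.init() has been called elsewhere

class _IndexMetadata(object):
    """Sketch: a coordinate DataFrame OID plus per-partition lengths."""

    def __init__(self, coord_df_oid=None, lengths_oid=None):
        # If the caller hands us a raw array, put it into the object
        # store once here, instead of requiring ray.put at every call
        # site in indexing.py.
        if isinstance(lengths_oid, np.ndarray):
            lengths_oid = ray.put(lengths_oid)
        self._coord_df = coord_df_oid
        self._lengths = lengths_oid
```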


         col_metadata_view = _IndexMetadata(
-            coord_df_oid=col_lookup, lengths_oid=col_lengths)
+            coord_df_oid=col_lookup, lengths_oid=col_lengths_oid)
 
-        df_view = DataFrameView(
+        df_view = DataFrame(
             block_partitions=self.block_oids,
             row_metadata=row_metadata_view,
             col_metadata=col_metadata_view,
             index=row_metadata_view.index,
             columns=col_metadata_view.index)
 
+        df_view._is_view = True
+
         return df_view
 
     def __setitem__(self, row_lookup, col_lookup, item):
@@ -273,15 +284,14 @@ def _write_items(self, row_lookup, col_lookup, item):
                 result_oid = writer.remote(block_oid, row_idx, col_idx,
                                            item_to_write)
 
-                if self.is_view:
-                    self.df._block_partitions_data[row_blk,
-                                                   col_blk] = result_oid
-                else:
-                    self.df._block_partitions[row_blk, col_blk] = result_oid
+                self.df._block_partitions[row_blk, col_blk] = result_oid
 
                 col_item_index += col_len
             row_item_index += row_len
 
+        if self.is_view:
+            warn(_SETTING_WITHOUT_COPYING_WARING)
Collaborator: Can this warning be imported from pandas?

Collaborator (author): [image attachment]

Collaborator: Ahh weird
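
For reference, a hedged sketch of what reusing pandas' warning category might look like; the import paths are assumptions (newer pandas exposes SettingWithCopyWarning in pandas.errors, while releases from this era kept it in pandas.core.common), and _warn_on_view_write is a hypothetical helper, not part of this PR:

```python
from warnings import warn

try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:  # older pandas releases
    from pandas.core.common import SettingWithCopyWarning

def _warn_on_view_write():
    # Emitting the same category pandas uses lets user-level filters
    # (warnings.simplefilter, pytest's filterwarnings, ...) treat Modin
    # and pandas writes-to-views uniformly.
    warn("A value is trying to be set on a copy of a slice from a "
         "DataFrame", SettingWithCopyWarning)
```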



 class _Loc_Indexer(_Location_Indexer_Base):
     """A indexer for ray_df.loc[] functionality"""
@@ -365,8 +375,11 @@ def _enlarge_axis(self, locator, axis):
 
         lens = major_meta._lengths
         lens = np.concatenate([lens, np.array([num_nan_labels])])
+        lens_oid = ray.put(np.array(lens))
 
-        metadata_view = _IndexMetadata(coord_df_oid=coord_df, lengths_oid=lens)
+        metadata_view = _IndexMetadata(
+            coord_df_oid=coord_df,
+            lengths_oid=lens_oid)
         return metadata_view
 
     def _compute_enlarge_labels(self, locator, base_index):
@@ -448,27 +461,3 @@ def _check_dtypes(self, locator):
 
         if not any([is_int, is_int_slice, is_int_list, is_bool_arr]):
             raise ValueError(_ILOC_INT_ONLY_ERROR)
-
-
-class DataFrameView(DataFrame):
-    """A subclass of DataFrame where the index can be smaller than blocks.
-    """
-
-    def __init__(self, block_partitions, row_metadata, col_metadata, index,
-                 columns):
-        self._block_partitions = block_partitions
-        self._row_metadata = row_metadata
-        self._col_metadata = col_metadata
-        self.index = index
-        self.columns = columns
-
-    def _get_block_partitions(self):
-        oid_arr = _mask_block_partitions(self._block_partitions_data,
-                                         self._row_metadata,
-                                         self._col_metadata)
-        return oid_arr
-
-    def _set_block_partitions(self, new_block_partitions):
-        self._block_partitions_data = new_block_partitions
-
-    _block_partitions = property(_get_block_partitions, _set_block_partitions)
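
Since the diff replaces the DataFrameView subclass with a plain flag, here is a hypothetical end-to-end sketch of the intended behavior, inferred from the diff rather than taken from the PR's tests (the slicing semantics are assumptions):

```python
import modin.pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# A 2D loc lookup routes through _generate_view, which now returns a
# plain DataFrame whose _is_view attribute is set to True.
view = df.loc[0:1, ["a", "b"]]

# _write_items writes into the shared block partitions as usual, then
# warns once because the target was flagged as a view.
view.loc[0, "a"] = 100
```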
28 changes: 0 additions & 28 deletions modin/pandas/utils.py
@@ -254,34 +254,6 @@ def writer(df_chunk, row_loc, col_loc, item):
     return df_chunk
 
 
-def _mask_block_partitions(blk_partitions, row_metadata, col_metadata):
-    """Return the squeezed/expanded block partitions as defined by
-    row_metadata and col_metadata.
-
-    Note:
-        Very naive implementation. Extract one scaler at a time in a double
-        for loop.
-    """
-    col_df = col_metadata._coord_df
-    row_df = row_metadata._coord_df
-
-    result_oids = []
-    shape = (len(row_df.index), len(col_df.index))
-
-    for _, row_partition_data in row_df.iterrows():
-        for _, col_partition_data in col_df.iterrows():
-            row_part = row_partition_data.partition
-            col_part = col_partition_data.partition
-            block_oid = blk_partitions[row_part, col_part]
-
-            row_idx = row_partition_data['index_within_partition']
-            col_idx = col_partition_data['index_within_partition']
-
-            result_oid = extractor.remote(block_oid, [row_idx], [col_idx])
-            result_oids.append(result_oid)
-    return np.array(result_oids).reshape(shape)
-
-
 @ray.remote
 def _deploy_func(func, dataframe, *args):
     """Deploys a function for the _map_partitions call.