-
Notifications
You must be signed in to change notification settings - Fork 653
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix loc len #45
Fix loc len #45
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,7 @@ | |
from warnings import warn | ||
|
||
from .utils import (_get_nan_block_id, extractor, | ||
_mask_block_partitions, writer, _blocks_to_series) | ||
writer, _blocks_to_series) | ||
from .index_metadata import _IndexMetadata | ||
from .dataframe import DataFrame | ||
|
||
|
@@ -67,6 +67,15 @@ def is_integer_slice(x): | |
INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types. | ||
""" | ||
|
||
_SETTING_WITHOUT_COPYING_WARING = """ | ||
SettingWithCopyWarning: | ||
A value is trying to be set on a copy of a slice from a DataFrame | ||
|
||
See the caveats in the documentation: | ||
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy | ||
self._setitem_with_indexer(indexer, value) | ||
""" | ||
|
||
|
||
def _parse_tuple(tup): | ||
"""Unpack the user input for getitem and setitem and compute ndim | ||
|
@@ -137,10 +146,7 @@ def __init__(self, ray_df): | |
self.row_coord_df = ray_df._row_metadata._coord_df | ||
self.block_oids = ray_df._block_partitions | ||
|
||
self.is_view = False | ||
if isinstance(ray_df, DataFrameView): | ||
self.block_oids = ray_df._block_partitions_data | ||
self.is_view = True | ||
self.is_view = self.df._is_view | ||
|
||
def __getitem__(self, row_lookup, col_lookup, ndim): | ||
""" | ||
|
@@ -200,19 +206,24 @@ def _generate_view(self, row_lookup, col_lookup): | |
for i in col_lookup["partition"]: | ||
col_lengths[i] += 1 | ||
|
||
row_lengths_oid = ray.put(np.array(row_lengths)) | ||
col_lengths_oid = ray.put(np.array(col_lengths)) | ||
|
||
row_metadata_view = _IndexMetadata( | ||
coord_df_oid=row_lookup, lengths_oid=row_lengths) | ||
coord_df_oid=row_lookup, lengths_oid=row_lengths_oid) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we modify the metadata class to take in an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it should be just oid here because the keyword arg is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we file an issue or mark a todo in the code in that case? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep. I think @devin-petersohn will soon write up a design doc for the new index (using two B+Tree) that will basically re-write There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, refactoring index is WIP. Exact datastructure is still an open discussion 😉 |
||
|
||
col_metadata_view = _IndexMetadata( | ||
coord_df_oid=col_lookup, lengths_oid=col_lengths) | ||
coord_df_oid=col_lookup, lengths_oid=col_lengths_oid) | ||
|
||
df_view = DataFrameView( | ||
df_view = DataFrame( | ||
block_partitions=self.block_oids, | ||
row_metadata=row_metadata_view, | ||
col_metadata=col_metadata_view, | ||
index=row_metadata_view.index, | ||
columns=col_metadata_view.index) | ||
|
||
df_view._is_view = True | ||
|
||
return df_view | ||
|
||
def __setitem__(self, row_lookup, col_lookup, item): | ||
|
@@ -273,15 +284,14 @@ def _write_items(self, row_lookup, col_lookup, item): | |
result_oid = writer.remote(block_oid, row_idx, col_idx, | ||
item_to_write) | ||
|
||
if self.is_view: | ||
self.df._block_partitions_data[row_blk, | ||
col_blk] = result_oid | ||
else: | ||
self.df._block_partitions[row_blk, col_blk] = result_oid | ||
self.df._block_partitions[row_blk, col_blk] = result_oid | ||
|
||
col_item_index += col_len | ||
row_item_index += row_len | ||
|
||
if self.is_view: | ||
warn(_SETTING_WITHOUT_COPYING_WARING) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can this warning be imported from pandas? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sadly, no... Pandas uses inline warning text: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ahh weird |
||
|
||
|
||
class _Loc_Indexer(_Location_Indexer_Base): | ||
"""A indexer for ray_df.loc[] functionality""" | ||
|
@@ -365,8 +375,11 @@ def _enlarge_axis(self, locator, axis): | |
|
||
lens = major_meta._lengths | ||
lens = np.concatenate([lens, np.array([num_nan_labels])]) | ||
lens_oid = ray.put(np.array(lens)) | ||
|
||
metadata_view = _IndexMetadata(coord_df_oid=coord_df, lengths_oid=lens) | ||
metadata_view = _IndexMetadata( | ||
coord_df_oid=coord_df, | ||
lengths_oid=lens_oid) | ||
return metadata_view | ||
|
||
def _compute_enlarge_labels(self, locator, base_index): | ||
|
@@ -448,27 +461,3 @@ def _check_dtypes(self, locator): | |
|
||
if not any([is_int, is_int_slice, is_int_list, is_bool_arr]): | ||
raise ValueError(_ILOC_INT_ONLY_ERROR) | ||
|
||
|
||
class DataFrameView(DataFrame): | ||
"""A subclass of DataFrame where the index can be smaller than blocks. | ||
""" | ||
|
||
def __init__(self, block_partitions, row_metadata, col_metadata, index, | ||
columns): | ||
self._block_partitions = block_partitions | ||
self._row_metadata = row_metadata | ||
self._col_metadata = col_metadata | ||
self.index = index | ||
self.columns = columns | ||
|
||
def _get_block_partitions(self): | ||
oid_arr = _mask_block_partitions(self._block_partitions_data, | ||
self._row_metadata, | ||
self._col_metadata) | ||
return oid_arr | ||
|
||
def _set_block_partitions(self, new_block_partitions): | ||
self._block_partitions_data = new_block_partitions | ||
|
||
_block_partitions = property(_get_block_partitions, _set_block_partitions) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this check be done in
_fix_blocks_dimensions
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes!