-
Notifications
You must be signed in to change notification settings - Fork 3
read_csv changes and improvements in performance #10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f46892a
050effc
b7e7686
13ccd3e
8b45198
8b1a07d
b013cae
813c1d2
942d72b
0cbd230
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |
| from __future__ import print_function | ||
|
|
||
| import numpy as np | ||
| import ray | ||
| import pandas | ||
|
|
||
| from .remote_partition import RayRemotePartition | ||
|
|
@@ -99,7 +100,7 @@ def block_lengths(self): | |
| # The first column will have the correct lengths. We have an | ||
| # invariant that requires that all blocks be the same length in a | ||
| # row of blocks. | ||
| self._lengths_cache = [obj.length for obj in self.partitions.T[0]] | ||
| self._lengths_cache = [obj.length().get() for obj in self.partitions.T[0]] | ||
| return self._lengths_cache | ||
|
|
||
| # Widths of the blocks | ||
|
|
@@ -116,7 +117,7 @@ def block_widths(self): | |
| # The first column will have the correct lengths. We have an | ||
| # invariant that requires that all blocks be the same width in a | ||
| # column of blocks. | ||
| self._widths_cache = [obj.width for obj in self.partitions[0]] | ||
| self._widths_cache = [obj.width().get() for obj in self.partitions[0]] | ||
| return self._widths_cache | ||
|
|
||
| def full_reduce(self, map_func, reduce_func, axis): | ||
|
|
@@ -672,6 +673,9 @@ def __getitem__(self, key): | |
| cls = type(self) | ||
| return cls(self.partitions[key]) | ||
|
|
||
| def __len__(self): | ||
| return sum(self.block_lengths) | ||
|
|
||
|
|
||
| class RayBlockPartitions(BlockPartitions): | ||
| """This method implements the interface in `BlockPartitions`.""" | ||
|
|
@@ -682,6 +686,43 @@ class RayBlockPartitions(BlockPartitions): | |
| def __init__(self, partitions): | ||
| self.partitions = partitions | ||
|
|
||
| # We override these for performance reasons. | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this the critical change that made it work? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If that’s the case, can we drop the block partitions implementation altogether?
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, we were doing a […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we cache the remote function?
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We actually cache the lengths themselves with this structure. We also cache the object IDs from the lengths right now so we can do a […] |
||
| # Lengths of the blocks | ||
| _lengths_cache = None | ||
|
|
||
| # These are set up as properties so that we only use them when we need | ||
| # them. We also do not want to trigger this computation on object creation. | ||
| @property | ||
| def block_lengths(self): | ||
| """Gets the lengths of the blocks. | ||
|
|
||
| Note: This works with the property structure `_lengths_cache` to avoid | ||
| having to recompute these values each time they are needed. | ||
| """ | ||
| if self._lengths_cache is None: | ||
| # The first column will have the correct lengths. We have an | ||
| # invariant that requires that all blocks be the same length in a | ||
| # row of blocks. | ||
| self._lengths_cache = ray.get([obj.length().oid for obj in self.partitions.T[0]]) | ||
| return self._lengths_cache | ||
|
|
||
| # Widths of the blocks | ||
| _widths_cache = None | ||
|
|
||
| @property | ||
| def block_widths(self): | ||
| """Gets the widths of the blocks. | ||
|
|
||
| Note: This works with the property structure `_widths_cache` to avoid | ||
| having to recompute these values each time they are needed. | ||
| """ | ||
| if self._widths_cache is None: | ||
| # The first column will have the correct lengths. We have an | ||
| # invariant that requires that all blocks be the same width in a | ||
| # column of blocks. | ||
| self._widths_cache = ray.get([obj.width().oid for obj in self.partitions[0]]) | ||
| return self._widths_cache | ||
|
|
||
| @property | ||
| def column_partitions(self): | ||
| """A list of `RayColumnPartition` objects.""" | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe `width` and `length` should be functions instead of attributes?