Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Optimizations for cudf.concat when axis=1 #9333

Merged
merged 64 commits into from
Oct 19, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
64 commits
Select commit Hold shift + click to select a range
826cd6c
add tests
galipremsagar Sep 21, 2021
50b8850
multiindex union
galipremsagar Sep 21, 2021
61b56cc
merge
galipremsagar Sep 21, 2021
910e682
add number of index apis
galipremsagar Sep 29, 2021
d252aae
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Sep 29, 2021
35406f6
cleanup
galipremsagar Sep 29, 2021
160093d
cleanup
galipremsagar Sep 29, 2021
9d5f7df
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Sep 29, 2021
60daeaf
cover all tests for mulitIndex.union
galipremsagar Sep 29, 2021
a43de79
add MultiIndex.intersections tests
galipremsagar Sep 29, 2021
28c13ff
add Index.union tests
galipremsagar Sep 29, 2021
d4c1ebd
add index intersection tests
galipremsagar Sep 29, 2021
ea32e41
remove print
galipremsagar Sep 29, 2021
205d947
add union docstring
galipremsagar Sep 29, 2021
e6f0ea5
add intersection docs
galipremsagar Sep 29, 2021
a943842
add docstrings
galipremsagar Sep 30, 2021
abecd07
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Sep 30, 2021
fcf2664
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Sep 30, 2021
c702396
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Sep 30, 2021
838a34c
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Oct 1, 2021
aaab3a5
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Oct 4, 2021
3adab76
add caching to distinct_count
galipremsagar Oct 5, 2021
a56dbfa
fix union
galipremsagar Oct 5, 2021
f7d9a8f
reorganize
galipremsagar Oct 5, 2021
5d57597
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Oct 5, 2021
d897e1d
cleanup
galipremsagar Oct 5, 2021
a136bf1
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Oct 7, 2021
7e26d25
add sort validation
galipremsagar Oct 7, 2021
1ef2b02
Apply suggestions from code review
galipremsagar Oct 7, 2021
c44b099
use BaseIndex
galipremsagar Oct 7, 2021
bf399db
Merge branch '9223' of https://github.com/galipremsagar/cudf into 9223
galipremsagar Oct 7, 2021
8d46320
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Oct 7, 2021
53ef02b
cache end value
galipremsagar Oct 7, 2021
fcf5f6c
address reviews
galipremsagar Oct 8, 2021
dabb543
cleanup
galipremsagar Oct 8, 2021
346fe0e
simplify
galipremsagar Oct 8, 2021
29df86a
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Oct 8, 2021
7a5301a
make cache value a dict
galipremsagar Oct 8, 2021
eab7194
make is_* methods specific to classes
galipremsagar Oct 8, 2021
340d2dc
make _extended_gcd free function
galipremsagar Oct 8, 2021
ce53913
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Oct 8, 2021
6f389da
change Index call
galipremsagar Oct 8, 2021
04ac7fc
remove class name
galipremsagar Oct 11, 2021
838fe88
pass names in multiIndex construction itself
galipremsagar Oct 11, 2021
26564d3
remove comments
galipremsagar Oct 11, 2021
b4ea5a3
add todo
galipremsagar Oct 11, 2021
c20dfe6
add comment
galipremsagar Oct 11, 2021
fba6d78
add comment explaining materializations
galipremsagar Oct 11, 2021
ae25694
fix res_name
galipremsagar Oct 11, 2021
d595dc3
add comments
galipremsagar Oct 11, 2021
0873d39
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Oct 11, 2021
92a9a40
add todo
galipremsagar Oct 11, 2021
49cabf9
Apply suggestions from code review
galipremsagar Oct 12, 2021
1a75299
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Oct 12, 2021
5e1c7cc
style
galipremsagar Oct 12, 2021
0221386
remove paranthesis
galipremsagar Oct 12, 2021
897c25a
add todo
galipremsagar Oct 12, 2021
21f5a97
add more test coverage
galipremsagar Oct 12, 2021
975566d
refactor if/elif blocks
galipremsagar Oct 12, 2021
048ec0a
Update python/cudf/cudf/core/index.py
galipremsagar Oct 13, 2021
b783de0
address reviews
galipremsagar Oct 13, 2021
1365f36
merge
galipremsagar Oct 13, 2021
46d54fc
Merge remote-tracking branch 'upstream/branch-21.12' into 9223
galipremsagar Oct 13, 2021
47ce5d1
add tests for is_* methods
galipremsagar Oct 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add number of index apis
  • Loading branch information
galipremsagar committed Sep 29, 2021
commit 910e682c5ac831edee99a5b5c4dc204183db6aa7
71 changes: 71 additions & 0 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,55 @@ def set_names(self, names, level=None, inplace=False):

return self._set_names(names=names, inplace=inplace)

@property
def has_duplicates(self):
# Logical negation of ``is_unique``.
return not self.is_unique

def union(self, other, sort=None):
    """
    Form the union of this index with ``other``.

    Parameters
    ----------
    other : Index or array-like
        Anything that is not already a ``cudf.Index`` is wrapped in one,
        using this index's name.
    sort : optional, default None
        Forwarded unchanged to ``_union``.

    Returns
    -------
    Index
        ``self`` or ``other`` (renamed if needed) when the union is
        trivially one of the operands, otherwise the result of
        ``self._union`` carrying the resolved name.
    """
    if not isinstance(other, cudf.Index):
        other = cudf.Index(other, name=self.name)

    # NOTE(review): ``or`` also skips falsy-but-valid names such as 0 or
    # "" - confirm this is intended.
    res_name = self.name or other.name

    # Trivial cases: one operand already is the answer, so no set
    # operation is needed.
    if not len(other) or self.equals(other):
        passthrough = self
    elif not len(self):
        passthrough = other
    else:
        passthrough = None

    if passthrough is not None:
        if res_name != passthrough.name:
            return passthrough.rename(res_name)
        return passthrough

    result = self._union(other, sort=sort)
    result.name = res_name
    return result

def intersection(self, other, sort=False):
    """
    Form the intersection of this index with ``other``.

    Parameters
    ----------
    other : Index or array-like
        Anything that is not already a ``cudf.Index`` is wrapped in one,
        using this index's name.
    sort : optional, default False
        Forwarded unchanged to ``_intersection``.

    Returns
    -------
    Index
        The common elements, carrying the resolved name.
    """
    if not isinstance(other, cudf.Index):
        other = cudf.Index(other, name=self.name)

    # NOTE(review): ``or`` also skips falsy-but-valid names such as 0 or
    # "" - confirm this is intended.
    res_name = self.name or other.name

    if self.equals(other):
        # Identical operands: the answer is this index itself (with
        # duplicates dropped). Return here; previously the value computed
        # in this branch was overwritten by the general path below.
        if self.has_duplicates:
            result = self.unique()
        elif res_name != self.name:
            # Shallow copy so that renaming does not mutate ``self``.
            result = self.copy(deep=False)
        else:
            return self
        result.name = res_name
        return result

    if (self.is_boolean() and other.is_numeric()) or (
        self.is_numeric() and other.is_boolean()
    ):
        # Boolean and numeric indexes are treated as having nothing in
        # common, so return an empty result.
        if isinstance(self, cudf.MultiIndex):
            return self[:0].rename(res_name)
        return cudf.Index([], name=res_name)

    result = self._intersection(other, sort=sort)
    result.name = res_name
    return result

def fillna(self, value, downcast=None):
"""
Fill null values with the specified value.
Expand Down Expand Up @@ -520,6 +569,28 @@ def difference(self, other, sort=None):

return difference

def _union(self, other, sort=None):
    """
    Compute the union of two indexes through an outer merge.

    Both indexes are turned into single-column frames (key column ``0``)
    with an extra ``"order"`` column holding each row's position. After
    the outer merge the rows are sorted by every column that follows the
    key column, and the key column becomes the resulting index.

    The result is additionally sorted by value when ``sort`` is None and
    ``other`` is non-empty.
    """
    left = self.to_frame(index=False, name=0)
    right = other.to_frame(index=False, name=0)
    left["order"] = left.index
    right["order"] = right.index

    merged = left.merge(right, on=[0], how="outer")
    # Everything after the key column is a positional "order" column.
    order_columns = merged.columns[1:]
    merged = merged.sort_values(by=order_columns, ignore_index=True)
    union_result = cudf.Index(merged[0])

    wants_sorted = sort is None and len(other)
    return union_result.sort_values() if wants_sorted else union_result

def _intersection(self, other, sort=None):
    """
    Compute the intersection as an inner join of ``self`` with ``other``.

    The joined index is sorted by value when ``sort`` is None and
    ``other`` is non-empty; otherwise it is returned as produced by
    ``join``.
    """
    joined = self.join(other, how="inner")
    wants_sorted = sort is None and len(other)
    return joined.sort_values() if wants_sorted else joined

def sort_values(self, return_indexer=False, ascending=True, key=None):
"""
Return a sorted copy of the index, and optionally return the indices
Expand Down
176 changes: 176 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ class RangeIndex(BaseIndex):
RangeIndex(start=1, stop=10, step=1, name='a')
"""

_range: range

def __init__(
self, start, stop=None, step=1, dtype=None, copy=False, name=None
):
Expand All @@ -163,6 +165,7 @@ def __init__(
self._step = int(step) if step is not None else 1
self._index = None
self._name = name
self._range = range(self._start, self._stop, self._step)

def _copy_type_metadata(
self, other: Frame, include_index: bool = True
Expand Down Expand Up @@ -216,6 +219,30 @@ def _values(self):
else:
return column.column_empty(0, masked=False, dtype=self.dtype)

def is_numeric(self):
# Unconditionally True for this index type (its values come from a
# Python ``range``).
return True

def is_boolean(self):
# Unconditionally False for this index type.
return False

def is_integer(self):
# Unconditionally True for this index type (its values come from a
# Python ``range``).
return True

def is_floating(self):
# Unconditionally False for this index type.
return False

def is_object(self):
# Unconditionally False for this index type.
return False

def is_categorical(self):
# Unconditionally False for this index type.
return False

def is_interval(self):
# Unconditionally False for this index type.
return False

def is_mixed(self):
# Unconditionally False for this index type.
return False

@property
def _data(self):
return cudf.core.column_accessor.ColumnAccessor(
Expand Down Expand Up @@ -540,6 +567,126 @@ def get_loc(self, key, method=None, tolerance=None):
raise KeyError(key)
return np.clip(round_method(idx), 0, idx_int_upper_bound, dtype=int)

def _union(self, other, sort=None):
    """
    Union of this RangeIndex with ``other``.

    When ``other`` is also a RangeIndex and ``sort`` is None, the result
    is returned as a RangeIndex whenever the combined values still form
    a single arithmetic progression. In every other case the values are
    materialized into an ``Int64Index`` and its ``_union`` is used.
    """
    if isinstance(other, RangeIndex) and sort is None:
        # Describe each operand as (first, step, last), last inclusive.
        start_s, step_s = self.start, self.step
        end_s = self.start + self.step * (len(self) - 1)
        start_o, step_o = other.start, other.step
        end_o = other.start + other.step * (len(other) - 1)

        # Normalize descending ranges to their ascending equivalent.
        if self.step < 0:
            start_s, step_s, end_s = end_s, -step_s, start_s
        if other.step < 0:
            start_o, step_o, end_o = end_o, -step_o, start_o

        # A one-element range has no meaningful step of its own, so
        # borrow one that lets the two operands line up.
        if len(self) == 1 and len(other) == 1:
            step_s = step_o = abs(self.start - other.start)
        elif len(self) == 1:
            step_s = step_o
        elif len(other) == 1:
            step_o = step_s

        start_r = min(start_s, start_o)
        end_r = max(end_s, end_o)

        if step_o == step_s:
            # Same step, same phase, overlapping or adjacent.
            if (
                (start_s - start_o) % step_s == 0
                and (start_s - end_o) <= step_s
                and (start_o - end_s) <= step_s
            ):
                return type(self)(start_r, end_r + step_s, step_s)

            # Same even step, offset by exactly half a step: the two
            # ranges interleave into one range with half the step,
            # e.g. range(0, 10, 2) and range(1, 11, 2). The offset must
            # be *exactly* half a step; range(0, 20, 4) and
            # range(1, 21, 4) do not interleave.
            half_step = step_s // 2
            if (
                step_s % 2 == 0
                and abs(start_s - start_o) == half_step
                and abs(end_s - end_o) == half_step
            ):
                return type(self)(start_r, end_r + half_step, half_step)
        elif step_o % step_s == 0:
            # ``other`` is a coarser progression lying on self's grid.
            if (
                (start_o - start_s) % step_s == 0
                and (start_o + step_s >= start_s)
                and (end_o - step_s <= end_s)
            ):
                return type(self)(start_r, end_r + step_s, step_s)
        elif step_s % step_o == 0:
            # ``self`` is a coarser progression lying on other's grid.
            if (
                (start_s - start_o) % step_o == 0
                and (start_s + step_o >= start_o)
                and (end_s - step_o <= end_o)
            ):
                return type(self)(start_r, end_r + step_o, step_o)

    # No closed-form range exists: fall back to materialized values.
    return cudf.Int64Index(self._values)._union(other, sort=sort)

def _extended_gcd(self, a: int, b: int) -> Tuple[int, int, int]:
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
"""
Extended Euclidean algorithms to solve Bezout's identity:
a*x + b*y = gcd(x, y)
Finds one particular solution for x, y: s, t
Returns: gcd, s, t
"""
s, old_s = 0, 1
t, old_t = 1, 0
r, old_r = b, a
while r:
quotient = old_r // r
old_r, r = r, old_r - quotient * r
old_s, s = s, old_s - quotient * s
old_t, t = t, old_t - quotient * t
return old_r, old_s, old_t

def _min_fitting_element(self, lower_limit: int) -> int:
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
"""Returns the smallest element greater than or equal to the limit"""
no_steps = -(-(lower_limit - self.start) // abs(self.step))
return self.start + abs(self.step) * no_steps

def _intersection(self, other, sort=False):
    """
    Intersection of this RangeIndex with ``other``.

    When ``other`` is not a RangeIndex the parent implementation is
    used. Otherwise the common elements are computed arithmetically by
    solving a linear Diophantine equation, and the result is returned
    as a RangeIndex. The result is passed through ``sort_values`` when
    ``sort`` is None.
    """
    if not isinstance(other, RangeIndex):
        # Int64Index
        return super()._intersection(other, sort=sort)

    if not len(self) or not len(other):
        return RangeIndex(0)

    # Work on ascending views of both ranges so that one code path
    # covers increasing and decreasing inputs.
    lhs = self._range
    if self.step < 0:
        lhs = lhs[::-1]
    rhs = other._range
    if other.step < 0:
        rhs = rhs[::-1]

    # Check whether the covered intervals overlap at all.
    lower = max(lhs.start, rhs.start)
    upper = min(lhs.stop, rhs.stop)
    if upper <= lower:
        return RangeIndex(0)

    # Method hint: linear Diophantine equation.
    # Performance hint: for identical step sizes a cheaper alternative
    # could be used.
    gcd, coeff, _ = self._extended_gcd(lhs.step, rhs.step)

    # The element sets only meet if the start offset is a multiple of
    # the gcd of the steps.
    if (lhs.start - rhs.start) % gcd:
        return RangeIndex(0)

    # Parameters of the intersection, ignoring the lower bound for now.
    offset = rhs.start - lhs.start
    unbounded_start = lhs.start + offset * lhs.step // gcd * coeff
    combined_step = lhs.step * rhs.step // gcd
    candidate = RangeIndex(range(unbounded_start, upper, combined_step))

    # Move the start up to the first element inside the overlap.
    aligned_start = candidate._min_fitting_element(lower)
    new_index = RangeIndex(
        range(aligned_start, candidate.stop, candidate.step)
    )

    both_descending = self.step < 0 and other.step < 0
    if both_descending is not (new_index.step < 0):
        new_index = new_index[::-1]
    if sort is None:
        new_index = new_index.sort_values()

    return new_index


# Patch in all binops and unary ops, which bypass __getattr__ on the instance
# and prevent the above overload from working.
Expand Down Expand Up @@ -972,6 +1119,35 @@ def dtype(self):
"""
return self._values.dtype

def is_numeric(self):
    """Return True when the dtype is numeric according to
    ``cudf.api.types.is_numeric_dtype`` and is not ``bool``."""
    dtype = self.dtype
    return bool(
        cudf.api.types.is_numeric_dtype(dtype) and dtype != np.dtype("bool")
    )

def is_boolean(self):
# True only when the index dtype compares equal to "bool".
return self.dtype == "bool"

def is_integer(self):
# Delegates the decision to ``cudf.api.types.is_integer_dtype``.
return cudf.api.types.is_integer_dtype(self.dtype)

def is_floating(self):
# Delegates the decision to ``cudf.api.types.is_float_dtype``.
return cudf.api.types.is_float_dtype(self.dtype)

def is_object(self):
# True only when the index dtype equals NumPy's "object" dtype.
# NOTE(review): presumably this is how string indexes report
# themselves - confirm.
return self.dtype == np.dtype("object")

def is_categorical(self):
# True only when the index dtype is a ``cudf.CategoricalDtype`` instance.
return isinstance(self.dtype, cudf.CategoricalDtype)

def is_interval(self):
# NOTE(review): relies on the interval dtype comparing equal to the
# string "interval" - confirm this holds for cudf's interval dtype.
return self.dtype == "interval"

def is_mixed(self):
# Unconditionally False for this index type.
return False

def find_label_range(self, first, last):
"""Find range that starts with *first* and ends with *last*,
inclusively.
Expand Down
35 changes: 35 additions & 0 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1040,6 +1040,30 @@ def get_level_values(self, level):
level_values = as_index(self._data[level], name=self.names[level_idx])
return level_values

def is_numeric(self):
# Unconditionally False for a MultiIndex.
return False

def is_boolean(self):
# Unconditionally False for a MultiIndex.
return False

def is_integer(self):
# Unconditionally False for a MultiIndex.
return False

def is_floating(self):
# Unconditionally False for a MultiIndex.
return False

def is_object(self):
# Unconditionally False for a MultiIndex.
return False

def is_categorical(self):
# Unconditionally False for a MultiIndex.
return False

def is_interval(self):
# Unconditionally False for a MultiIndex.
return False

def is_mixed(self):
# Unconditionally True for a MultiIndex; it is the only ``is_*``
# predicate this class answers with True.
return True

@classmethod
def _concat(cls, objs):

Expand Down Expand Up @@ -1787,3 +1811,14 @@ def _union(self, other, sort=None):
if sort is None and len(other):
return midx.sort_values()
return midx

def _intersection(self, other, sort=None):
    """
    Intersection of two MultiIndexes through an inner merge of their
    frame representations.

    The merged frame is turned back into a MultiIndex that takes this
    index's level names. It is sorted by value when ``sort`` is None and
    ``other`` is non-empty.
    """
    right = other.to_frame()
    left = self.to_frame()

    common = cudf.merge(left, right, how="inner")
    midx = MultiIndex.from_frame(common)
    midx.names = self.names

    wants_sorted = sort is None and len(other)
    return midx.sort_values() if wants_sorted else midx
Loading