[DataFrame] Implementation for head, idxmax, idxmin, pop, tail, and Ray Index (ray-project#1520)

* Adding head implementation
* Adding idxmax, idxmin, pop, tail
* Adding index skeleton
* Addressing reviewer comments
* Fixing tests to reflect Series constructor changes
rongou · Feb 7, 2018 · fa37564 · fa37564
1 parent ff8e7f8
commit fa37564
Show file tree

Hide file tree

Showing 5 changed files with 151 additions and 31 deletions.
diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py
@@ -614,18 +614,67 @@ def gt(self, other, axis='columns', level=None):
         raise NotImplementedError("Not Yet implemented.")
 
     def head(self, n=5):
-        raise NotImplementedError("Not Yet implemented.")
+        """Get the first n rows of the dataframe.
+
+        Args:
+            n (int): The number of rows to return.
+
+        Returns:
+            A new dataframe with the first n rows of the dataframe.
+        """
+        # Row counts per partition; df.size would count cells, not rows.
+        sizes = ray.get(self._map_partitions(lambda df: len(df))._df)
+        new_dfs = []
+        i = 0
+        while n > 0 and i < len(self._df):
+            if (n - sizes[i]) < 0:
+                new_dfs.append(_deploy_func.remote(lambda df: df.head(n),
+                                                   self._df[i]))
+                break
+            new_dfs.append(self._df[i])
+            n -= sizes[i]
+            i += 1
+
+        return DataFrame(new_dfs, self.columns)
 
     def hist(self, data, column=None, by=None, grid=True, xlabelsize=None,
              xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
              sharey=False, figsize=None, layout=None, bins=10, **kwds):
         raise NotImplementedError("Not Yet implemented.")
 
     def idxmax(self, axis=0, skipna=True):
-        raise NotImplementedError("Not Yet implemented.")
+        """Find the label at which each column or row attains its maximum.
+
+        Args:
+            axis (int): 1 to search across each row; any other value
+                searches down each column.
+            skipna (bool): Whether or not to skip NA values.
+
+        Returns:
+            A Series with the index of the maximum for each row or column.
+        """
+        if axis != 1:
+            return self.T.idxmax(axis=1, skipna=skipna)
+        row_wise = self._map_partitions(
+            lambda df: df.idxmax(axis=axis, skipna=skipna))
+        return to_pandas(row_wise)
 
     def idxmin(self, axis=0, skipna=True):
-        raise NotImplementedError("Not Yet implemented.")
+        """Find the label at which each column or row attains its minimum.
+
+        Args:
+            axis (int): 1 to search across each row; any other value
+                searches down each column.
+            skipna (bool): Whether or not to skip NA values.
+
+        Returns:
+            A Series with the index of the minimum for each row or column.
+        """
+        if axis != 1:
+            return self.T.idxmin(axis=1, skipna=skipna)
+        row_wise = self._map_partitions(
+            lambda df: df.idxmin(axis=axis, skipna=skipna))
+        return to_pandas(row_wise)
 
     def infer_objects(self):
         raise NotImplementedError("Not Yet implemented.")
@@ -771,7 +820,20 @@ def plot(self, x=None, y=None, kind='line', ax=None, subplots=False,
         raise NotImplementedError("Not Yet implemented.")
 
     def pop(self, item):
-        raise NotImplementedError("Not Yet implemented.")
+        """Remove a column from this DataFrame and hand it back.
+
+        Args:
+            item (str): Label of the column to remove.
+
+        Returns:
+            A Series holding the removed column's values. This DataFrame
+            is modified in place: its partitions no longer contain `item`.
+        """
+        extracted = self._map_partitions(lambda df: df.pop(item))
+        popped = to_pandas(extracted)
+        remaining = self._map_partitions(lambda df: df.drop([item], axis=1))
+        self._df = remaining._df
+        return popped
 
     def pow(self, other, axis='columns', level=None, fill_value=None):
         raise NotImplementedError("Not Yet implemented.")
@@ -934,7 +996,29 @@ def swaplevel(self, i=-2, j=-1, axis=0):
         raise NotImplementedError("Not Yet implemented.")
 
     def tail(self, n=5):
-        raise NotImplementedError("Not Yet implemented.")
+        """Get the last n rows of the dataframe.
+
+        Args:
+            n (int): The number of rows to return.
+
+        Returns:
+            A new dataframe with the last n rows of this dataframe.
+        """
+        # Row counts per partition; df.size would count cells, not rows.
+        sizes = ray.get(self._map_partitions(lambda df: len(df))._df)
+        new_dfs = []
+        i = len(self._df) - 1
+        while n > 0 and i >= 0:
+            if (n - sizes[i]) < 0:
+                new_dfs.append(_deploy_func.remote(lambda df: df.tail(n),
+                                                   self._df[i]))
+                break
+            new_dfs.append(self._df[i])
+            n -= sizes[i]
+            i -= 1
+        # we were adding in reverse order, so make it right.
+        new_dfs.reverse()
+        return DataFrame(new_dfs, self.columns)
 
     def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs):
         raise NotImplementedError("Not Yet implemented.")

diff --git a/python/ray/dataframe/index.py b/python/ray/dataframe/index.py
@@ -0,0 +1,21 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pandas as pd
+
+
+class Index(object):
+
+    def __init__(self, idx):
+        self.idx = idx
+
+    @classmethod
+    def to_pandas(cls, indices):
+        """Merge a non-empty list of pandas indices into one pandas index."""
+        if isinstance(indices[0], pd.RangeIndex):
+            merged = indices[0]
+            for index in indices[1:]:
+                merged = merged.union(index)
+            return merged
+        return indices[0].append(indices[1:])
diff --git a/python/ray/dataframe/series.py b/python/ray/dataframe/series.py
@@ -13,6 +13,14 @@ def na_op():
 
 class Series(object):
 
+    def __init__(self, series_oids):
+        """Store the remote Series handles as given, without validation.
+
+        Args:
+            series_oids ([ObjectID]): The list of remote Series objects.
+        """
+        self.series_oids = series_oids
+
     @property
     def T(self):
         raise NotImplementedError("Not Yet implemented.")

diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py
@@ -165,6 +165,11 @@ def test_int_dataframe():
     test___deepcopy__(ray_df, pandas_df)
     test_bool(ray_df, pandas_df)
     test_count(ray_df, pandas_df)
+    test_head(ray_df, pandas_df)
+    test_tail(ray_df, pandas_df)
+    test_idxmax(ray_df, pandas_df)
+    test_idxmin(ray_df, pandas_df)
+    test_pop(ray_df, pandas_df)
 
 
 def test_float_dataframe():
@@ -212,6 +217,11 @@ def test_float_dataframe():
     test___deepcopy__(ray_df, pandas_df)
     test_bool(ray_df, pandas_df)
     test_count(ray_df, pandas_df)
+    test_head(ray_df, pandas_df)
+    test_tail(ray_df, pandas_df)
+    test_idxmax(ray_df, pandas_df)
+    test_idxmin(ray_df, pandas_df)
+    test_pop(ray_df, pandas_df)
 
 
 def test_add():
@@ -663,11 +673,9 @@ def test_gt():
         ray_df.gt(None)
 
 
-def test_head():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.head()
+@pytest.fixture
+def test_head(ray_df, pandas_df):
+    ray_df_equals_pandas(ray_df.head(), pandas_df.head())
 
 
 def test_hist():
@@ -677,18 +685,16 @@ def test_hist():
         ray_df.hist(None)
 
 
-def test_idxmax():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.idxmax()
-
+@pytest.fixture
+def test_idxmax(ray_df, pandas_df):
+    assert \
+        ray_df.idxmax().sort_index().equals(pandas_df.idxmax().sort_index())
 
-def test_idxmin():
-    ray_df = create_test_dataframe()
 
-    with pytest.raises(NotImplementedError):
-        ray_df.idxmin()
+@pytest.fixture
+def test_idxmin(ray_df, pandas_df):
+    assert \
+        ray_df.idxmin().sort_index().equals(pandas_df.idxmin().sort_index())
 
 
 def test_infer_objects():
@@ -971,11 +977,14 @@ def test_plot():
         ray_df.plot()
 
 
-def test_pop():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.pop(None)
+@pytest.fixture
+def test_pop(ray_df, pandas_df):
+    pandas_copy = pandas_df.copy()
+    ray_copy = ray_df._map_partitions(lambda df: df)
+    expected = pandas_copy.pop('col2')
+    actual = ray_copy.pop('col2')
+    assert actual.sort_index().equals(expected.sort_index())
+    ray_df_equals_pandas(ray_copy, pandas_copy)
 
 
 def test_pow():
@@ -1292,11 +1301,9 @@ def test_swaplevel():
         ray_df.swaplevel()
 
 
-def test_tail():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.tail()
+@pytest.fixture
+def test_tail(ray_df, pandas_df):
+    ray_df_equals_pandas(ray_df.tail(), pandas_df.tail())
 
 
 def test_take():

diff --git a/python/ray/dataframe/test/test_series.py b/python/ray/dataframe/test/test_series.py
@@ -11,7 +11,7 @@
 
 @pytest.fixture
 def create_test_series():
-    return rdf.Series()
+    return rdf.Series(None)
 
 
 def test_T():