[SPARK-3519] add distinct(n) to SchemaRDD in PySpark

Matthew Farrellee · Matthew Farrellee · commit 6bc4a2c8a184 · 2014-09-15T09:57:48.000-04:00
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py
@@ -1694,8 +1694,11 @@ def coalesce(self, numPartitions, shuffle=False):
         rdd = self._jschema_rdd.coalesce(numPartitions, shuffle)
         return SchemaRDD(rdd, self.sql_ctx)
 
-    def distinct(self):
-        rdd = self._jschema_rdd.distinct()
+    def distinct(self, numPartitions=None):  # None keeps the previous zero-argument call
+        if numPartitions is None:
+            rdd = self._jschema_rdd.distinct()
+        else:
+            rdd = self._jschema_rdd.distinct(numPartitions)  # forward the caller's count
         return SchemaRDD(rdd, self.sql_ctx)
 
     def intersection(self, other):
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
@@ -646,6 +646,19 @@ def test_basic_functions(self):
         srdd.count()
         srdd.collect()
 
+    def test_distinct(self):
+        rdd = self.sc.parallelize(['{"a": 1}', '{"b": 2}', '{"c": 3}']*10)
+        srdd = self.sqlCtx.jsonRDD(rdd).distinct()  # 30 input rows, 3 unique documents
+        self.assertEqual(srdd.count(), 3)
+
+    def test_distinct_numPartitions(self):
+        rdd = self.sc.parallelize(['{"a": 1}', '{"b": 2}', '{"c": 3}']*10, 10)
+        srdd = self.sqlCtx.jsonRDD(rdd)
+        self.assertEqual(srdd.getNumPartitions(), 10)  # check the input layout first
+        result = srdd.distinct(5)  # explicit partition count for the result
+        self.assertEqual(result.getNumPartitions(), 5)
+        self.assertEqual(result.count(), 3)  # still only the 3 unique documents
+
 
 class TestIO(PySparkTestCase):