
Commit 60e18ce

SPARK-1414. Python API for SparkContext.wholeTextFiles
Also clarified comment on each file having to fit in memory

Author: Matei Zaharia <matei@databricks.com>

Closes #327 from mateiz/py-whole-files and squashes the following commits:

9ad64a5 [Matei Zaharia] SPARK-1414. Python API for SparkContext.wholeTextFiles
1 parent d956cc2 commit 60e18ce
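
For orientation, here is a minimal usage sketch of the Python API this commit introduces. The hdfs://a-hdfs-path directory is a hypothetical placeholder, mirroring the example in the docstring added below:

    from pyspark import SparkContext

    sc = SparkContext("local", "wholeTextFilesExample")

    # Each record is a (path, content) pair; every file is read fully into
    # memory, so this suits many small files rather than a few huge ones.
    pairs = sc.wholeTextFiles("hdfs://a-hdfs-path")
    paths = pairs.map(lambda kv: kv[0]).collect()
    print(sorted(paths))

    sc.stop()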

File tree: 5 files changed, +49 −7 lines


core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 1 addition & 1 deletion
@@ -395,7 +395,7 @@ class SparkContext(
    * (a-hdfs-path/part-nnnnn, its content)
    * }}}
    *
-   * @note Small files are perferred, large file is also allowable, but may cause bad performance.
+   * @note Small files are preferred, as each file will be loaded fully in memory.
    */
   def wholeTextFiles(path: String): RDD[(String, String)] = {
     newAPIHadoopFile(

core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala

Lines changed: 1 addition & 1 deletion
@@ -177,7 +177,7 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
    * (a-hdfs-path/part-nnnnn, its content)
    * }}}
    *
-   * @note Small files are perferred, large file is also allowable, but may cause bad performance.
+   * @note Small files are preferred, as each file will be loaded fully in memory.
    */
   def wholeTextFiles(path: String): JavaPairRDD[String, String] =
     new JavaPairRDD(sc.wholeTextFiles(path))

core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala

Lines changed: 4 additions & 2 deletions
@@ -19,6 +19,7 @@ package org.apache.spark.api.python
 
 import java.io._
 import java.net._
+import java.nio.charset.Charset
 import java.util.{List => JList, ArrayList => JArrayList, Map => JMap, Collections}
 
 import scala.collection.JavaConversions._
@@ -206,6 +207,7 @@ private object SpecialLengths {
 }
 
 private[spark] object PythonRDD {
+  val UTF8 = Charset.forName("UTF-8")
 
   def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int):
   JavaRDD[Array[Byte]] = {
@@ -266,7 +268,7 @@ private[spark] object PythonRDD {
   }
 
   def writeUTF(str: String, dataOut: DataOutputStream) {
-    val bytes = str.getBytes("UTF-8")
+    val bytes = str.getBytes(UTF8)
     dataOut.writeInt(bytes.length)
     dataOut.write(bytes)
   }
@@ -286,7 +288,7 @@ private[spark] object PythonRDD {
 
 private
 class BytesToString extends org.apache.spark.api.java.function.Function[Array[Byte], String] {
-  override def call(arr: Array[Byte]) : String = new String(arr, "UTF-8")
+  override def call(arr: Array[Byte]) : String = new String(arr, PythonRDD.UTF8)
 }
 
 /**

python/pyspark/context.py

Lines changed: 42 additions & 2 deletions
@@ -28,7 +28,8 @@
 from pyspark.conf import SparkConf
 from pyspark.files import SparkFiles
 from pyspark.java_gateway import launch_gateway
-from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deserializer
+from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deserializer, \
+    PairDeserializer
 from pyspark.storagelevel import StorageLevel
 from pyspark import rdd
 from pyspark.rdd import RDD
@@ -257,6 +258,45 @@ def textFile(self, name, minSplits=None):
         return RDD(self._jsc.textFile(name, minSplits), self,
                    UTF8Deserializer())
 
+    def wholeTextFiles(self, path):
+        """
+        Read a directory of text files from HDFS, a local file system
+        (available on all nodes), or any Hadoop-supported file system
+        URI. Each file is read as a single record and returned in a
+        key-value pair, where the key is the path of each file, the
+        value is the content of each file.
+
+        For example, if you have the following files::
+
+          hdfs://a-hdfs-path/part-00000
+          hdfs://a-hdfs-path/part-00001
+          ...
+          hdfs://a-hdfs-path/part-nnnnn
+
+        Do C{rdd = sparkContext.wholeTextFiles("hdfs://a-hdfs-path")},
+        then C{rdd} contains::
+
+          (a-hdfs-path/part-00000, its content)
+          (a-hdfs-path/part-00001, its content)
+          ...
+          (a-hdfs-path/part-nnnnn, its content)
+
+        NOTE: Small files are preferred, as each file will be loaded
+        fully in memory.
+
+        >>> dirPath = os.path.join(tempdir, "files")
+        >>> os.mkdir(dirPath)
+        >>> with open(os.path.join(dirPath, "1.txt"), "w") as file1:
+        ...    file1.write("1")
+        >>> with open(os.path.join(dirPath, "2.txt"), "w") as file2:
+        ...    file2.write("2")
+        >>> textFiles = sc.wholeTextFiles(dirPath)
+        >>> sorted(textFiles.collect())
+        [(u'.../1.txt', u'1'), (u'.../2.txt', u'2')]
+        """
+        return RDD(self._jsc.wholeTextFiles(path), self,
+                   PairDeserializer(UTF8Deserializer(), UTF8Deserializer()))
+
     def _checkpointFile(self, name, input_deserializer):
         jrdd = self._jsc.checkpointFile(name)
         return RDD(jrdd, self, input_deserializer)
@@ -425,7 +465,7 @@ def _test():
     globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
     globs['tempdir'] = tempfile.mkdtemp()
     atexit.register(lambda: shutil.rmtree(globs['tempdir']))
-    (failure_count, test_count) = doctest.testmod(globs=globs)
+    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
     globs['sc'].stop()
     if failure_count:
         exit(-1)

python/pyspark/serializers.py

Lines changed: 1 addition & 1 deletion
@@ -290,7 +290,7 @@ class MarshalSerializer(FramedSerializer):
 
 class UTF8Deserializer(Serializer):
     """
-    Deserializes streams written by getBytes.
+    Deserializes streams written by String.getBytes.
     """
 
     def loads(self, stream):
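
For context on the framing this docstring refers to: writeUTF in PythonRDD.scala (above) writes a 4-byte big-endian length followed by the string's UTF-8 bytes. A minimal reader sketch of that framing, as an illustration rather than PySpark's actual UTF8Deserializer code, could look like this:

    import struct

    def read_utf8_string(stream):
        # Read the 4-byte big-endian length that DataOutputStream.writeInt produces,
        # then read that many bytes and decode them as UTF-8.
        length = struct.unpack("!i", stream.read(4))[0]
        return stream.read(length).decode("utf-8")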
