@@ -612,20 +612,19 @@ private[spark] object PythonRDD extends Logging {
       compressionCodecClass: String) = {
     saveAsHadoopFile(
       pyRDD, batchSerialized, path, "org.apache.hadoop.mapred.SequenceFileOutputFormat",
-      null, null, null, null, new java.util.HashMap(), compressionCodecClass, false)
+      null, null, null, null, new java.util.HashMap(), compressionCodecClass)
   }
 
   /**
-   * Output a Python RDD of key-value pairs to any Hadoop file system, using either old
-   * (mapred package) or new (mapreduce package) Hadoop `OutputFormat`. Keys and values are
-   * converted to suitable output types using either user specified converters or, if not
-   * specified, [[org.apache.spark.api.python.JavaToWritableConverter]]. Post-conversion types
+   * Output a Python RDD of key-value pairs to any Hadoop file system, using the old Hadoop
+   * `OutputFormat` in the mapred package. Keys and values are converted to suitable output
+   * types using either user-specified converters or, if not specified,
+   * [[org.apache.spark.api.python.JavaToWritableConverter]]. Post-conversion types
    * `keyClass` and `valueClass` are automatically inferred if not specified. The passed-in
    * `confAsMap` is merged with the default Hadoop conf associated with the SparkContext of
-   * this RDD. Compression codec `codecClass` is only effective with the old format.
+   * this RDD.
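+   *
+   * For example, a corresponding call from PySpark might look like the following
+   * (an illustrative sketch; `rdd` is assumed to be an RDD of key-value pairs and
+   * the output path is assumed not to exist yet):
+   * {{{
+   *   rdd.saveAsHadoopFile(
+   *     "hdfs:///tmp/out",
+   *     "org.apache.hadoop.mapred.TextOutputFormat",
+   *     keyClass="org.apache.hadoop.io.IntWritable",
+   *     valueClass="org.apache.hadoop.io.Text",
+   *     compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
+   * }}}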
    */
-  def saveAsHadoopFile[K, V, F <: OutputFormat[_, _], G <: NewOutputFormat[_, _],
-      C <: CompressionCodec](
+  def saveAsHadoopFile[K, V, F <: OutputFormat[_, _], C <: CompressionCodec](
       pyRDD: JavaRDD[Array[Byte]],
       batchSerialized: Boolean,
       path: String,
@@ -635,22 +634,45 @@ private[spark] object PythonRDD extends Logging {
       keyConverterClass: String,
       valueConverterClass: String,
       confAsMap: java.util.HashMap[String, String],
-      compressionCodecClass: String,
-      useNewAPI: Boolean) = {
+      compressionCodecClass: String) = {
     val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized)
     val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse(
       inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass))
     val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration)
     val codec = Option(compressionCodecClass).map(Class.forName(_).asInstanceOf[Class[C]])
     val converted = convertRDD(rdd, keyConverterClass, valueConverterClass,
       new JavaToWritableConverter)
-    if (useNewAPI) {
-      val fc = Class.forName(outputFormatClass).asInstanceOf[Class[G]]
-      converted.saveAsNewAPIHadoopFile(path, kc, vc, fc, mergedConf)
-    } else {
-      val fc = Class.forName(outputFormatClass).asInstanceOf[Class[F]]
-      converted.saveAsHadoopFile(path, kc, vc, fc, new JobConf(mergedConf), codec = codec)
-    }
+    val fc = Class.forName(outputFormatClass).asInstanceOf[Class[F]]
+    converted.saveAsHadoopFile(path, kc, vc, fc, new JobConf(mergedConf), codec = codec)
+  }
+
+  /**
+   * Output a Python RDD of key-value pairs to any Hadoop file system, using the new Hadoop
+   * `OutputFormat` in the mapreduce package. Keys and values are converted to suitable output
+   * types using either user-specified converters or, if not specified,
+   * [[org.apache.spark.api.python.JavaToWritableConverter]]. Post-conversion types
+   * `keyClass` and `valueClass` are automatically inferred if not specified. The passed-in
+   * `confAsMap` is merged with the default Hadoop conf associated with the SparkContext of
+   * this RDD.
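+   *
+   * For example, a corresponding call from PySpark might look like the following
+   * (an illustrative sketch; `rdd` is assumed to be an RDD of key-value pairs; note
+   * that the new-API path takes no compression codec parameter):
+   * {{{
+   *   rdd.saveAsNewAPIHadoopFile(
+   *     "hdfs:///tmp/out",
+   *     "org.apache.hadoop.mapreduce.lib.output.TextOutputFormat",
+   *     keyClass="org.apache.hadoop.io.IntWritable",
+   *     valueClass="org.apache.hadoop.io.Text")
+   * }}}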
+   */
+  def saveAsNewAPIHadoopFile[K, V, F <: NewOutputFormat[_, _]](
+      pyRDD: JavaRDD[Array[Byte]],
+      batchSerialized: Boolean,
+      path: String,
+      outputFormatClass: String,
+      keyClass: String,
+      valueClass: String,
+      keyConverterClass: String,
+      valueConverterClass: String,
+      confAsMap: java.util.HashMap[String, String]) = {
+    val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized)
+    val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse(
+      inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass))
+    val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration)
+    val converted = convertRDD(rdd, keyConverterClass, valueConverterClass,
+      new JavaToWritableConverter)
+    val fc = Class.forName(outputFormatClass).asInstanceOf[Class[F]]
+    converted.saveAsNewAPIHadoopFile(path, kc, vc, fc, mergedConf)
   }
 
   /**
@@ -665,9 +687,9 @@ private[spark] object PythonRDD extends Logging {
       pyRDD: JavaRDD[Array[Byte]],
       batchSerialized: Boolean,
       confAsMap: java.util.HashMap[String, String],
-      useNewAPI: Boolean,
       keyConverterClass: String,
-      valueConverterClass: String) = {
+      valueConverterClass: String,
+      useNewAPI: Boolean) = {
     val conf = PythonHadoopUtil.mapToConf(confAsMap)
     val converted = convertRDD(SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized),
       keyConverterClass, valueConverterClass, new JavaToWritableConverter)