
Commit d998ad6

[SPARK-2024] refactoring to get method params below 10
1 parent 57a7a5e commit d998ad6

2 files changed: +46, -25 lines


core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala

Lines changed: 41 additions & 19 deletions
@@ -612,20 +612,19 @@ private[spark] object PythonRDD extends Logging {
       compressionCodecClass: String) = {
     saveAsHadoopFile(
       pyRDD, batchSerialized, path, "org.apache.hadoop.mapred.SequenceFileOutputFormat",
-      null, null, null, null, new java.util.HashMap(), compressionCodecClass, false)
+      null, null, null, null, new java.util.HashMap(), compressionCodecClass)
   }
 
   /**
-   * Output a Python RDD of key-value pairs to any Hadoop file system, using either old
-   * (mapred package) or new (mapreduce package) Hadoop `OutputFormat`. Keys and values are
-   * converted to suitable output types using either user specified converters or, if not
-   * specified, [[org.apache.spark.api.python.JavaToWritableConverter]]. Post-conversion types
+   * Output a Python RDD of key-value pairs to any Hadoop file system, using old Hadoop
+   * `OutputFormat` in mapred package. Keys and values are converted to suitable output
+   * types using either user specified converters or, if not specified,
+   * [[org.apache.spark.api.python.JavaToWritableConverter]]. Post-conversion types
    * `keyClass` and `valueClass` are automatically inferred if not specified. The passed-in
    * `confAsMap` is merged with the default Hadoop conf associated with the SparkContext of
-   * this RDD. Compression codec `codecClass` is only effective with the old format.
+   * this RDD.
    */
-  def saveAsHadoopFile[K, V, F <: OutputFormat[_, _], G <: NewOutputFormat[_, _],
-      C <: CompressionCodec](
+  def saveAsHadoopFile[K, V, F <: OutputFormat[_, _], C <: CompressionCodec](
       pyRDD: JavaRDD[Array[Byte]],
       batchSerialized: Boolean,
       path: String,

@@ -635,22 +634,45 @@ private[spark] object PythonRDD extends Logging {
       keyConverterClass: String,
       valueConverterClass: String,
       confAsMap: java.util.HashMap[String, String],
-      compressionCodecClass: String,
-      useNewAPI: Boolean) = {
+      compressionCodecClass: String) = {
     val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized)
     val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse(
       inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass))
     val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration)
     val codec = Option(compressionCodecClass).map(Class.forName(_).asInstanceOf[Class[C]])
     val converted = convertRDD(rdd, keyConverterClass, valueConverterClass,
       new JavaToWritableConverter)
-    if (useNewAPI) {
-      val fc = Class.forName(outputFormatClass).asInstanceOf[Class[G]]
-      converted.saveAsNewAPIHadoopFile(path, kc, vc, fc, mergedConf)
-    } else {
-      val fc = Class.forName(outputFormatClass).asInstanceOf[Class[F]]
-      converted.saveAsHadoopFile(path, kc, vc, fc, new JobConf(mergedConf), codec=codec)
-    }
+    val fc = Class.forName(outputFormatClass).asInstanceOf[Class[F]]
+    converted.saveAsHadoopFile(path, kc, vc, fc, new JobConf(mergedConf), codec=codec)
+  }
+
+  /**
+   * Output a Python RDD of key-value pairs to any Hadoop file system, using new Hadoop
+   * `OutputFormat` in mapreduce package. Keys and values are converted to suitable output
+   * types using either user specified converters or, if not specified,
+   * [[org.apache.spark.api.python.JavaToWritableConverter]]. Post-conversion types
+   * `keyClass` and `valueClass` are automatically inferred if not specified. The passed-in
+   * `confAsMap` is merged with the default Hadoop conf associated with the SparkContext of
+   * this RDD.
+   */
+  def saveAsNewAPIHadoopFile[K, V, F <: NewOutputFormat[_, _]](
+      pyRDD: JavaRDD[Array[Byte]],
+      batchSerialized: Boolean,
+      path: String,
+      outputFormatClass: String,
+      keyClass: String,
+      valueClass: String,
+      keyConverterClass: String,
+      valueConverterClass: String,
+      confAsMap: java.util.HashMap[String, String]) = {
+    val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized)
+    val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse(
+      inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass))
+    val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration)
+    val converted = convertRDD(rdd, keyConverterClass, valueConverterClass,
+      new JavaToWritableConverter)
+    val fc = Class.forName(outputFormatClass).asInstanceOf[Class[F]]
+    converted.saveAsNewAPIHadoopFile(path, kc, vc, fc, mergedConf)
   }
 
   /**

@@ -665,9 +687,9 @@ private[spark] object PythonRDD extends Logging {
       pyRDD: JavaRDD[Array[Byte]],
       batchSerialized: Boolean,
       confAsMap: java.util.HashMap[String, String],
-      useNewAPI: Boolean,
       keyConverterClass: String,
-      valueConverterClass: String) = {
+      valueConverterClass: String,
+      useNewAPI: Boolean) = {
     val conf = PythonHadoopUtil.mapToConf(confAsMap)
     val converted = convertRDD(SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized),
       keyConverterClass, valueConverterClass, new JavaToWritableConverter)
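
The net effect of the Scala change: the single saveAsHadoopFile that branched on useNewAPI is split into saveAsHadoopFile (old mapred API, keeps the compression codec) and saveAsNewAPIHadoopFile (new mapreduce API, no codec parameter, matching the removed doc note that the codec was only effective with the old format). A minimal PySpark sketch of the old-API path; the output path and codec class below are illustrative, not part of the commit:

    from pyspark import SparkContext

    sc = SparkContext(appName="save-old-api-sketch")

    # A small pair RDD; keyClass and valueClass are inferred on the JVM side
    # when not passed explicitly.
    pairs = sc.parallelize([(1, "a"), (2, "b")])

    # Dispatches to the refactored PythonRDD.saveAsHadoopFile (old mapred API).
    # compressionCodecClass survives only on this path, since the codec was
    # never effective with the new-API format.
    pairs.saveAsHadoopFile(
        "/tmp/out-old",  # illustrative path
        "org.apache.hadoop.mapred.SequenceFileOutputFormat",
        compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")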

python/pyspark/rdd.py

Lines changed: 5 additions & 6 deletions
@@ -1052,7 +1052,7 @@ def saveAsNewAPIHadoopDataset(self, conf, keyConverter=None, valueConverter=None):
         pickled = self._toPickleSerialization()
         batched = isinstance(pickled._jrdd_deserializer, BatchedSerializer)
         self.ctx._jvm.PythonRDD.saveAsHadoopDataset(pickled._jrdd, batched, jconf,
-                                                    True, keyConverter, valueConverter)
+                                                    keyConverter, valueConverter, True)
 
     def saveAsNewAPIHadoopFile(self, path, outputFormatClass, keyClass=None, valueClass=None,
                                keyConverter=None, valueConverter=None, conf=None):

@@ -1078,9 +1078,8 @@ def saveAsNewAPIHadoopFile(self, path, outputFormatClass, keyClass=None, valueClass=None,
         jconf = self.ctx._dictToJavaMap(conf)
         pickled = self._toPickleSerialization()
         batched = isinstance(pickled._jrdd_deserializer, BatchedSerializer)
-        self.ctx._jvm.PythonRDD.saveAsHadoopFile(pickled._jrdd, batched, path,
-            outputFormatClass, keyClass, valueClass, keyConverter, valueConverter,
-            jconf, None, True)
+        self.ctx._jvm.PythonRDD.saveAsNewAPIHadoopFile(pickled._jrdd, batched, path,
+            outputFormatClass, keyClass, valueClass, keyConverter, valueConverter, jconf)
 
     def saveAsHadoopDataset(self, conf, keyConverter=None, valueConverter=None):
         """

@@ -1097,7 +1096,7 @@ def saveAsHadoopDataset(self, conf, keyConverter=None, valueConverter=None):
         pickled = self._toPickleSerialization()
         batched = isinstance(pickled._jrdd_deserializer, BatchedSerializer)
         self.ctx._jvm.PythonRDD.saveAsHadoopDataset(pickled._jrdd, batched, jconf,
-                                                    False, keyConverter, valueConverter)
+                                                    keyConverter, valueConverter, False)
 
     def saveAsHadoopFile(self, path, outputFormatClass, keyClass=None, valueClass=None,
                          keyConverter=None, valueConverter=None, conf=None, compressionCodecClass=None):

@@ -1126,7 +1125,7 @@ def saveAsHadoopFile(self, path, outputFormatClass, keyClass=None, valueClass=None,
         batched = isinstance(pickled._jrdd_deserializer, BatchedSerializer)
         self.ctx._jvm.PythonRDD.saveAsHadoopFile(pickled._jrdd, batched,
             path, outputFormatClass, keyClass, valueClass, keyConverter, valueConverter,
-            jconf, compressionCodecClass, False)
+            jconf, compressionCodecClass)
 
     def saveAsSequenceFile(self, path, compressionCodecClass=None):
         """
