
Commit bfdbdd3

ferdonline authored and srowen committed
[SPARK-23029][DOCS] Specifying default units of configuration entries
## What changes were proposed in this pull request?

This PR completes the docs, specifying the default units assumed in configuration entries of type size. This is crucial since unit-less values are accepted and the user might assume the base unit is bytes, which in most cases it is not, leading to hard-to-debug problems.

## How was this patch tested?

This patch updates documentation only.

Author: Fernando Pereira <fernando.pereira@epfl.ch>

Closes #20269 from ferdonline/docs_units.

(cherry picked from commit 9678941)
Signed-off-by: Sean Owen <sowen@cloudera.com>
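For illustration (a sketch, not part of this patch): the memory entries documented here are MiB-typed, so a unit-less value is read as MiB rather than bytes, and an explicit suffix removes the ambiguity. The snippet uses only SparkConf's public size getters.

```scala
import org.apache.spark.SparkConf

// spark.driver.memory and spark.executor.memory are MiB-typed entries.
val conf = new SparkConf()
  .set("spark.driver.memory", "2048")  // no suffix: read as 2048 MiB, not 2048 bytes
  .set("spark.executor.memory", "4g")  // explicit suffix: unambiguous

println(conf.getSizeAsMb("spark.driver.memory"))   // 2048
println(conf.getSizeAsMb("spark.executor.memory")) // 4096
```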
1 parent bd0a162 commit bfdbdd3

File tree

3 files changed: 85 additions & 68 deletions


core/src/main/scala/org/apache/spark/SparkConf.scala

Lines changed: 3 additions & 3 deletions
@@ -640,9 +640,9 @@ private[spark] object SparkConf extends Logging {
        translation = s => s"${s.toLong * 10}s")),
    "spark.reducer.maxSizeInFlight" -> Seq(
      AlternateConfig("spark.reducer.maxMbInFlight", "1.4")),
-    "spark.kryoserializer.buffer" ->
-      Seq(AlternateConfig("spark.kryoserializer.buffer.mb", "1.4",
-        translation = s => s"${(s.toDouble * 1000).toInt}k")),
+    "spark.kryoserializer.buffer" -> Seq(
+      AlternateConfig("spark.kryoserializer.buffer.mb", "1.4",
+        translation = s => s"${(s.toDouble * 1000).toInt}k")),
    "spark.kryoserializer.buffer.max" -> Seq(
      AlternateConfig("spark.kryoserializer.buffer.max.mb", "1.4")),
    "spark.shuffle.file.buffer" -> Seq(

core/src/main/scala/org/apache/spark/internal/config/package.scala

Lines changed: 27 additions & 20 deletions
@@ -38,10 +38,13 @@ package object config {
    ConfigBuilder("spark.driver.userClassPathFirst").booleanConf.createWithDefault(false)

  private[spark] val DRIVER_MEMORY = ConfigBuilder("spark.driver.memory")
+    .doc("Amount of memory to use for the driver process, in MiB unless otherwise specified.")
    .bytesConf(ByteUnit.MiB)
    .createWithDefaultString("1g")

  private[spark] val DRIVER_MEMORY_OVERHEAD = ConfigBuilder("spark.driver.memoryOverhead")
+    .doc("The amount of off-heap memory to be allocated per driver in cluster mode, " +
+      "in MiB unless otherwise specified.")
    .bytesConf(ByteUnit.MiB)
    .createOptional

@@ -62,6 +65,7 @@ package object config {
    .createWithDefault(false)

  private[spark] val EVENT_LOG_OUTPUT_BUFFER_SIZE = ConfigBuilder("spark.eventLog.buffer.kb")
+    .doc("Buffer size to use when writing to output streams, in KiB unless otherwise specified.")
    .bytesConf(ByteUnit.KiB)
    .createWithDefaultString("100k")

@@ -81,10 +85,13 @@ package object config {
    ConfigBuilder("spark.executor.userClassPathFirst").booleanConf.createWithDefault(false)

  private[spark] val EXECUTOR_MEMORY = ConfigBuilder("spark.executor.memory")
+    .doc("Amount of memory to use per executor process, in MiB unless otherwise specified.")
    .bytesConf(ByteUnit.MiB)
    .createWithDefaultString("1g")

  private[spark] val EXECUTOR_MEMORY_OVERHEAD = ConfigBuilder("spark.executor.memoryOverhead")
+    .doc("The amount of off-heap memory to be allocated per executor in cluster mode, " +
+      "in MiB unless otherwise specified.")
    .bytesConf(ByteUnit.MiB)
    .createOptional

@@ -353,7 +360,7 @@ package object config {
  private[spark] val BUFFER_WRITE_CHUNK_SIZE =
    ConfigBuilder("spark.buffer.write.chunkSize")
      .internal()
-      .doc("The chunk size during writing out the bytes of ChunkedByteBuffer.")
+      .doc("The chunk size in bytes during writing out the bytes of ChunkedByteBuffer.")
      .bytesConf(ByteUnit.BYTE)
      .checkValue(_ <= Int.MaxValue, "The chunk size during writing out the bytes of" +
        " ChunkedByteBuffer should not larger than Int.MaxValue.")
@@ -368,9 +375,9 @@ package object config {

  private[spark] val SHUFFLE_ACCURATE_BLOCK_THRESHOLD =
    ConfigBuilder("spark.shuffle.accurateBlockThreshold")
-      .doc("When we compress the size of shuffle blocks in HighlyCompressedMapStatus, we will " +
-        "record the size accurately if it's above this config. This helps to prevent OOM by " +
-        "avoiding underestimating shuffle block size when fetch shuffle blocks.")
+      .doc("Threshold in bytes above which the size of shuffle blocks in " +
+        "HighlyCompressedMapStatus is accurately recorded. This helps to prevent OOM " +
+        "by avoiding underestimating shuffle block size when fetch shuffle blocks.")
      .bytesConf(ByteUnit.BYTE)
      .createWithDefault(100 * 1024 * 1024)

@@ -389,23 +396,23 @@ package object config {

  private[spark] val REDUCER_MAX_BLOCKS_IN_FLIGHT_PER_ADDRESS =
    ConfigBuilder("spark.reducer.maxBlocksInFlightPerAddress")
-      .doc("This configuration limits the number of remote blocks being fetched per reduce task" +
-        " from a given host port. When a large number of blocks are being requested from a given" +
-        " address in a single fetch or simultaneously, this could crash the serving executor or" +
-        " Node Manager. This is especially useful to reduce the load on the Node Manager when" +
-        " external shuffle is enabled. You can mitigate the issue by setting it to a lower value.")
+      .doc("This configuration limits the number of remote blocks being fetched per reduce task " +
+        "from a given host port. When a large number of blocks are being requested from a given " +
+        "address in a single fetch or simultaneously, this could crash the serving executor or " +
+        "Node Manager. This is especially useful to reduce the load on the Node Manager when " +
+        "external shuffle is enabled. You can mitigate the issue by setting it to a lower value.")
      .intConf
      .checkValue(_ > 0, "The max no. of blocks in flight cannot be non-positive.")
      .createWithDefault(Int.MaxValue)

  private[spark] val MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM =
    ConfigBuilder("spark.maxRemoteBlockSizeFetchToMem")
-      .doc("Remote block will be fetched to disk when size of the block is " +
-        "above this threshold. This is to avoid a giant request takes too much memory. We can " +
-        "enable this config by setting a specific value(e.g. 200m). Note this configuration will " +
-        "affect both shuffle fetch and block manager remote block fetch. For users who " +
-        "enabled external shuffle service, this feature can only be worked when external shuffle" +
-        " service is newer than Spark 2.2.")
+      .doc("Remote block will be fetched to disk when size of the block is above this threshold " +
+        "in bytes. This is to avoid a giant request takes too much memory. We can enable this " +
+        "config by setting a specific value(e.g. 200m). Note this configuration will affect " +
+        "both shuffle fetch and block manager remote block fetch. For users who enabled " +
+        "external shuffle service, this feature can only be worked when external shuffle" +
+        "service is newer than Spark 2.2.")
      .bytesConf(ByteUnit.BYTE)
      .createWithDefault(Long.MaxValue)

@@ -419,9 +426,9 @@ package object config {

  private[spark] val SHUFFLE_FILE_BUFFER_SIZE =
    ConfigBuilder("spark.shuffle.file.buffer")
-      .doc("Size of the in-memory buffer for each shuffle file output stream. " +
-        "These buffers reduce the number of disk seeks and system calls made " +
-        "in creating intermediate shuffle files.")
+      .doc("Size of the in-memory buffer for each shuffle file output stream, in KiB unless " +
+        "otherwise specified. These buffers reduce the number of disk seeks and system calls " +
+        "made in creating intermediate shuffle files.")
      .bytesConf(ByteUnit.KiB)
      .checkValue(v => v > 0 && v <= Int.MaxValue / 1024,
        s"The file buffer size must be greater than 0 and less than ${Int.MaxValue / 1024}.")
@@ -430,15 +437,15 @@ package object config {
  private[spark] val SHUFFLE_UNSAFE_FILE_OUTPUT_BUFFER_SIZE =
    ConfigBuilder("spark.shuffle.unsafe.file.output.buffer")
      .doc("The file system for this buffer size after each partition " +
-        "is written in unsafe shuffle writer.")
+        "is written in unsafe shuffle writer. In KiB unless otherwise specified.")
      .bytesConf(ByteUnit.KiB)
      .checkValue(v => v > 0 && v <= Int.MaxValue / 1024,
        s"The buffer size must be greater than 0 and less than ${Int.MaxValue / 1024}.")
      .createWithDefaultString("32k")

  private[spark] val SHUFFLE_DISK_WRITE_BUFFER_SIZE =
    ConfigBuilder("spark.shuffle.spill.diskWriteBufferSize")
-      .doc("The buffer size to use when writing the sorted records to an on-disk file.")
+      .doc("The buffer size, in bytes, to use when writing the sorted records to an on-disk file.")
      .bytesConf(ByteUnit.BYTE)
      .checkValue(v => v > 0 && v <= Int.MaxValue,
        s"The buffer size must be greater than 0 and less than ${Int.MaxValue}.")

0 commit comments
