|
17 | 17 |
|
18 | 18 | package org.apache.spark.input |
19 | 19 |
|
20 | | -import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} |
| 20 | +import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream, IOException} |
| 21 | +import java.util |
21 | 22 |
|
22 | 23 | import scala.collection.JavaConverters._ |
23 | 24 |
|
| 25 | +import com.google.common.collect.Lists |
24 | 26 | import com.google.common.io.{ByteStreams, Closeables} |
25 | 27 | import org.apache.hadoop.conf.Configuration |
26 | | -import org.apache.hadoop.fs.Path |
| 28 | +import org.apache.hadoop.fs.{FileStatus, Path, PathFilter} |
| 29 | +import org.apache.hadoop.mapred.LocatedFileStatusFetcher |
27 | 30 | import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext} |
28 | | -import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit} |
| 31 | +import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit, FileInputFormat} |
29 | 32 |
|
30 | 33 | import org.apache.spark.SparkContext |
31 | 34 | import org.apache.spark.annotation.Since |
32 | | -import org.apache.spark.internal.config |
| 35 | +import org.apache.spark.internal.{config, Logging} |
33 | 36 |
|
34 | 37 | /** |
35 | 38 | * A general format for reading whole files in as streams, byte arrays, |
36 | 39 | * or other functions to be added |
37 | 40 | */ |
38 | 41 | private[spark] abstract class StreamFileInputFormat[T] |
39 | | - extends CombineFileInputFormat[String, T] |
| 42 | + extends CombineFileInputFormat[String, T] with Logging |
40 | 43 | { |
41 | 44 | override protected def isSplitable(context: JobContext, file: Path): Boolean = false |
42 | 45 |
|
@@ -69,6 +72,35 @@ private[spark] abstract class StreamFileInputFormat[T] |
69 | 72 |
|
70 | 73 | def createRecordReader(split: InputSplit, taContext: TaskAttemptContext): RecordReader[String, T] |
71 | 74 |
|
| 75 | + override protected def listStatus(job: JobContext): util.List[FileStatus] = { |
| 76 | + val dirs: Array[Path] = FileInputFormat.getInputPaths(job) |
| 77 | + if (dirs.length == 0) throw new IOException("No input paths specified in job") |
| 78 | + // Whether we need to recursively look into the directory structure |
| 79 | + val recursive = FileInputFormat.getInputDirRecursive(job) |
| 80 | + val jobFilter: PathFilter = FileInputFormat.getInputPathFilter(job) |
| 81 | + val inputFilter = new PathFilter { |
| 82 | + override def accept(path: Path): Boolean = { |
| 83 | + val name = path.getName |
| 84 | + val hidden = name.startsWith("_") || name.startsWith(".") |
| 85 | + !hidden && ( |
| 86 | + if (jobFilter != null) { |
| 87 | + jobFilter.accept(path) |
| 88 | + } else { |
| 89 | + true |
| 90 | + }) |
| 91 | + } |
| 92 | + } |
| 93 | + |
| 94 | + var result: util.List[FileStatus] = null |
| 95 | + |
| 96 | + val locatedFileStatusFetcher = |
| 97 | + new LocatedFileStatusFetcher(job.getConfiguration, dirs, recursive, inputFilter, true) |
| 98 | + val locatedFiles = locatedFileStatusFetcher.getFileStatuses() |
| 99 | + result = Lists.newArrayList(locatedFiles) |
| 100 | + |
| 101 | + logInfo("Total input paths to process : " + result.size) |
| 102 | + result |
| 103 | + } |
72 | 104 | } |
73 | 105 |
|
74 | 106 | /** |
|
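For context, a minimal usage sketch (the input path and partition count below are placeholders, and an already-created SparkContext `sc` is assumed): SparkContext.binaryFiles is implemented on top of a StreamFileInputFormat subclass, so the driver-side listing of its input paths goes through the listStatus override added above, which delegates to Hadoop's LocatedFileStatusFetcher instead of scanning each input directory sequentially.

// Read a directory of many small binary files; the input listing happens on the
// driver via StreamFileInputFormat.listStatus (the path below is hypothetical).
val files = sc.binaryFiles("hdfs:///data/small-files", minPartitions = 32)
files.map { case (path, stream) => (path, stream.toArray().length) }
  .take(5)
  .foreach { case (path, numBytes) => println(s"$path: $numBytes bytes") }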