|
17 | 17 |
|
18 | 18 | package org.apache.spark.input |
19 | 19 |
|
20 | | -import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} |
| 20 | +import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream, IOException} |
| 21 | +import java.util |
21 | 22 |
|
22 | 23 | import scala.collection.JavaConverters._ |
23 | 24 |
|
| 25 | +import com.google.common.collect.Lists |
24 | 26 | import com.google.common.io.{ByteStreams, Closeables} |
25 | 27 | import org.apache.hadoop.conf.Configuration |
26 | | -import org.apache.hadoop.fs.Path |
| 28 | +import org.apache.hadoop.fs.{FileStatus, Path, PathFilter} |
| 29 | +import org.apache.hadoop.mapred.LocatedFileStatusFetcher |
27 | 30 | import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext} |
28 | | -import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit} |
| 31 | +import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit, FileInputFormat} |
29 | 32 |
|
30 | 33 | import org.apache.spark.SparkContext |
31 | 34 | import org.apache.spark.annotation.Since |
32 | | -import org.apache.spark.internal.config |
| 35 | +import org.apache.spark.internal.{config, Logging} |
33 | 36 |
|
34 | 37 | /** |
35 | 38 | * A general format for reading whole files in as streams, byte arrays, |
36 | 39 | * or other functions to be added |
37 | 40 | */ |
38 | 41 | private[spark] abstract class StreamFileInputFormat[T] |
39 | | - extends CombineFileInputFormat[String, T] |
| 42 | + extends CombineFileInputFormat[String, T] with Logging |
40 | 43 | { |
41 | 44 | override protected def isSplitable(context: JobContext, file: Path): Boolean = false |
42 | 45 |
|
@@ -69,6 +72,35 @@ private[spark] abstract class StreamFileInputFormat[T] |
69 | 72 |
|
70 | 73 | def createRecordReader(split: InputSplit, taContext: TaskAttemptContext): RecordReader[String, T] |
71 | 74 |
|
| 75 | + override protected def listStatus(job: JobContext): util.List[FileStatus] = { |
| 76 | + val dirs: Array[Path] = FileInputFormat.getInputPaths(job) |
| 77 | + if (dirs.length == 0) throw new IOException("No input paths specified in job") |
| 78 | + // Whether we need to recursively look into the directory structure |
| 79 | + val recursive = FileInputFormat.getInputDirRecursive(job) |
| 80 | + val jobFilter: PathFilter = FileInputFormat.getInputPathFilter(job) |
| 81 | + val inputFilter = new PathFilter { |
| 82 | + override def accept(path: Path): Boolean = { |
| 83 | + val name = path.getName |
| 84 | + val hidden = name.startsWith("_") || name.startsWith(".") |
| 85 | + !hidden && ( |
| 86 | + if (jobFilter != null) { |
| 87 | + jobFilter.accept(path) |
| 88 | + } else { |
| 89 | + true |
| 90 | + }) |
| 91 | + } |
| 92 | + } |
| 93 | + |
| 94 | + var result: util.List[FileStatus] = null |
| 95 | + |
| 96 | + val locatedFileStatusFetcher = |
| 97 | + new LocatedFileStatusFetcher(job.getConfiguration, dirs, recursive, inputFilter, true) |
| 98 | + val locatedFiles = locatedFileStatusFetcher.getFileStatuses() |
| 99 | + result = Lists.newArrayList(locatedFiles) |
| 100 | + |
| 101 | + logInfo("Total input paths to process : " + result.size) |
| 102 | + result |
| 103 | + } |
72 | 104 | } |
73 | 105 |
|
74 | 106 | /** |
|
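For context, a minimal usage sketch (the input path and partition count below are placeholders, and an already-created SparkContext `sc` is assumed): SparkContext.binaryFiles is implemented on top of a StreamFileInputFormat subclass, so the driver-side listing of its input paths goes through the listStatus override added above, which delegates to Hadoop's LocatedFileStatusFetcher instead of scanning each input directory sequentially.

// Read a directory of many small binary files; the input listing happens on the
// driver via StreamFileInputFormat.listStatus (the path below is hypothetical).
val files = sc.binaryFiles("hdfs:///data/small-files", minPartitions = 32)
files.map { case (path, stream) => (path, stream.toArray().length) }
  .take(5)
  .foreach { case (path, numBytes) => println(s"$path: $numBytes bytes") }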