
Commit 684e087

Author: Venkata krishnan Sowrirajan (committed)
1. Use the spark.lambda.s3.bucket setting to pick up the Spark binaries from S3.
2. Moved AWSLambdaClient creation to the top and reuse the same client everywhere.
3. Pass the Lambda function's memory size as the executor JVM -Xmx.
4. Cleaned up Qubole occurrences and other redundant code.
1 parent 03f93e6 commit 684e087
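In short, this commit makes the Lambda scheduler backend read the S3 bucket holding the Spark/Hadoop/Hive binaries from spark.lambda.s3.bucket instead of a hard-coded bucket, builds a single AWSLambdaClient up front and reuses it for every call, and sizes the executor JVM heap from the Lambda function's configured memory. A condensed, hedged sketch of that pattern (identifiers mirror the LambdaSchedulerBackend diff below; the standalone object wrapper and the printed output are only for illustration):

import com.amazonaws.ClientConfiguration
import com.amazonaws.services.lambda.AWSLambdaClientBuilder
import com.amazonaws.services.lambda.model.GetFunctionConfigurationRequest
import org.apache.spark.SparkConf

object LambdaBackendSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    val lambdaFunctionName = conf.get("spark.lambda.function.name", "get_spark_from_s3")

    // 1. Spark binaries are picked from a user-supplied bucket; fail fast if it is not set.
    val lambdaBucket = Option(conf.get("spark.lambda.s3.bucket", null))
    if (!lambdaBucket.isDefined) {
      throw new Exception("spark.lambda.s3.bucket should have a valid S3 bucket name having Spark binaries")
    }

    // 2. Build the AWSLambdaClient once and reuse it for every Lambda API call.
    val clientConfig = new ClientConfiguration()
    val lambdaClient = AWSLambdaClientBuilder
      .standard()
      .withClientConfiguration(clientConfig)
      .build()

    // 3. Derive the executor -Xmx from the function's memory size (in MB), instead of a fixed 1400m.
    val request = new GetFunctionConfigurationRequest()
    request.setFunctionName(lambdaFunctionName)
    val lambdaContainerMemory = lambdaClient.getFunctionConfiguration(request).getMemorySize
    println(s"executor JVM heap flag: -Xmx${lambdaContainerMemory}m")
  }
}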

File tree

10 files changed, +66 −149 lines changed


common/network-common/src/main/java/org/apache/spark/network/util/JavaUtils.java

Lines changed: 0 additions & 8 deletions
@@ -345,12 +345,4 @@ public static byte[] bufferToArray(ByteBuffer buffer) {
     }
   }
 
-  public static Path localFileToS3(String s3PrefixLocation, File path) throws IOException {
-    return new Path(s3PrefixLocation, path.getCanonicalPath());
-  }
-
-  public static File s3ToLocalFile(String s3PrefixLocation, Path path) {
-    return new File(path.toString().replace(s3PrefixLocation, ""));
-  }
-
 }

core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java

Lines changed: 4 additions & 4 deletions
@@ -119,7 +119,7 @@ final class BypassMergeSortShuffleWriter<K, V> extends ShuffleWriter<K, V> {
 
   private SparkConf conf;
   private boolean shuffleOverS3 = false;
-  private String s3PrefixLocation = "";
+  private String shuffleS3Bucket = "";
 
   private Configuration hadoopConf;
   private FileSystem hadoopFileSystem;
@@ -144,8 +144,8 @@ final class BypassMergeSortShuffleWriter<K, V> extends ShuffleWriter<K, V> {
     this.serializer = dep.serializer();
     this.shuffleBlockResolver = shuffleBlockResolver;
     this.conf = conf;
-    this.shuffleOverS3 = conf.getBoolean("spark.shuffle.s3.enabled", shuffleOverS3);
-    this.s3PrefixLocation = conf.get("spark.qubole.s3PrefixLocation", "s3://dev.canopydata.com/vsowrira/");
+    this.shuffleOverS3 = blockManager.shuffleOverS3Enabled();
+    this.shuffleS3Bucket = BlockManager.getS3Bucket(conf);
     this.hadoopConf = BlockManager.getHadoopConf(conf);
     this.hadoopFileSystem = BlockManager.getHadoopFileSystem(conf);
   }
@@ -219,7 +219,7 @@ long[] getPartitionLengths() {
   */
  private long[] writePartitionedFile(File outputFile) throws IOException {
    if(shuffleOverS3) {
-      Path outputPath = Utils.localFileToS3(s3PrefixLocation, outputFile);
+      Path outputPath = Utils.localFileToS3(shuffleS3Bucket, outputFile);
      return writePartitionedFileToS3(outputPath);
    }
 
core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java

Lines changed: 2 additions & 2 deletions
@@ -273,7 +273,7 @@ private long[] mergeSpills(SpillInfo[] spills, File outputFile) throws IOException {
 
     if (shuffleOverS3Enabled) {
       Path outputPath = Utils.localFileToS3(
-        blockManager.getS3PrefixLocation(sparkConf), outputFile);
+        blockManager.getS3Bucket(sparkConf), outputFile);
       FileSystem fileSystem = outputPath.getFileSystem(
         BlockManager.getHadoopConf(sparkConf));
       FSDataOutputStream outputStream;
@@ -379,7 +379,7 @@ private long[] mergeSpillsWithFileStream(
 
     if (blockManager.shuffleOverS3Enabled()) {
       Path outputPath = Utils.localFileToS3(
-        blockManager.getS3PrefixLocation(sparkConf), outputFile);
+        blockManager.getS3Bucket(sparkConf), outputFile);
       FileSystem fileSystem = outputPath.getFileSystem(
         BlockManager.getHadoopConf(sparkConf));
       FSDataOutputStream outputStream = fileSystem.create(outputPath);

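Both shuffle writers above share the same write path: map the local shuffle output file to an S3 location and stream it out through the Hadoop FileSystem API. A minimal standalone sketch of that pattern (bucket name and local path are made up; the Path construction mirrors the JavaUtils helper removed earlier in this commit, and is an assumption about what the Scala-side Utils.localFileToS3 does):

import java.io.File
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object S3ShuffleWriteSketch {
  def main(args: Array[String]): Unit = {
    val shuffleS3Bucket = "s3://my-spark-lambda-bucket"            // hypothetical bucket
    val outputFile = new File("/tmp/blockmgr/shuffle_0_0_0.data")  // hypothetical local shuffle file

    // Equivalent of Utils.localFileToS3: prefix the canonical local path with the S3 location.
    val outputPath = new Path(shuffleS3Bucket, outputFile.getCanonicalPath)

    // Resolve the FileSystem for that path and write through it, as the writers above do.
    val fileSystem = outputPath.getFileSystem(new Configuration())
    val outputStream = fileSystem.create(outputPath)
    try {
      // ... copy partition bytes into outputStream ...
    } finally {
      outputStream.close()
    }
  }
}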
core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala

Lines changed: 3 additions & 3 deletions
@@ -68,9 +68,9 @@ private[spark] class CoarseGrainedExecutorBackend(
       exitExecutor(1, s"Cannot register with driver: $driverUrl", e, notifyDriver = false)
     }(ThreadUtils.sameThread)
 
-    val requestId = env.conf.get("spark.qubole.lambda.awsRequestId")
-    val logGroupName = env.conf.get("spark.qubole.lambda.logGroupName")
-    val logStreamName = env.conf.get("spark.qubole.lambda.logStreamName")
+    val requestId = env.conf.get("spark.lambda.awsRequestId")
+    val logGroupName = env.conf.get("spark.lambda.logGroupName")
+    val logStreamName = env.conf.get("spark.lambda.logStreamName")
     rpcEnv.asyncSetupEndpointRefByURI(driverUrl).flatMap { ref =>
       // This is a very fast action so we can use "ThreadUtils.sameThread"
       driver = Some(ref)

core/src/main/scala/org/apache/spark/scheduler/cluster/LambdaSchedulerBackend.scala

Lines changed: 30 additions & 106 deletions
@@ -20,6 +20,7 @@ package org.apache.spark.scheduler.cluster
 import java.util.concurrent.atomic.AtomicInteger
 
 import com.amazonaws.ClientConfiguration
+import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials}
 import com.amazonaws.services.lambda.AWSLambdaClientBuilder
 import com.amazonaws.services.lambda.invoke.{LambdaFunction, LambdaInvokerFactory}
 import com.amazonaws.services.lambda.model.InvokeRequest
@@ -126,8 +127,8 @@ private[spark] class LambdaSchedulerBackend(
   extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv)
   with Logging {
 
-  val lambdaFunctionName = sc.conf.get("spark.qubole.lambda.function.name", "get_spark_from_s3")
-  val s3SparkVersion = sc.conf.get("spark.qubole.lambda.spark.software.version", "LATEST")
+  val lambdaFunctionName = sc.conf.get("spark.lambda.function.name", "get_spark_from_s3")
+  val s3SparkVersion = sc.conf.get("spark.lambda.spark.software.version", "LATEST")
   var numExecutorsExpected = 0
   var numExecutorsRegistered = new AtomicInteger(0)
   var executorId = new AtomicInteger(0)
@@ -137,7 +138,7 @@ private[spark] class LambdaSchedulerBackend(
   // Set of executorIds which are currently alive
   val liveExecutors = new HashSet[String]
 
-  var lambdaContainerMemoryBytes: Int = 0
+  var lambdaContainerMemory: Int = 0
   var lambdaContainerTimeoutSecs: Int = 0
 
   val clientConfig = new ClientConfiguration()
@@ -146,22 +147,32 @@ private[spark] class LambdaSchedulerBackend(
   clientConfig.setRequestTimeout(345680)
   clientConfig.setSocketTimeout(345681)
 
+  val lambdaBucket = Option(sc.getConf.get("spark.lambda.s3.bucket"))
+
+  if (!lambdaBucket.isDefined) {
+    throw new Exception(s"spark.lambda.s3.bucket should" +
+      s" have a valid S3 bucket name having Spark binaries")
+  }
+
+  val lambdaClient = AWSLambdaClientBuilder
+    .standard()
+    .withClientConfiguration(clientConfig)
+    .build()
+
   final val lambdaExecutorService: LambdaExecutorService =
     LambdaInvokerFactory.builder()
-      .lambdaClient(AWSLambdaClientBuilder.standard().withClientConfiguration(clientConfig).build())
+      .lambdaClient(lambdaClient)
       .build(classOf[LambdaExecutorService])
   logInfo(s"Created LambdaExecutorService: $lambdaExecutorService")
 
-  val maxConcurrentRequests = sc.conf.getInt("spark.qubole.lambda.concurrent.requests.max", 100)
+  val maxConcurrentRequests = sc.conf.getInt("spark.lambda.concurrent.requests.max", 100)
   val limiter = RateLimiter.create(maxConcurrentRequests)
 
   override def start() {
     super.start()
     logInfo("start")
     numExecutorsExpected = getInitialTargetExecutorNumber(conf)
 
-    val lambdaClient = AWSLambdaClientBuilder.defaultClient()
-
     val request = new com.amazonaws.services.lambda.model.GetFunctionRequest
     request.setFunctionName(lambdaFunctionName)
     val result = lambdaClient.getFunction(request)
@@ -170,7 +181,7 @@ private[spark] class LambdaSchedulerBackend(
     val request2 = new com.amazonaws.services.lambda.model.GetFunctionConfigurationRequest
     request2.setFunctionName(lambdaFunctionName)
     val result2 = lambdaClient.getFunctionConfiguration(request2)
-    lambdaContainerMemoryBytes = result2.getMemorySize * 1024 * 1024
+    lambdaContainerMemory = result2.getMemorySize
     lambdaContainerTimeoutSecs = result2.getTimeout
     logDebug(s"LAMBDA: 16001: Function configuration: ${result2.toString}")
 
@@ -191,97 +202,11 @@ private[spark] class LambdaSchedulerBackend(
 
   override def applicationId(): String = {
     val appId = super.applicationId()
-    logInfo(s"applicationId: $appId")
+    logDebug(s"applicationId: $appId")
     return appId
   }
 
   private def launchExecutorsOnLambda(newExecutorsNeeded: Int) : Future[Boolean] = {
-    Future {
-      // TODO: Can we launch in parallel?
-      // TODO: Can we track each thread separately and audit
-      (1 to newExecutorsNeeded).foreach { x =>
-        val request = new Request
-        request.setSparkS3Bucket("bharatb")
-        request.setSparkS3Key(s"lambda/spark-small-${s3SparkVersion}.zip")
-        request.setHadoop2S3Bucket("bharatb")
-        request.setHadoop2S3Key(s"lambda/hadoop2-small-${s3SparkVersion}.zip")
-        request.setHive12S3Bucket("bharatb")
-        request.setHive12S3Key(s"lambda/hive1.2-small-${s3SparkVersion}.zip")
-        val hostname = sc.env.rpcEnv.address.host
-        val port = sc.env.rpcEnv.address.port.toString
-        request.setSparkDriverHostname(hostname)
-        request.setSparkDriverPort(port)
-
-        val classpathSeq = Seq("spark/assembly/target/scala-2.11/jars/*",
-          "spark/conf",
-          "hadoop2/share/hadoop/*",
-          "hadoop2/share/hadoop/common/lib/*",
-          "hadoop2/share/hadoop/common/*",
-          "hadoop2/share/hadoop/hdfs",
-          "hadoop2/share/hadoop/hdfs/lib/*",
-          "hadoop2/share/hadoop/hdfs/*",
-          "hadoop2/share/hadoop/yarn/lib/*",
-          "hadoop2/share/hadoop/yarn/*",
-          "hadoop2/share/hadoop/mapreduce/*",
-          "hadoop2/share/hadoop/tools/lib/*",
-          "hadoop2/share/hadoop/tools/*",
-          "hadoop2/share/hadoop/qubole/lib/*",
-          "hadoop2/share/hadoop/qubole/*",
-          "hadoop2/etc/hadoop/*",
-          "hive1.2/lib/*"
-        )
-        val classpaths = classpathSeq.map(x => s"/tmp/lambda/$x").mkString(":")
-        val currentExecutorId = executorId.addAndGet(1)
-        val containerId = applicationId() + "_%08d".format(currentExecutorId)
-        request.setSparkCommandLine(
-          s"java -cp ${classpaths} " +
-            "-server -Xmx1400m " +
-            "-Djava.net.preferIPv4Stack=true " +
-            s"-Dspark.driver.port=${port} " +
-            // "-Dspark.blockManager.port=12345 " +
-            "-Dspark.dynamicAllocation.enabled=true " +
-            "-Dspark.shuffle.service.enabled=false " +
-            "org.apache.spark.executor.CoarseGrainedExecutorBackend " +
-            s"--driver-url spark://CoarseGrainedScheduler@${hostname}:${port} " +
-            s"--executor-id ${currentExecutorId} " +
-            "--hostname LAMBDA " +
-            "--cores 1 " +
-            s"--app-id ${applicationId()} " +
-            s"--container-id ${containerId} " +
-            s"--container-size ${lambdaContainerMemoryBytes} " +
-            "--user-class-path file:/tmp/lambda/* "
-        )
-
-        val lambdaRequesterThread = new Thread() {
-          override def run() {
-            val executorId = currentExecutorId.toString
-            logDebug(s"LAMBDA: 9002: Invoking lambda for $executorId: $request")
-            numLambdaCallsPending.addAndGet(1)
-            try {
-              val response = lambdaExecutorService.runExecutor(request)
-              logDebug(s"LAMBDA: 9003: Returned from lambda $executorId: $response")
-            } catch {
-              case t: Throwable => logError(s"Exception in Lambda invocation: $t")
-            } finally {
-              logDebug(s"LAMBDA: 9003: Returned from lambda $executorId: finally block")
-              numLambdaCallsPending.addAndGet(-1)
-              pendingLambdaRequests.remove(executorId)
-            }
-          }
-        }
-        lambdaRequesterThread.setDaemon(true)
-        lambdaRequesterThread.setName(s"Lambda Requester Thread for $currentExecutorId")
-        pendingLambdaRequests(currentExecutorId.toString) = lambdaRequesterThread
-        logDebug(s"LAMBDA: 9004: starting lambda requester thread for $currentExecutorId")
-        lambdaRequesterThread.start()
-
-        logDebug(s"LAMBDA: 9005: returning from launchExecutorsOnLambda for $currentExecutorId")
-      }
-      true // TODO: Return true/false properly
-    }
-  }
-
-  private def launchExecutorsOnLambda2(newExecutorsNeeded: Int) : Future[Boolean] = {
     Future {
       // TODO: Can we launch in parallel?
       // TODO: Can we track each thread separately and audit
@@ -311,29 +236,30 @@ private[spark] class LambdaSchedulerBackend(
         val containerId = applicationId() + "_%08d".format(currentExecutorId)
 
         val javaPartialCommandLine = s"java -cp ${classpaths} " +
-          "-server -Xmx1400m " +
+          s"-server -Xmx${lambdaContainerMemory}m " +
           "-Djava.net.preferIPv4Stack=true " +
           s"-Dspark.driver.port=${port} " +
-          // "-Dspark.blockManager.port=12345 " +
           "-Dspark.dynamicAllocation.enabled=true " +
           "-Dspark.shuffle.service.enabled=false "
+
         val executorPartialCommandLine = "org.apache.spark.executor.CoarseGrainedExecutorBackend " +
           s"--driver-url spark://CoarseGrainedScheduler@${hostname}:${port} " +
           s"--executor-id ${currentExecutorId} " +
           "--hostname LAMBDA " +
           "--cores 1 " +
           s"--app-id ${applicationId()} " +
           s"--container-id ${containerId} " +
-          s"--container-size ${lambdaContainerMemoryBytes} " +
+          s"--container-size ${lambdaContainerMemory} " +
           "--user-class-path file:/tmp/lambda/* "
+
         val commandLine = javaPartialCommandLine + executorPartialCommandLine
 
         val request = new LambdaRequestPayload(
-          sparkS3Bucket = "bharatb",
+          sparkS3Bucket = lambdaBucket.get,
           sparkS3Key = s"lambda/spark-small-${s3SparkVersion}.zip",
-          hadoop2S3Bucket = "bharatb",
+          hadoop2S3Bucket = lambdaBucket.get,
           hadoop2S3Key = s"lambda/hadoop2-small-${s3SparkVersion}.zip",
-          hive12S3Bucket = "bharatb",
+          hive12S3Bucket = lambdaBucket.get,
           hive12S3Key = s"lambda/hive1.2-small-${s3SparkVersion}.zip",
           sparkDriverHostname = hostname,
           sparkDriverPort = port,
@@ -349,9 +275,7 @@ private[spark] class LambdaSchedulerBackend(
           limiter.acquire()
           logDebug(s"LAMBDA: 9050.1: LambdaRequesterThread started $executorId")
           numLambdaCallsPending.addAndGet(1)
-          // TODO: Can we reuse the same client across calls?
-          val lambdaClient = AWSLambdaClientBuilder.standard()
-            .withClientConfiguration(clientConfig).build()
+
           val invokeRequest = new InvokeRequest
           try {
             invokeRequest.setFunctionName(lambdaFunctionName)
@@ -396,11 +320,11 @@ private[spark] class LambdaSchedulerBackend(
     if (newExecutorsNeeded <= 0) {
       return Future { true }
     }
-    return launchExecutorsOnLambda2(newExecutorsNeeded)
+    return launchExecutorsOnLambda(newExecutorsNeeded)
   }
 
   override def doKillExecutors(executorIds: Seq[String]): Future[Boolean] = {
-    // TODO: Fill this function
+    // TODO: Right now not implemented
     logDebug(s"LAMBDA: 10200: doKillExecutors: $executorIds")
     Future {
       executorIds.foreach { x =>

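With the client now created once as a field, each requester thread's per-executor invocation reduces to building an InvokeRequest and calling invoke on the shared lambdaClient. A rough fragment of that flow, assuming the surrounding LambdaSchedulerBackend members (lambdaClient, lambdaFunctionName, limiter, numLambdaCallsPending) and a requestJson string holding the serialized LambdaRequestPayload, which this diff does not show:

// Fragment only; runs inside the lambda requester thread.
limiter.acquire()
numLambdaCallsPending.addAndGet(1)
val invokeRequest = new InvokeRequest
invokeRequest.setFunctionName(lambdaFunctionName)
invokeRequest.setPayload(requestJson) // hypothetical JSON form of LambdaRequestPayload
try {
  val invokeResult = lambdaClient.invoke(invokeRequest)
  logDebug(s"Lambda invoke returned status ${invokeResult.getStatusCode}")
} finally {
  numLambdaCallsPending.addAndGet(-1)
}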
core/src/main/scala/org/apache/spark/shuffle/S3ShuffleBlockResolver.scala

Lines changed: 10 additions & 12 deletions
@@ -40,12 +40,10 @@ private[spark] class S3ShuffleBlockResolver(
     _blockManager: BlockManager = null)
   extends IndexShuffleBlockResolver(conf, _blockManager = null)
   with Logging {
-  val shuffleOverS3 = conf.getBoolean("spark.shuffle.s3.enabled", false)
-  val s3PrefixLocation = conf.get("spark.qubole.s3PrefixLocation",
-    "s3://dev.canopydata.com/vsowrira/")
-
   private lazy val blockManager = Option(_blockManager).getOrElse(SparkEnv.get.blockManager)
 
+  val shuffleS3Bucket = BlockManager.getS3Bucket(conf)
+
   private lazy val hadoopConf = BlockManager.getHadoopConf(conf)
 
   private lazy val hadoopFileSystem = BlockManager.getHadoopFileSystem(conf)
@@ -84,7 +82,7 @@ private[spark] class S3ShuffleBlockResolver(
    * If so, return the partition lengths in the data file. Otherwise return null.
    */
   private def checkIndexAndDataFile(index: File, data: File, blocks: Int): Array[Long] = {
-    val indexFilePath = Utils.localFileToS3(s3PrefixLocation, index)
+    val indexFilePath = Utils.localFileToS3(shuffleS3Bucket, index)
     try {
       if (hadoopFileSystem.getFileStatus(indexFilePath).getLen != (blocks + 1) * 8) {
         return null
@@ -124,7 +122,7 @@ private[spark] class S3ShuffleBlockResolver(
       in.close()
     }
 
-    val dataPath = Utils.localFileToS3(s3PrefixLocation, data)
+    val dataPath = Utils.localFileToS3(shuffleS3Bucket, data)
 
     if (hadoopFileSystem.getFileStatus(dataPath).getLen == lengths.sum) {
       logInfo(s"${dataPath} lengths (${lengths.sum}) match with index file length")
@@ -153,8 +151,8 @@ private[spark] class S3ShuffleBlockResolver(
     val indexFile = getIndexFile(shuffleId, mapId)
     val indexTmp = Utils.tempFileWith(indexFile)
 
-    val indexFilePath = Utils.localFileToS3(s3PrefixLocation, indexFile)
-    val indexTmpPath = Utils.localFileToS3(s3PrefixLocation, indexTmp)
+    val indexFilePath = Utils.localFileToS3(shuffleS3Bucket, indexFile)
+    val indexTmpPath = Utils.localFileToS3(shuffleS3Bucket, indexTmp)
 
     try {
       val outputStream = hadoopFileSystem.create(indexTmpPath)
@@ -171,8 +169,8 @@ private[spark] class S3ShuffleBlockResolver(
     }
 
     val dataFile = getDataFile(shuffleId, mapId)
-    val dataFilePath = Utils.localFileToS3(s3PrefixLocation, dataFile)
-    val dataTmpPath = Utils.localFileToS3(s3PrefixLocation, dataTmp)
+    val dataFilePath = Utils.localFileToS3(shuffleS3Bucket, dataFile)
+    val dataTmpPath = Utils.localFileToS3(shuffleS3Bucket, dataTmp)
     // There is only one IndexShuffleBlockResolver per executor, this synchronization make sure
     // the following check and rename are atomic.
     synchronized {
@@ -238,15 +236,15 @@ private[spark] class S3ShuffleBlockResolver(
     }
 
     val indexFile = getFile(executorLocalDirs, subDirs, shuffleIndexFile)
-    val indexFilePath = Utils.localFileToS3(s3PrefixLocation, indexFile)
+    val indexFilePath = Utils.localFileToS3(shuffleS3Bucket, indexFile)
     val in = hadoopFileSystem.open(indexFilePath)
 
     try {
       ByteStreams.skipFully(in, reduceId * 8)
       val offset = in.readLong()
       val nextOffset = in.readLong()
       val dataFile = getFile(executorLocalDirs, subDirs, shuffleDataFile)
-      val dataFilePath = Utils.localFileToS3(s3PrefixLocation, dataFile)
+      val dataFilePath = Utils.localFileToS3(shuffleS3Bucket, dataFile)
 
       logDebug("S3Segment managed buffer created for path - " + dataFilePath.toString)
 
core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala

Lines changed: 2 additions & 1 deletion
@@ -22,6 +22,7 @@ import java.util.concurrent.ConcurrentHashMap
 import org.apache.spark._
 import org.apache.spark.internal.Logging
 import org.apache.spark.shuffle._
+import org.apache.spark.storage.BlockManager
 
 /**
  * In sort-based shuffle, incoming records are sorted according to their target partition ids, then
@@ -79,7 +80,7 @@ private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager
   */
  private[this] val numMapsForShuffle = new ConcurrentHashMap[Int, Int]()
 
-  val shuffleOverS3 = conf.getBoolean("spark.shuffle.s3.enabled", false)
+  val shuffleOverS3 = BlockManager.shuffleOverS3Enabled(conf)
 
  override val shuffleBlockResolver = if (shuffleOverS3) {
    new S3ShuffleBlockResolver(conf)

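Taken together, routing shuffle through S3 in this fork appears to hinge on the configuration keys that show up in this commit; presumably something like the following on the driver (values are placeholders, and whether every key must be set explicitly is an assumption):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.shuffle.s3.enabled", "true")                  // route shuffle files through S3ShuffleBlockResolver
  .set("spark.lambda.s3.bucket", "my-spark-lambda-bucket")  // hypothetical bucket holding the Spark binaries
  .set("spark.lambda.function.name", "get_spark_from_s3")   // default function name from the diff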