Skip to content

Commit e9e8bb3

Browse files
onursatici and dongjoon-hyun
authored and committed
[SPARK-27023][K8S] Make k8s client timeouts configurable
## What changes were proposed in this pull request? Make k8s client timeouts configurable. No test suite exists for the client factory class; happy to add one if needed. Closes #23928 from onursatici/os/k8s-client-timeouts. Lead-authored-by: Onur Satici <osatici@palantir.com> Co-authored-by: Onur Satici <onursatici@gmail.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
1 parent cb20fbc commit e9e8bb3

File tree

5 files changed

+74
-0
lines changed

5 files changed

+74
-0
lines changed

docs/running-on-kubernetes.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -990,6 +990,34 @@ See the [configuration page](configuration.html) for information on Spark config
990990
Specify whether executor pods should be deleted in case of failure or normal termination.
991991
</td>
992992
</tr>
993+
<tr>
994+
<td><code>spark.kubernetes.submission.connectionTimeout</code></td>
995+
<td>10000</td>
996+
<td>
997+
Connection timeout in milliseconds for the Kubernetes client to use when starting the driver.
998+
</td>
999+
</tr>
1000+
<tr>
1001+
<td><code>spark.kubernetes.submission.requestTimeout</code></td>
1002+
<td>10000</td>
1003+
<td>
1004+
Request timeout in milliseconds for the Kubernetes client to use when starting the driver.
1005+
</td>
1006+
</tr>
1007+
<tr>
1008+
<td><code>spark.kubernetes.driver.connectionTimeout</code></td>
1009+
<td>10000</td>
1010+
<td>
1011+
Connection timeout in milliseconds for the Kubernetes client in the driver to use when requesting executors.
1012+
</td>
1013+
</tr>
1014+
<tr>
1015+
<td><code>spark.kubernetes.driver.requestTimeout</code></td>
1016+
<td>10000</td>
1017+
<td>
1018+
Request timeout in milliseconds for the Kubernetes client in the driver to use when requesting executors.
1019+
</td>
1020+
</tr>
9931021
</table>
9941022

9951023
#### Pod template properties

resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,30 @@ private[spark] object Config extends Logging {
8686
val CLIENT_CERT_FILE_CONF_SUFFIX = "clientCertFile"
8787
val CA_CERT_FILE_CONF_SUFFIX = "caCertFile"
8888

89+
val SUBMISSION_CLIENT_REQUEST_TIMEOUT =
90+
ConfigBuilder("spark.kubernetes.submission.requestTimeout")
91+
.doc("request timeout to be used in milliseconds for starting the driver")
92+
.intConf
93+
.createWithDefault(10000)
94+
95+
val SUBMISSION_CLIENT_CONNECTION_TIMEOUT =
96+
ConfigBuilder("spark.kubernetes.submission.connectionTimeout")
97+
.doc("connection timeout to be used in milliseconds for starting the driver")
98+
.intConf
99+
.createWithDefault(10000)
100+
101+
val DRIVER_CLIENT_REQUEST_TIMEOUT =
102+
ConfigBuilder("spark.kubernetes.driver.requestTimeout")
103+
.doc("request timeout to be used in milliseconds for driver to request executors")
104+
.intConf
105+
.createWithDefault(10000)
106+
107+
val DRIVER_CLIENT_CONNECTION_TIMEOUT =
108+
ConfigBuilder("spark.kubernetes.driver.connectionTimeout")
109+
.doc("connection timeout to be used in milliseconds for driver to request executors")
110+
.intConf
111+
.createWithDefault(10000)
112+
89113
val KUBERNETES_SERVICE_ACCOUNT_NAME =
90114
ConfigBuilder(s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.serviceAccountName")
91115
.doc("Service account that is used when running the driver pod. The driver pod uses " +

resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import okhttp3.Dispatcher
2828
import org.apache.spark.SparkConf
2929
import org.apache.spark.deploy.k8s.Config._
3030
import org.apache.spark.internal.Logging
31+
import org.apache.spark.internal.config.ConfigEntry
3132
import org.apache.spark.util.ThreadUtils
3233

3334
/**
@@ -41,6 +42,7 @@ private[spark] object SparkKubernetesClientFactory extends Logging {
4142
master: String,
4243
namespace: Option[String],
4344
kubernetesAuthConfPrefix: String,
45+
clientType: ClientType.Value,
4446
sparkConf: SparkConf,
4547
defaultServiceAccountToken: Option[File],
4648
defaultServiceAccountCaCert: Option[File]): KubernetesClient = {
@@ -79,6 +81,8 @@ private[spark] object SparkKubernetesClientFactory extends Logging {
7981
.withApiVersion("v1")
8082
.withMasterUrl(master)
8183
.withWebsocketPingInterval(0)
84+
.withRequestTimeout(clientType.requestTimeout(sparkConf))
85+
.withConnectionTimeout(clientType.connectionTimeout(sparkConf))
8286
.withOption(oauthTokenValue) {
8387
(token, configBuilder) => configBuilder.withOauthToken(token)
8488
}.withOption(oauthTokenFile) {
@@ -111,4 +115,20 @@ private[spark] object SparkKubernetesClientFactory extends Logging {
111115
}.getOrElse(configBuilder)
112116
}
113117
}
118+
119+
object ClientType extends Enumeration {
120+
import scala.language.implicitConversions
121+
val Driver = Val(DRIVER_CLIENT_REQUEST_TIMEOUT, DRIVER_CLIENT_CONNECTION_TIMEOUT)
122+
val Submission = Val(SUBMISSION_CLIENT_REQUEST_TIMEOUT, SUBMISSION_CLIENT_CONNECTION_TIMEOUT)
123+
124+
protected case class Val(
125+
requestTimeoutEntry: ConfigEntry[Int],
126+
connectionTimeoutEntry: ConfigEntry[Int])
127+
extends super.Val {
128+
def requestTimeout(conf: SparkConf): Int = conf.get(requestTimeoutEntry)
129+
def connectionTimeout(conf: SparkConf): Int = conf.get(connectionTimeoutEntry)
130+
}
131+
132+
implicit def convert(value: Value): Val = value.asInstanceOf[Val]
133+
}
114134
}

resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ private[spark] class KubernetesClientApplication extends SparkApplication {
220220
master,
221221
Some(kubernetesConf.namespace),
222222
KUBERNETES_AUTH_SUBMISSION_CONF_PREFIX,
223+
SparkKubernetesClientFactory.ClientType.Submission,
223224
sparkConf,
224225
None,
225226
None)) { kubernetesClient =>

resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ private[spark] class KubernetesClusterManager extends ExternalClusterManager wit
6565
apiServerUri,
6666
Some(sc.conf.get(KUBERNETES_NAMESPACE)),
6767
authConfPrefix,
68+
SparkKubernetesClientFactory.ClientType.Driver,
6869
sc.conf,
6970
defaultServiceAccountToken,
7071
defaultServiceAccountCaCrt)

0 commit comments

Comments
 (0)