@@ -22,8 +22,7 @@ import java.util.concurrent.{ConcurrentHashMap, TimeUnit}
22
22
import com .google .common .cache .{Cache , CacheBuilder , RemovalNotification }
23
23
import io .fabric8 .kubernetes .api .model .Pod
24
24
import io .fabric8 .kubernetes .client .KubernetesClient
25
- import io .fabric8 .kubernetes .client .informers .ResourceEventHandler
26
- import io .fabric8 .kubernetes .client .informers .SharedIndexInformer
25
+ import io .fabric8 .kubernetes .client .informers .{ResourceEventHandler , SharedIndexInformer }
27
26
28
27
import org .apache .kyuubi .Logging
29
28
import org .apache .kyuubi .config .KyuubiConf
@@ -48,22 +47,22 @@ class KubernetesApplicationOperation extends ApplicationOperation with Logging {
48
47
case Some (client) =>
49
48
info(s " Initialized Kubernetes Client connect to: ${client.getMasterUrl}" )
50
49
submitTimeout = conf.get(KyuubiConf .ENGINE_SUBMIT_TIMEOUT )
51
- // Using Kubernetes Informer to update application state
52
- // Set 0 for no resync, see more details in
53
- // https://github.com/fabric8io/kubernetes-client/discussions/5015
54
- enginePodInformer =
55
- client.pods().withLabel(LABEL_KYUUBI_UNIQUE_KEY ).runnableInformer(0 )
56
- enginePodInformer.addEventHandler(new SparkEnginePodEventHandler ()).start()
50
+ // Disable resync, see https://github.com/fabric8io/kubernetes-client/discussions/5015
51
+ enginePodInformer = client.pods()
52
+ .withLabel(LABEL_KYUUBI_UNIQUE_KEY )
53
+ .inform(new SparkEnginePodEventHandler )
57
54
info(" Start Kubernetes Client Informer." )
58
- // Using Cache help clean delete app info
59
- val cachePeriod = conf.get(KyuubiConf .KUBERNETES_TERMINATED_APPLICATION_RETAIN_PERIOD )
55
+ // Defer cleaning up terminated application information until its cache key expires
56
+ // Use a Cache to help clean up deleted app info
57
+ val retainPeriod = conf.get(KyuubiConf .KUBERNETES_TERMINATED_APPLICATION_RETAIN_PERIOD )
60
58
deletedAppInfoCache = CacheBuilder
61
59
.newBuilder()
62
- .expireAfterWrite(cachePeriod , TimeUnit .MILLISECONDS )
60
+ .expireAfterWrite(retainPeriod , TimeUnit .MILLISECONDS )
63
61
.removalListener((notification : RemovalNotification [String , ApplicationState ]) => {
64
- debug(s " Remove cached appInfo[tag: ${notification.getKey}], " +
65
- s " due to app state: ${notification.getValue}. " )
66
- appInfoStore.remove(notification.getKey)
62
+ Option (appInfoStore.remove(notification.getKey)).foreach { removed =>
63
+ info(s " Remove terminated application ${removed.id} with " +
64
+ s " tag ${notification.getKey} and state ${removed.state}" )
65
+ }
67
66
})
68
67
.build()
69
68
client
@@ -85,8 +84,8 @@ class KubernetesApplicationOperation extends ApplicationOperation with Logging {
85
84
}
86
85
debug(s " Deleting application info from Kubernetes cluster by $tag tag " )
87
86
try {
88
- val info = appInfoStore.getOrDefault(tag, ApplicationInfo .notFound )
89
- debug(s " Application info[tag: ${ tag} ] is in ${info.state}" )
87
+ val info = appInfoStore.getOrDefault(tag, ApplicationInfo .NOT_FOUND )
88
+ debug(s " Application info[tag: $tag] is in ${info.state}" )
90
89
info.state match {
91
90
case NOT_FOUND | FAILED | UNKNOWN =>
92
91
(
@@ -109,30 +108,30 @@ class KubernetesApplicationOperation extends ApplicationOperation with Logging {
109
108
}
110
109
debug(s " Getting application info from Kubernetes cluster by $tag tag " )
111
110
try {
112
- val info = appInfoStore.getOrDefault(tag, ApplicationInfo .notFound )
113
- info .state match {
111
+ val appInfo = appInfoStore.getOrDefault(tag, ApplicationInfo .NOT_FOUND )
112
+ (appInfo .state, submitTime) match {
114
113
// Kyuubi should keep waiting while the driver pod has not been created yet
115
- case NOT_FOUND if submitTime.nonEmpty =>
116
- val elapsedTime = System .currentTimeMillis() - submitTime.get
114
+ case ( NOT_FOUND , Some (_submitTime)) =>
115
+ val elapsedTime = System .currentTimeMillis - _submitTime
117
116
if (elapsedTime > submitTimeout) {
118
117
error(s " Can't find target driver pod by tag: $tag, " +
119
118
s " elapsed time: ${elapsedTime}ms exceeds ${submitTimeout}ms. " )
120
- ApplicationInfo (id = null , name = null , ApplicationState .NOT_FOUND )
119
+ ApplicationInfo .NOT_FOUND
121
120
} else {
122
121
warn(" Wait for driver pod to be created, " +
123
122
s " elapsed time: ${elapsedTime}ms, return UNKNOWN status " )
124
- ApplicationInfo (id = null , name = null , ApplicationState .UNKNOWN )
123
+ ApplicationInfo .UNKNOWN
125
124
}
126
- case NOT_FOUND =>
127
- ApplicationInfo (id = null , name = null , ApplicationState .NOT_FOUND )
125
+ case ( NOT_FOUND , None ) =>
126
+ ApplicationInfo .NOT_FOUND
128
127
case _ =>
129
- debug(s " Successfully got application info by $tag: $info " )
130
- info
128
+ debug(s " Successfully got application info by $tag: $appInfo " )
129
+ appInfo
131
130
}
132
131
} catch {
133
132
case e : Exception =>
134
133
error(s " Failed to get application with $tag, due to ${e.getMessage}" )
135
- ApplicationInfo (id = null , name = null , ApplicationState .NOT_FOUND )
134
+ ApplicationInfo .NOT_FOUND
136
135
}
137
136
}
138
137
@@ -163,41 +162,39 @@ class KubernetesApplicationOperation extends ApplicationOperation with Logging {
163
162
// Pod-update callback of the engine pod event handler.
// Only `newPod` is inspected; `oldPod` is not read in this method.
// For pods accepted by isSparkEnginePod, the new pod's state is first written via
// updateApplicationState. If the application state derived from the pod phase satisfies
// isTerminated, the pod is additionally handed to the terminated-marking helper.
// Diff note: the removed (-) lines expressed the terminated check as a `match` with a
// no-op default case and called markTerminated; the added (+) lines use a plain `if`
// and call markApplicationTerminated.
override def onUpdate (oldPod : Pod , newPod : Pod ): Unit = {
164
163
if (isSparkEnginePod(newPod)) {
165
164
updateApplicationState(newPod)
166
- toApplicationState(newPod.getStatus.getPhase) match {
167
- case state if isTerminated(state) =>
168
- markTerminated(newPod)
169
- case _ =>
170
- // do nothing
165
+ val appState = toApplicationState(newPod.getStatus.getPhase)
166
+ if (isTerminated(appState)) {
167
+ markApplicationTerminated(newPod)
171
168
}
172
169
}
173
170
}
174
171
175
172
// Pod-delete callback of the engine pod event handler.
// For pods accepted by isSparkEnginePod, the pod's last observed state is written via
// updateApplicationState and the pod is then unconditionally handed to the
// terminated-marking helper (no isTerminated check here, unlike onUpdate).
// `deletedFinalStateUnknown` is not read in this method.
// Diff note: the only change is the helper rename markTerminated -> markApplicationTerminated.
override def onDelete (pod : Pod , deletedFinalStateUnknown : Boolean ): Unit = {
176
173
if (isSparkEnginePod(pod)) {
177
174
updateApplicationState(pod)
178
- markTerminated (pod)
175
+ markApplicationTerminated (pod)
179
176
}
180
177
}
181
178
}
182
179
183
180
// Label-based predicate deciding whether the event handler should process a pod.
// Diff note: the removed (-) line required only LABEL_KYUUBI_UNIQUE_KEY to be present;
// the added (+) lines require both LABEL_KYUUBI_UNIQUE_KEY and SPARK_APP_ID_LABEL.
// NOTE(review): the label map is dereferenced without a null check — confirm that
// getLabels cannot return null for pods delivered to this handler.
private def isSparkEnginePod (pod : Pod ): Boolean = {
184
- pod.getMetadata.getLabels.containsKey(LABEL_KYUUBI_UNIQUE_KEY )
181
+ val labels = pod.getMetadata.getLabels
182
+ labels.containsKey(LABEL_KYUUBI_UNIQUE_KEY ) && labels.containsKey(SPARK_APP_ID_LABEL )
185
183
}
186
184
187
185
// Writes (or overwrites) the pod's entry in appInfoStore.
//   key   : value of the pod's LABEL_KYUUBI_UNIQUE_KEY label
//   value : ApplicationInfo with id = SPARK_APP_ID_LABEL label value, name = pod name,
//           state = toApplicationState(pod phase), error = Option(pod status reason)
// Option(...) maps a null status reason to None.
// Diff note: the added (+) lines drop the local `metaData` alias in favour of repeated
// pod.getMetadata calls, rename `state` to `appState`, and reword the debug message.
// The values written to appInfoStore are the same in both versions.
private def updateApplicationState (pod : Pod ): Unit = {
188
- val metaData = pod.getMetadata
189
- val state = toApplicationState(pod.getStatus.getPhase)
190
- debug(s " Driver Informer change pod: ${metaData.getName} state: $state" )
186
+ val appState = toApplicationState(pod.getStatus.getPhase)
187
+ debug(s " Driver Informer changes pod: ${pod.getMetadata.getName} to state: $appState" )
191
188
appInfoStore.put(
192
- metaData .getLabels.get(LABEL_KYUUBI_UNIQUE_KEY ),
189
+ pod.getMetadata .getLabels.get(LABEL_KYUUBI_UNIQUE_KEY ),
193
190
ApplicationInfo (
194
- id = metaData .getLabels.get(SPARK_APP_ID_LABEL ),
195
- name = metaData .getName,
196
- state = state ,
191
+ id = pod.getMetadata .getLabels.get(SPARK_APP_ID_LABEL ),
192
+ name = pod.getMetadata .getName,
193
+ state = appState ,
197
194
error = Option (pod.getStatus.getReason)))
198
195
}
199
196
200
- private def markTerminated (pod : Pod ): Unit = {
197
+ private def markApplicationTerminated (pod : Pod ): Unit = {
201
198
deletedAppInfoCache.put(
202
199
pod.getMetadata.getLabels.get(LABEL_KYUUBI_UNIQUE_KEY ),
203
200
toApplicationState(pod.getStatus.getPhase))
0 commit comments