@@ -52,19 +52,22 @@ import org.opensearch.indexmanagement.transform.util.TransformLockManager
5252import org.opensearch.indexmanagement.util.NO_ID
5353import org.opensearch.indexmanagement.util.SecurityUtils.Companion.DEFAULT_INJECT_ROLES
5454import org.opensearch.indexmanagement.util.SecurityUtils.Companion.INTERNAL_REQUEST
55- import org.opensearch.jobscheduler.spi.LockModel
5655import org.opensearch.jobscheduler.spi.utils.LockService
5756import org.opensearch.rest.RestStatus
57+ import org.opensearch.tasks.TaskCancelledException
5858import org.opensearch.transport.RemoteTransportException
5959import java.io.IOException
6060import java.time.Instant
61+ import java.util.regex.Pattern
6162import kotlin.coroutines.CoroutineContext
6263import kotlin.coroutines.resume
6364import kotlin.coroutines.resumeWithException
6465import kotlin.coroutines.suspendCoroutine
6566
6667const val OPENDISTRO_SECURITY_PROTECTED_INDICES_CONF_REQUEST = " _opendistro_security_protected_indices_conf_request"
67- private const val TIME_EXCEED_MESSAGE = " Time exceeded"
68+
69+ // Pattern matched against the message of a TaskCancelledException cause to recognize a timed-out transform search; the message has this distinctive format when the transform search timeout was set programmatically
70+ private val timeoutMessagePattern = Pattern .compile(" cancelled task with reason: Cancellation timeout of (.*) is expired" )
6871
6972fun contentParser (bytesReference : BytesReference ): XContentParser {
7073 return XContentHelper .createParser(
@@ -193,14 +196,14 @@ suspend fun <T> BackoffPolicy.retry(
193196 *
194197 * If all retries fail the final exception will be rethrown. Exceptions caught during intermediate retries are
195198 * logged as warnings to [logger]. Similar to [org.opensearch.action.bulk.Retry], except these retries on
196- * 502, 503, 504, 429 error codes as well as 500 with Time exceeded . If the request is timeout, lock will be renewed
199+ * 502, 503, 504 error codes, as well as on 408 or when a TaskCancelledException reporting an expired cancellation timeout is the cause. If the request timed out, the lock will be renewed
197200 *
198201 * @param logger - logger used to log intermediate failures
199202 * @param transformLockManager - lock manager that stores current lock used in order to renew the lock if the request timed out
200203 * @param retryOn - any additional [RestStatus] values that should be retried
201204 * @param block - the block of code to retry. This should be a suspend function.
202205 */
203- suspend fun <T > BackoffPolicy.retry (
206+ suspend fun <T > BackoffPolicy.retryTransformSearch (
204207 logger : Logger ,
205208 transformLockManager : TransformLockManager ,
206209 retryOn : List <RestStatus > = emptyList(),
@@ -212,22 +215,24 @@ suspend fun <T> BackoffPolicy.retry(
212215 try {
213216 return block(backoff)
214217 } catch (e: OpenSearchException ) {
215- if (iter.hasNext() && (e.isRetryable() || e.isTimedOut() || retryOn.contains(e.status()))) {
216- backoff = iter.next()
217- logger.warn(" Operation failed. Retrying in $backoff ." , e)
218- delay(backoff.millis)
219- // In the case of time out, renew the lock
220- if (e.isTimedOut()) {
221- transformLockManager.renewLockForScheduledJob()
222- }
223- } else {
218+ if (! iter.hasNext() || ! e.isRetryable(retryOn, e)) {
224219 throw e
225220 }
221+ backoff = iter.next()
222+ logger.warn(" Operation failed. Retrying in $backoff ." , e)
223+ delay(backoff.millis)
224+ if (isTransformOperationTimedOut(e)) {
225+ // In the case of time out, renew the lock
226+ transformLockManager.renewLockForScheduledJob()
227+ }
226228 }
227229 } while (true )
228230}
229231
230- fun LockModel.lockExpirationInSeconds () = lockTime.epochSecond + lockDurationSeconds - Instant .now().epochSecond
/**
 * Decides whether a failed transform search should be attempted again.
 *
 * The checks run in order and stop at the first one that holds:
 * 1. the receiver is retryable according to the no-argument [isRetryable] overload,
 * 2. [ex] is recognized as a timeout by [isTransformOperationTimedOut],
 * 3. the receiver's status is one of the caller-supplied [retryOn] values.
 *
 * @param retryOn additional [RestStatus] values that should be retried
 * @param ex the exception inspected for the timeout condition
 * @return `true` if any of the checks above holds
 */
private fun OpenSearchException.isRetryable(
    retryOn: List<RestStatus>,
    ex: OpenSearchException
): Boolean {
    if (isRetryable()) {
        return true
    }
    if (isTransformOperationTimedOut(ex)) {
        return true
    }
    return status() in retryOn
}
231236
232237/* *
233238 * Retries on 502, 503 and 504 per elastic client's behavior: https://github.com/elastic/elasticsearch-net/issues/2061
@@ -238,11 +243,17 @@ fun OpenSearchException.isRetryable(): Boolean {
238243}
239244
240245/* *
241- * Retries on 500 and Time exceeded message which means that the timeout occurred. In that case
242- * retry request with reduced size param and timeout param set based on the lock expiration
246+ * Retries on 408 or on TaskCancelledException once the message matches the given pattern.
247+ * In that case retry request with reduced size param and timeout param set based on the lock expiration
243248 */
244- fun OpenSearchException.isTimedOut (): Boolean {
245- return status() == RestStatus .INTERNAL_SERVER_ERROR && TIME_EXCEED_MESSAGE == message
fun isTransformOperationTimedOut(ex: OpenSearchException): Boolean {
    // A 408 status is a timeout regardless of what caused the exception.
    if (ex.status() == RestStatus.REQUEST_TIMEOUT) {
        return true
    }
    // Otherwise only a TaskCancelledException cause can signal a timeout. A cancellation that
    // carries no message cannot match the pattern, so report "not a timeout" instead of passing
    // null to Pattern.matcher, which would throw a NullPointerException.
    val cancellationMessage = (ex.cause as? TaskCancelledException)?.message ?: return false
    return timeoutMessagePattern.matcher(cancellationMessage).matches()
}
247258
248259/* *
0 commit comments