@@ -27,6 +27,8 @@ pub struct QuorumStateChecker {
2727 cancellation_token : CancellationToken ,
2828 check_interval : Duration ,
2929 quorum_state : QuorumState ,
30+ max_retries : u32 ,
31+ retry_initial_delay : Duration ,
3032}
3133
3234impl QuorumStateChecker {
@@ -42,6 +44,8 @@ impl QuorumStateChecker {
4244 quorum_state : QuorumState {
4345 available : Arc :: new ( Default :: default ( ) ) ,
4446 } ,
47+ max_retries : 3 ,
48+ retry_initial_delay : Duration :: from_secs ( 2 ) ,
4549 } ;
4650
4751 // first check MUST succeed, otherwise we shouldn't start
@@ -56,7 +60,102 @@ impl QuorumStateChecker {
5660 self . quorum_state . clone ( )
5761 }
5862
63+ fn is_retryable_error ( & self , err : & CredentialProxyError ) -> bool {
64+ let err_str = err. to_string ( ) . to_lowercase ( ) ;
65+
66+ // Check for DNS-related errors
67+ if err_str. contains ( "dns" )
68+ || err_str. contains ( "lookup" )
69+ || err_str. contains ( "name resolution" )
70+ || err_str. contains ( "temporary failure" )
71+ || err_str. contains ( "failed to lookup address" )
72+ {
73+ return true ;
74+ }
75+
76+ // Check if it's a Tendermint RPC error (which could be DNS/timeout related)
77+ if let CredentialProxyError :: NyxdFailure { source : nyxd_err } = err {
78+ let nyxd_err_str = nyxd_err. to_string ( ) . to_lowercase ( ) ;
79+ if nyxd_err_str. contains ( "tendermint rpc request failed" ) {
80+ return true ;
81+ }
82+
83+ if nyxd_err. is_tendermint_response_timeout ( ) {
84+ return true ;
85+ }
86+ }
87+
88+ false
89+ }
90+
5991 async fn check_quorum_state ( & self ) -> Result < bool , CredentialProxyError > {
92+ self . check_quorum_state_with_retry ( ) . await
93+ }
94+
95+ async fn check_quorum_state_with_retry ( & self ) -> Result < bool , CredentialProxyError > {
96+ let mut last_error_msg = None ;
97+ let delay = self . retry_initial_delay ;
98+
99+ for attempt in 0 ..=self . max_retries {
100+ match self . check_quorum_state_once ( ) . await {
101+ Ok ( result) => {
102+ if attempt > 0 {
103+ info ! ( "quorum check succeeded after {} retry attempt(s)" , attempt) ;
104+ }
105+ return Ok ( result) ;
106+ }
107+ Err ( err) => {
108+ let err_msg = err. to_string ( ) ;
109+
110+ // Check if this error is retryable
111+ if !self . is_retryable_error ( & err) {
112+ return Err ( err) ;
113+ }
114+
115+ last_error_msg = Some ( err_msg. clone ( ) ) ;
116+
117+ if attempt >= self . max_retries {
118+ break ;
119+ }
120+
121+ // Log the retry attempt
122+ warn ! (
123+ "quorum check failed (attempt {}/{}): {}. Retrying in {:?}..." ,
124+ attempt + 1 ,
125+ self . max_retries + 1 ,
126+ err_msg,
127+ delay
128+ ) ;
129+
130+ // Wait before retrying with exponential backoff
131+ tokio:: time:: sleep ( delay) . await ;
132+ }
133+ }
134+ }
135+
136+ // try one final time to get the actual error
137+ match self . check_quorum_state_once ( ) . await {
138+ Ok ( result) => {
139+ warn ! (
140+ "quorum check succeeded on final attempt after {} retries" ,
141+ self . max_retries
142+ ) ;
143+ Ok ( result)
144+ }
145+ Err ( err) => {
146+ if let Some ( error_msg) = last_error_msg {
147+ error ! (
148+ "quorum check failed after {} retry attempts. Last error: {}" ,
149+ self . max_retries + 1 ,
150+ error_msg
151+ ) ;
152+ }
153+ Err ( err)
154+ }
155+ }
156+ }
157+
158+ async fn check_quorum_state_once ( & self ) -> Result < bool , CredentialProxyError > {
60159 let client_guard = self . client . query_chain ( ) . await ;
61160
62161 // split the operation as we only need to hold the reference to chain client for the first part
@@ -93,7 +192,7 @@ impl QuorumStateChecker {
93192 break
94193 }
95194 _ = tokio:: time:: sleep( self . check_interval) => {
96- match self . check_quorum_state ( ) . await {
195+ match self . check_quorum_state_with_retry ( ) . await {
97196 Ok ( available) => self . quorum_state. available. store( available, Ordering :: SeqCst ) ,
98197 Err ( err) => error!( "failed to check current quorum state: {err}" ) ,
99198 }
0 commit comments