88 changes: 71 additions & 17 deletions bootstrap/bootstrap.go
@@ -22,6 +22,8 @@ import (
"github.com/sethvargo/go-limiter/memorystore"
grpcOpts "google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/resolver"
"google.golang.org/grpc/resolver/manual"
"google.golang.org/grpc/status"

"github.com/onflow/flow-evm-gateway/api"
@@ -41,13 +43,13 @@ const (
// DefaultMaxMessageSize is the default maximum message size for gRPC responses
DefaultMaxMessageSize = 1024 * 1024 * 1024

// DefaultResourceExhaustedRetryDelay is the default delay between retries when the server returns
// a ResourceExhausted error.
DefaultResourceExhaustedRetryDelay = 100 * time.Millisecond
// DefaultRetryDelay is the default delay between retries when a gRPC request
// to one of the Access Nodes has errored out.
DefaultRetryDelay = 100 * time.Millisecond

// DefaultResourceExhaustedMaxRetryDelay is the default max request duration when retrying server
// ResourceExhausted errors.
DefaultResourceExhaustedMaxRetryDelay = 30 * time.Second
// DefaultMaxRetryDelay is the default max request duration when retrying failed
// gRPC requests to one of the Access Nodes.
DefaultMaxRetryDelay = 30 * time.Second

Collaborator Author:
Now that we are adding load-balancing functionality, should we maybe decrease the max retry duration?

Contributor:
Is this load balancing or failover? I'm not aware of a demand for load-balancing requests to different backends, but there is a clear need for failing over when the primary is unavailable.

Collaborator Author (@m-Peter, Oct 6, 2025):
Given that we use pick_first as the load-balancing strategy, this effectively works as a failover mechanism: it will stick to the same backend until that backend is unable to serve any requests (due to connectivity issues), at which point it will pick the next available backend.

I only changed the name of the constant from DefaultResourceExhaustedMaxRetryDelay to DefaultMaxRetryDelay, because previously the retryInterceptor would only retry on ResourceExhausted errors.

But I've updated that condition in f30b0b3 to account for more related errors that we can retry on the same AN.

)

type Storages struct {
@@ -474,16 +476,59 @@ func StartEngine(
// setupCrossSporkClient sets up a cross-spork AN client.
func setupCrossSporkClient(config config.Config, logger zerolog.Logger) (*requester.CrossSporkClient, error) {
// create access client with cross-spork capabilities
currentSporkClient, err := grpc.NewClient(
config.AccessNodeHost,
grpc.WithGRPCDialOptions(
grpcOpts.WithDefaultCallOptions(grpcOpts.MaxCallRecvMsgSize(DefaultMaxMessageSize)),
grpcOpts.WithUnaryInterceptor(retryInterceptor(
DefaultResourceExhaustedMaxRetryDelay,
DefaultResourceExhaustedRetryDelay,
)),
),
)
var currentSporkClient *grpc.Client
var err error

if len(config.AccessNodeBackupHosts) > 0 {
mr := manual.NewBuilderWithScheme("dns")
defer mr.Close()

// `pick_first` tries to connect to the first address and uses it for all
// RPCs if it connects, or tries the next address if it fails
// (and keeps doing that until one connection is successful).
// Because of this, all RPCs will be sent to the same backend. See more at:
// https://github.com/grpc/grpc-go/tree/master/examples/features/load_balancing#pick_first
json := `{"loadBalancingConfig": [{"pick_first":{}}]}`

Contributor:

Can you document what the expected behavior is? I'm assuming that pick_first means it will always use the first one unless it fails, and then it will go to the next. It would be good to make that clear here.

Collaborator Author:
Good point; I've added comments in 117deb1, with a link to the official example: https://github.com/grpc/grpc-go/tree/master/examples/features/load_balancing#pick_first.

endpoints := []resolver.Endpoint{
{Addresses: []resolver.Address{{Addr: config.AccessNodeHost}}},
}

for _, accessNodeBackupAddr := range config.AccessNodeBackupHosts {
endpoints = append(endpoints, resolver.Endpoint{
Addresses: []resolver.Address{{Addr: accessNodeBackupAddr}},
})
}

mr.InitialState(resolver.State{
Endpoints: endpoints,
})

targetHost := fmt.Sprintf("%s:///%s", mr.Scheme(), "flow-access")
currentSporkClient, err = grpc.NewClient(
targetHost,
grpc.WithGRPCDialOptions(
grpcOpts.WithDefaultCallOptions(grpcOpts.MaxCallRecvMsgSize(DefaultMaxMessageSize)),
grpcOpts.WithResolvers(mr),
grpcOpts.WithDefaultServiceConfig(json),
grpcOpts.WithUnaryInterceptor(retryInterceptor(
DefaultMaxRetryDelay,
DefaultRetryDelay,
)),
),
)
} else {
currentSporkClient, err = grpc.NewClient(
config.AccessNodeHost,
grpc.WithGRPCDialOptions(
grpcOpts.WithDefaultCallOptions(grpcOpts.MaxCallRecvMsgSize(DefaultMaxMessageSize)),
grpcOpts.WithUnaryInterceptor(retryInterceptor(
DefaultMaxRetryDelay,
DefaultRetryDelay,
)),
),
)
}

if err != nil {
return nil, fmt.Errorf(
"failed to create client connection for host: %s, with error: %w",
@@ -536,7 +581,16 @@ func retryInterceptor(maxDuration, pauseDuration time.Duration) grpcOpts.UnaryClientInterceptor
return nil
}

if status.Code(err) != codes.ResourceExhausted {

Collaborator Author:
Since we added the load-balancing config:

`{"loadBalancingConfig": [{"pick_first":{}}]}`

I removed this error code check entirely.

The reason is that, if we receive any kind of gRPC error from one of the ANs:

  1. The request will be retried for the max specified duration on the same AN
  2. Then the configured pick_first load-balancing strategy will try the next ANs until it finds one that responds without an error

However, I just noticed that the retryInterceptor is used even when there aren't any configured backup ANs.
Should we just change the DefaultMaxRetryDelay instead?

Contributor:
Is there a way to customize the behavior?

For instance, if the backend returned ResourceExhausted, retrying immediately will only make it worse and the node will eventually have to fail over, whereas pausing briefly may allow the next request to succeed.

Similarly, if the error is OutOfRange or NotFound, retrying immediately is not likely to succeed.

Canceled or DeadlineExceeded are guaranteed to fail all requests if the source was a local context.

Collaborator Author (@m-Peter, Oct 6, 2025):

> The reason is that, if we receive any kind of gRPC error from one of the ANs:
>
>   1. The request will be retried for the max specified duration on the same AN
>   2. Then the configured pick_first load-balancing strategy will try the next ANs until it finds one that responds without an error

Sorry about that, but I had a misconception about how these two relate to each other.

The pick_first load-balancing strategy only checks the node's connectivity state (Ready / Connecting / Idle / TransientFailure, etc.), and that's the only signal it uses to decide when to connect to the next node.
The errors that might be returned from specific node API calls do not affect the node selection in any way, so we can't direct the load balancer to connect to a different node from inside the retryInterceptor.

In other words, the retryInterceptor and the pick_first load-balancing strategy don't step on each other's toes.

I have added some error handling in retryInterceptor in f30b0b3, to include the errors you mentioned above.
Is that what you had in mind?
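
To make that distinction concrete, here is a minimal, hypothetical sketch (not part of this PR) that watches the only signal pick_first reacts to: the ClientConn's connectivity state, via GetState and WaitForStateChange. The address and timeout are placeholders.

```go
package main

import (
	"context"
	"fmt"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/connectivity"
	"google.golang.org/grpc/credentials/insecure"
)

// watchConnectivity logs every connectivity-state transition of the
// ClientConn. pick_first bases its failover decisions on these transitions
// (e.g. Ready -> TransientFailure), not on the status codes returned by
// individual RPCs.
func watchConnectivity(ctx context.Context, conn *grpc.ClientConn) {
	state := conn.GetState()
	for {
		fmt.Printf("connectivity state: %s\n", state)
		if !conn.WaitForStateChange(ctx, state) {
			return // context cancelled or timed out
		}
		state = conn.GetState()
		if state == connectivity.TransientFailure {
			fmt.Println("current backend unreachable; pick_first will try the next address")
		}
	}
}

func main() {
	// "localhost:3569" stands in for an Access Node address.
	conn, err := grpc.NewClient("localhost:3569",
		grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		panic(err)
	}
	defer conn.Close()

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	conn.Connect() // leave Idle and start connecting
	watchConnectivity(ctx, conn)
}
```

Killing the backend behind the first address drives the state to TransientFailure, at which point pick_first moves on to the next address in the resolver's list, independently of whatever the retry interceptor is doing per RPC.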

switch status.Code(err) {
case codes.Canceled, codes.DeadlineExceeded:
// these kinds of errors are guaranteed to fail all requests
// if the source was a local context
return err
case codes.ResourceExhausted, codes.OutOfRange, codes.NotFound:
// when we receive these errors, we pause briefly, so that
// the next request on the same AN has a higher chance
// of success
default:
return err
}
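
For context, here is a self-contained sketch of how a retry interceptor along these lines fits together: the loop, the deadline, and the switch above. It is an illustrative reconstruction under the PR's naming (maxDuration, pauseDuration), not the repository's exact code.

```go
// Illustrative sketch only: the overall shape of a retry interceptor like
// the one in bootstrap.go. The switch mirrors the diff above; the loop and
// timing logic are a reconstruction.
package bootstrap

import (
	"context"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

func retryInterceptorSketch(maxDuration, pauseDuration time.Duration) grpc.UnaryClientInterceptor {
	return func(
		ctx context.Context,
		method string,
		req, reply any,
		cc *grpc.ClientConn,
		invoker grpc.UnaryInvoker,
		opts ...grpc.CallOption,
	) error {
		start := time.Now()
		for {
			err := invoker(ctx, method, req, reply, cc, opts...)
			if err == nil {
				return nil
			}

			switch status.Code(err) {
			case codes.Canceled, codes.DeadlineExceeded:
				// guaranteed to keep failing if the source was a local context
				return err
			case codes.ResourceExhausted, codes.OutOfRange, codes.NotFound:
				// potentially transient on this AN: pause and retry below
			default:
				return err
			}

			if time.Since(start) >= maxDuration {
				return err
			}
			time.Sleep(pauseDuration)
		}
	}
}
```

Note that the retries here stay on the same backend; switching backends is left entirely to the pick_first policy reacting to connectivity changes, as discussed above.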

6 changes: 6 additions & 0 deletions cmd/run/cmd.go
@@ -201,6 +201,10 @@ func parseConfigFromFlags() error {
}
cfg.FilterExpiry = exp

if accessNodeBackupHosts != "" {
cfg.AccessNodeBackupHosts = strings.Split(accessNodeBackupHosts, ",")
}

if accessSporkHosts != "" {
heightHosts := strings.Split(accessSporkHosts, ",")
cfg.AccessNodePreviousSporkHosts = append(cfg.AccessNodePreviousSporkHosts, heightHosts...)
@@ -242,6 +246,7 @@ var (
logWriter,
filterExpiry,
accessSporkHosts,
accessNodeBackupHosts,
cloudKMSKey,
cloudKMSProjectID,
cloudKMSLocationID,
@@ -259,6 +264,7 @@ func init() {
Cmd.Flags().IntVar(&cfg.RPCPort, "rpc-port", 8545, "Port for the RPC API server")
Cmd.Flags().BoolVar(&cfg.WSEnabled, "ws-enabled", false, "Enable websocket connections")
Cmd.Flags().StringVar(&cfg.AccessNodeHost, "access-node-grpc-host", "localhost:3569", "Host to the flow access node gRPC API")
Cmd.Flags().StringVar(&accessNodeBackupHosts, "access-node-backup-hosts", "", `Backup AN hosts to use in case of connectivity issues, defined following the schema: {host1},{host2} as a comma separated list (e.g. "host-1.com,host2.com")`)
Cmd.Flags().StringVar(&accessSporkHosts, "access-node-spork-hosts", "", `Previous spork AN hosts, defined following the schema: {host1},{host2} as a comma separated list (e.g. "host-1.com,host2.com")`)
Cmd.Flags().StringVar(&flowNetwork, "flow-network-id", "flow-emulator", "Flow network ID (flow-emulator, flow-previewnet, flow-testnet, flow-mainnet)")
Cmd.Flags().StringVar(&coinbase, "coinbase", "", "Coinbase address to use for fee collection")
3 changes: 3 additions & 0 deletions config/config.go
@@ -45,6 +45,9 @@ type Config struct {
DatabaseDir string
// AccessNodeHost defines the current spork Flow network AN host.
AccessNodeHost string
// AccessNodeBackupHosts contains a list of AN hosts to use as backup, in
// case of connectivity issues with `AccessNodeHost`.
AccessNodeBackupHosts []string
// AccessNodePreviousSporkHosts contains a list of the ANs hosts for each spork
AccessNodePreviousSporkHosts []string
// GRPCPort for the RPC API server
35 changes: 25 additions & 10 deletions services/ingestion/event_subscriber.go
@@ -224,23 +224,38 @@ func (r *RPCEventSubscriber) subscribe(ctx context.Context, height uint64) <-cha
// we can get not found when reconnecting after a disconnect/restart before the
// next block is finalized. just wait briefly and try again
time.Sleep(200 * time.Millisecond)
case codes.DeadlineExceeded, codes.Internal:
case codes.DeadlineExceeded, codes.Internal, codes.Unavailable:

Collaborator Author:
During some local testing with 2 Flow Emulator processes, when I killed the 1st process, which was configured as the main AN (AccessNodeHost), the EVM GW crashed with:

failure in event subscription with: recoverable: disconnected:
error receiving event: rpc error:
code = Unavailable desc = error reading from server: EOF

Adding the codes.Unavailable case solved this issue.

Contributor:
I think Unavailable is different, since it is unlikely that the node will suddenly be available on reconnect. If we receive Unavailable, it should fail over to a new node. We could let it retry for some period first, but at the expense of delaying data.

Collaborator Author (@m-Peter, Oct 6, 2025):
I am not really sure how SubscribeEventsByBlockHeight is implemented under the hood, but I've observed that it doesn't go through the retryInterceptor, unlike other AN calls. For example:

[METHOD]:  /flow.access.AccessAPI/GetLatestBlock
[METHOD]:  /flow.access.AccessAPI/GetAccountAtLatestBlock
[METHOD]:  /flow.access.AccessAPI/SendTransaction
[METHOD]:  /flow.access.AccessAPI/ExecuteScriptAtLatestBlock

I have verified, though, that we do need codes.Unavailable in the switch case above, so that it triggers a reconnect with:

if err := connect(lastReceivedHeight); err != nil {
	eventsChan <- models.NewBlockEventsError(
		fmt.Errorf(
			"failed to resubscribe for events on height: %d, with: %w",
			lastReceivedHeight,
			err,
		),
	)
	return
}

Note that if we trigger a reconnect, this will prompt the pick_first load balancer to search for the next node that can serve the given request, even if it is unlikely that the current node will suddenly become available.
This will save the EVM Gateway from a fatal error, provided the configured backup ANs are indeed available.
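
As an aside, the `[METHOD]:` lines above were presumably produced by something like the following hypothetical logging interceptor (not part of this PR), which is one way to confirm which unary calls pass through the client interceptor chain. Streaming RPCs such as SubscribeEventsByBlockHeight go through stream interceptors, not unary ones, which is consistent with the subscription never hitting the retryInterceptor.

```go
// Hypothetical helper: prints the full gRPC method name of every unary call
// passing through the client interceptor chain. Streaming RPCs (e.g. event
// subscriptions) bypass unary interceptors, so they never show up here.
package bootstrap

import (
	"context"
	"fmt"

	"google.golang.org/grpc"
)

func logMethodsInterceptor() grpc.UnaryClientInterceptor {
	return func(
		ctx context.Context,
		method string,
		req, reply any,
		cc *grpc.ClientConn,
		invoker grpc.UnaryInvoker,
		opts ...grpc.CallOption,
	) error {
		fmt.Println("[METHOD]: ", method)
		return invoker(ctx, method, req, reply, cc, opts...)
	}
}
```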

// these are sometimes returned when the stream is disconnected by a middleware or the server
default:
// skip reconnect on all other errors
eventsChan <- models.NewBlockEventsError(fmt.Errorf("%w: %w", errs.ErrDisconnected, err))
return
}

if err := connect(lastReceivedHeight + 1); err != nil {
eventsChan <- models.NewBlockEventsError(
fmt.Errorf(
"failed to resubscribe for events on height: %d, with: %w",
lastReceivedHeight+1,
err,
),
)
return
start := time.Now()
attempts := 0
pauseDuration, maxDuration := 200*time.Millisecond, 30*time.Second
// Allow reconnect retries for up to 30 seconds, with retry
// attempts every 200 ms.
for {
err := connect(lastReceivedHeight)
if err == nil {
break
}

attempts++
duration := time.Since(start)
if duration >= maxDuration {
eventsChan <- models.NewBlockEventsError(
fmt.Errorf(
"failed to resubscribe for events on height: %d, with: %w",
lastReceivedHeight,
err,
),
)
return
}
time.Sleep(pauseDuration)
}
}
}
29 changes: 18 additions & 11 deletions tests/helpers.go
@@ -67,23 +67,18 @@ func testLogWriter() io.Writer {
return zerolog.NewConsoleWriter()
}

func startEmulator(createTestAccounts bool) (*server.EmulatorServer, error) {
func defaultServerConfig() *server.Config {
pkey, err := crypto.DecodePrivateKeyHex(sigAlgo, servicePrivateKey)
if err != nil {
return nil, err
panic(err)
}

genesisToken, err := cadence.NewUFix64("10000.0")
if err != nil {
return nil, err
panic(err)
}

log := logger.With().Timestamp().Str("component", "emulator").Logger().Level(zerolog.DebugLevel)
if logOutput == "false" {
log = zerolog.Nop()
}

srv := server.NewEmulatorServer(&log, &server.Config{
return &server.Config{
ServicePrivateKey: pkey,
ServiceKeySigAlgo: sigAlgo,
ServiceKeyHashAlgo: hashAlgo,
@@ -94,7 +89,19 @@ func startEmulator(createTestAccounts bool) (*server.EmulatorServer, error) {
TransactionMaxGasLimit: flow.DefaultMaxTransactionGasLimit,
SetupEVMEnabled: true,
SetupVMBridgeEnabled: false,
})
}
}

func startEmulator(createTestAccounts bool, conf *server.Config) (
*server.EmulatorServer,
error,
) {
log := logger.With().Timestamp().Str("component", "emulator").Logger().Level(zerolog.DebugLevel)
if logOutput == "false" {
log = zerolog.Nop()
}

srv := server.NewEmulatorServer(&log, conf)

go func() {
srv.Start()
@@ -133,7 +140,7 @@ func runWeb3TestWithSetup(
// servicesSetup starts up an emulator and the gateway
// engines required for operation of the evm gateway.
func servicesSetup(t *testing.T) (emulator.Emulator, func()) {
srv, err := startEmulator(true)
srv, err := startEmulator(true, defaultServerConfig())
require.NoError(t, err)

ctx, cancel := context.WithCancel(context.Background())
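
Building on the defaultServerConfig/startEmulator refactor above, a hypothetical test could start a second emulator as a backup AN and exercise the failover path described in the review thread. This is only a sketch intended to live alongside tests/helpers.go: the GRPCPort field, the port numbers, and the Stop calls are assumptions about the emulator's server API, not code from this PR.

```go
// Hypothetical sketch: start a primary and a backup emulator, then point
// the gateway config at both so that killing the primary exercises the
// pick_first failover path. Field and helper names marked below are assumptions.
func servicesSetupWithBackup(t *testing.T) {
	primaryConf := defaultServerConfig()

	backupConf := defaultServerConfig()
	backupConf.GRPCPort = 3570 // assumed field; the primary stays on the default 3569

	primary, err := startEmulator(true, primaryConf)
	require.NoError(t, err)
	defer primary.Stop() // assumed cleanup method

	backup, err := startEmulator(false, backupConf)
	require.NoError(t, err)
	defer backup.Stop()

	cfg := config.Config{
		AccessNodeHost:        "localhost:3569",
		AccessNodeBackupHosts: []string{"localhost:3570"},
		// ... remaining gateway configuration elided ...
	}
	_ = cfg // the gateway would be bootstrapped with cfg here
}
```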