Skip to content

e2e: Increase all ANR timeouts to 2m to ensure CI reliability. #1733

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 19, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 10 additions & 13 deletions tests/e2e/e2e.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ const (
// Enough for test/custom networks.
DefaultConfirmTxTimeout = 20 * time.Second

DefaultShutdownTimeout = 2 * time.Minute
DefaultTimeout = 2 * time.Minute
)

// Env is the global struct containing all we need to test
Expand Down Expand Up @@ -116,7 +116,7 @@ func (te *TestEnvironment) ConfigCluster(
return fmt.Errorf("could not setup network-runner client: %w", err)
}

ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
presp, err := te.GetRunnerClient().Ping(ctx)
cancel()
if err != nil {
Expand Down Expand Up @@ -156,7 +156,7 @@ func (te *TestEnvironment) StartCluster() error {
switch te.clusterType {
case StandAlone:
tests.Outf("{{magenta}}starting network-runner with %q{{/}}\n", te.avalancheGoExecPath)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
resp, err := te.GetRunnerClient().Start(ctx, te.avalancheGoExecPath,
runner_sdk.WithNumNodes(5),
runner_sdk.WithGlobalNodeConfig(fmt.Sprintf(`{"log-level":"%s"}`, te.avalancheGoLogLevel)),
Expand All @@ -167,10 +167,7 @@ func (te *TestEnvironment) StartCluster() error {
}
tests.Outf("{{green}}successfully started network-runner: {{/}} %+v\n", resp.ClusterInfo.NodeNames)

// start is async, so wait some time for cluster health
time.Sleep(time.Minute)
Comment on lines -170 to -171
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was this sleep ever actually needed? A minute-long sleep is quite long.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe it was needed in the past, but it doesn't seem to be needed now.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not needed anymore.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And to be clear, this sleep meant each e2e job wasted 30-40s. With a timeout instead, the check can complete as soon as the nodes are ready, whereas this sleep would never exit early even if the nodes became healthy sooner than expected.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The missing piece here is understanding that Health actually blocks until Healthy.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, that was very old ANR tech debt. With the first gRPC server implementation, start (with blockchain creation — I know that's not the case here, but this was probably copied from there) broke the subsequent health call unless we slept for some time first.


ctx, cancel = context.WithTimeout(context.Background(), 2*time.Minute)
ctx, cancel = context.WithTimeout(context.Background(), DefaultTimeout)
_, err = te.GetRunnerClient().Health(ctx)
cancel()
if err != nil {
Expand All @@ -188,7 +185,7 @@ func (te *TestEnvironment) StartCluster() error {
}

func (te *TestEnvironment) refreshURIs() error {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
uriSlice, err := te.GetRunnerClient().URIs(ctx)
cancel()
if err != nil {
Expand Down Expand Up @@ -284,7 +281,7 @@ func (te *TestEnvironment) ShutdownCluster() error {
}

tests.Outf("{{red}}shutting down network-runner cluster{{/}}\n")
ctx, cancel := context.WithTimeout(context.Background(), DefaultShutdownTimeout)
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
_, err := runnerCli.Stop(ctx)
cancel()
if err != nil {
Expand All @@ -303,7 +300,7 @@ func (te *TestEnvironment) SnapInitialState() error {
return nil // initial state snapshot already captured
}

ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
_, err := te.runnerCli.SaveSnapshot(ctx, te.snapshotName)
cancel()
if err != nil {
Expand All @@ -318,21 +315,21 @@ func (te *TestEnvironment) RestoreInitialState(switchOffNetworkFirst bool) error
defer te.snapMu.Unlock()

if switchOffNetworkFirst {
ctx, cancel := context.WithTimeout(context.Background(), DefaultShutdownTimeout)
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
_, err := te.GetRunnerClient().Stop(ctx)
cancel()
gomega.Expect(err).Should(gomega.BeNil())
}

ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
_, err := te.GetRunnerClient().LoadSnapshot(ctx, te.snapshotName)
cancel()
if err != nil {
return err
}

// make sure cluster goes back to health before moving on
ctx, cancel = context.WithTimeout(context.Background(), DefaultShutdownTimeout)
ctx, cancel = context.WithTimeout(context.Background(), DefaultTimeout)
_, err = te.GetRunnerClient().Health(ctx)
cancel()
if err != nil {
Expand Down