Skip to content

Commit

Permalink
roachtest: deflake ldr/disconnect
Browse files Browse the repository at this point in the history
Previously, this test could disconnect a set of nodes whose loss could cause a
cluster to lose quorum. With this patch, the test now disconnects a src-dest
node pair that is replicating data.

Fixes cockroachdb#133801

Release note: none
  • Loading branch information
msbutler committed Nov 8, 2024
1 parent 5b9c1d8 commit 9b2c593
Showing 1 changed file with 2 additions and 21 deletions.
23 changes: 2 additions & 21 deletions pkg/cmd/roachtest/tests/logical_data_replication.go
Original file line number Diff line number Diff line change
Expand Up @@ -534,27 +534,8 @@ func TestLDROnNetworkPartition(
// Let workload run for a bit before we kill a node
time.Sleep(ldrWorkload.workload.(replicateKV).debugRunDuration / 10)

failNodesLength := len(setup.CRDBNodes()) / 2
nodesToFail, err := setup.CRDBNodes().SeededRandList(setup.rng, failNodesLength)
if err != nil {
t.Fatal(err)
}

// We're not using the entire blackholeFailer setup, so break the interface contract and use this directly
blackholeFailer := &blackholeFailer{t: t, c: c, input: true, output: true}
disconnectDuration := ldrWorkload.workload.(replicateKV).debugRunDuration / 5
t.L().Printf("Disconnecting nodes %v", nodesToFail)
for _, nodeID := range nodesToFail {
blackholeFailer.FailPartial(ctx, nodeID, setup.CRDBNodes())
}

// Sleep while workload continues
t.L().Printf("Sleeping for %.2f minutes", disconnectDuration.Minutes())
time.Sleep(disconnectDuration)

// Re-enable
blackholeFailer.Cleanup(ctx)
t.L().Printf("Nodes reconnected. Waiting for workload to complete")
disconnectDuration := ldrWorkload.workload.(replicateKV).debugRunDuration / 3
partitionPair(ctx, c, t, setup.left.nodes, disconnectDuration)

monitor.Wait()
VerifyCorrectness(ctx, c, t, setup, leftJobID, rightJobID, 5*time.Minute, ldrWorkload)
Expand Down

0 comments on commit 9b2c593

Please sign in to comment.