Skip to content
This repository has been archived by the owner on Jan 22, 2025. It is now read-only.

Add system test to measure recovery after partition #20902

Merged
merged 5 commits into from
Nov 8, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions system-test/automation_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,21 @@ function get_current_stake {
'$HOME/.cargo/bin/solana --url http://127.0.0.1:8899 validators --output=json | grep -o "totalCurrentStake\": [0-9]*" | cut -d: -f2'
}

#######################################
# Query InfluxDB for the rounded mean of "duration_ms" in the
# "validator-confirmation" measurement over a trailing time window.
#
# Globals:
#   TESTNET_TAG          (read)    database name and prefix of the measurement path
#   INFLUX_HOST          (read)    base URL of the InfluxDB HTTP endpoint
#   REPO_ROOT            (read)    used to locate testnet-automation-json-parser.py
#   mean_confirmation_ms (written) second space-separated field of the parser output;
#                                  cleared when the argument is rejected
# Arguments:
#   $1 - window length in whole seconds (digits only)
# Returns:
#   1 if $1 is not a non-negative integer; otherwise the status of the
#   assignment (i.e. of the last pipeline stage, `cut`).
#######################################
function get_validator_confirmation_time {
  # Keep the window local so it does not leak into the sourcing script.
  local since=$1

  # The value is spliced into the query text with a fixed "s" unit suffix, so
  # anything other than plain digits would produce a malformed or unintended query.
  if [[ ! $since =~ ^[0-9]+$ ]]; then
    echo "Error: get_validator_confirmation_time expects a duration in whole seconds, got '$since'" >&2
    mean_confirmation_ms=""
    return 1
  fi

  declare q_mean_confirmation='
    SELECT ROUND(MEAN("duration_ms")) as "mean_confirmation_ms"
    FROM "'"$TESTNET_TAG"'"."autogen"."validator-confirmation"
    WHERE time > now() - '"$since"'s'

  # The parser output is split on spaces and only the second field is kept.
  # NOTE(review): presumably the parser prints "<column>: <value>" on success —
  # confirm against testnet-automation-json-parser.py.
  mean_confirmation_ms=$(
    curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
      --data-urlencode "db=${TESTNET_TAG}" \
      --data-urlencode "q=$q_mean_confirmation" |
      python3 "${REPO_ROOT}"/system-test/testnet-automation-json-parser.py |
      cut -d' ' -f2
  )
}

function collect_performance_statistics {
execution_step "Collect performance statistics about run"
declare q_mean_tps='
Expand Down
22 changes: 22 additions & 0 deletions system-test/partition-testcases/gce-partition-recovery.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Pipeline step that runs system-test/testnet-automation.sh with the
# environment below, using measure-partition-recovery.sh as the custom script.
steps:
- command: "system-test/testnet-automation.sh"
label: "Partition recovery on GCE"
env:
UPLOAD_RESULTS_TO_SLACK: "false"
CLOUD_PROVIDER: "gce"
ENABLE_GPU: "false"
# measure-partition-recovery.sh adds 1 to this value when computing the node
# count it passes to `net.sh netem -n`.
NUMBER_OF_VALIDATOR_NODES: 4
VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
NUMBER_OF_CLIENT_NODES: 1
ADDITIONAL_FLAGS: "--dedicated"
SKIP_PERF_RESULTS: "true"
EXTRA_PRIMORDIAL_STAKES: 4
WAIT_FOR_EQUAL_STAKE: "true"
TEST_TYPE: "script"
WARMUP_SLOTS_BEFORE_TEST: 400
# Seconds the custom script sleeps before taking the baseline confirmation
# time (the script falls back to 60 when unset).
PRE_PARTITION_DURATION: 120
# Seconds the custom script keeps the netem config applied before cleaning it
# up (the script falls back to 300 when unset).
PARTITION_DURATION: 120
# Required by the custom script; it exits with an error when this is empty.
NETEM_CONFIG_FILE: "system-test/netem-configs/complete-loss-two-partitions"
CUSTOM_SCRIPT: "system-test/partition-testcases/measure-partition-recovery.sh"
agents:
- "queue=gce-deploy"
56 changes: 56 additions & 0 deletions system-test/partition-testcases/measure-partition-recovery.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env bash
#
# Measures how long validator confirmation time takes to return to its
# pre-partition level after a netem-induced partition is removed.
#
# Usage: measure-partition-recovery.sh <result-file>
#   <result-file>  file that the measurements are appended to
#
# Environment:
#   NETEM_CONFIG_FILE          (required) config passed to `net.sh netem --config-file`
#   PRE_PARTITION_DURATION     seconds to wait before taking the baseline (default 60)
#   PARTITION_DURATION         seconds to keep the partition applied (default 300)
#   NUMBER_OF_VALIDATOR_NODES  node count; 1 is added for the `-n` argument
#   NUMBER_OF_OFFLINE_NODES    optional; subtracted from the `-n` argument
#   TESTNET_TAG                defaults to "${CLOUD_PROVIDER}-testnet-automation"
#   REPO_ROOT                  read but not set here
#                              NOTE(review): presumably provided by automation_utils.sh — confirm

set -ex

# shellcheck disable=SC1090
# shellcheck disable=SC1091
source "$(dirname "$0")"/../automation_utils.sh

RESULT_FILE="$1"

# Fail before the long sleeps rather than at the first append to an empty path.
if [[ -z $RESULT_FILE ]]; then
  echo "Error: path of the result file must be given as the first argument" >&2
  exit 1
fi

TESTNET_TAG=${TESTNET_TAG:-${CLOUD_PROVIDER}-testnet-automation}

if [[ -z $NETEM_CONFIG_FILE ]]; then
  echo "Error: For this test NETEM_CONFIG_FILE must be specified" >&2
  exit 1
fi

PRE_PARTITION_DURATION=${PRE_PARTITION_DURATION:-60}
PARTITION_DURATION=${PARTITION_DURATION:-300}

num_online_nodes=$(( NUMBER_OF_VALIDATOR_NODES + 1 ))
if [[ -n "$NUMBER_OF_OFFLINE_NODES" ]]; then
  num_online_nodes=$(( num_online_nodes - NUMBER_OF_OFFLINE_NODES ))
fi

# Baseline: let the cluster run undisturbed, then average over that same window.
execution_step "Measuring validator confirmation time for $PRE_PARTITION_DURATION seconds"
sleep "$PRE_PARTITION_DURATION"
get_validator_confirmation_time "$PRE_PARTITION_DURATION"
# shellcheck disable=SC2154
execution_step "Pre partition validator confirmation time is $mean_confirmation_ms ms"
echo "Pre partition validator confirmation time: $mean_confirmation_ms ms" >> "$RESULT_FILE"

execution_step "Applying partition config $NETEM_CONFIG_FILE for $PARTITION_DURATION seconds"
"${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" -n "$num_online_nodes"
sleep "$PARTITION_DURATION"

execution_step "Resolving partition"
"${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup -n "$num_online_nodes"

# Use bash's SECONDS counter so the reported recovery time is wall-clock time
# since cleanup, including the time each query takes, not just the number of
# one-second sleeps.
recovery_start=$SECONDS

# The baseline measured before the partition is the recovery target.
target=$mean_confirmation_ms
get_validator_confirmation_time 10
echo "Validator confirmation is $mean_confirmation_ms ms immediately after the partition" >> "$RESULT_FILE"

# Poll the trailing 10-second mean until it is no longer above the baseline.
# NOTE(review): "expected" is presumably the second word of the JSON parser's
# "no result" message, i.e. no data points yet — confirm against the parser.
while [[ $mean_confirmation_ms == "expected" || $mean_confirmation_ms -gt $target ]]; do
  sleep 1
  get_validator_confirmation_time 10
done
recovery_seconds=$(( SECONDS - recovery_start ))

echo "$recovery_seconds seconds after resolving the partition, validator confirmation time fell to $mean_confirmation_ms" >> "$RESULT_FILE"