diff --git a/system-test/automation_utils.sh b/system-test/automation_utils.sh index a08f07576a3435..f4a8d405611659 100755 --- a/system-test/automation_utils.sh +++ b/system-test/automation_utils.sh @@ -121,7 +121,7 @@ function get_validator_confirmation_time { curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \ --data-urlencode "db=${TESTNET_TAG}" \ --data-urlencode "q=$q_mean_confirmation" | - python3 "${REPO_ROOT}"/system-test/testnet-automation-json-parser.py | + python3 "${REPO_ROOT}"/system-test/testnet-automation-json-parser.py --empty_error | cut -d' ' -f2) } diff --git a/system-test/partition-testcases/gce-partition-recovery.yml b/system-test/partition-testcases/gce-partition-recovery.yml index bd610d6a6d2125..2463ef6018c2b3 100755 --- a/system-test/partition-testcases/gce-partition-recovery.yml +++ b/system-test/partition-testcases/gce-partition-recovery.yml @@ -2,10 +2,10 @@ steps: - command: "system-test/testnet-automation.sh" label: "Partition recovery on GCE" env: - UPLOAD_RESULTS_TO_SLACK: "false" + UPLOAD_RESULTS_TO_SLACK: "true" CLOUD_PROVIDER: "gce" ENABLE_GPU: "false" - NUMBER_OF_VALIDATOR_NODES: 4 + NUMBER_OF_VALIDATOR_NODES: 9 VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16" NUMBER_OF_CLIENT_NODES: 1 ADDITIONAL_FLAGS: "--dedicated" @@ -15,8 +15,8 @@ steps: TEST_TYPE: "script" WARMUP_SLOTS_BEFORE_TEST: 400 PRE_PARTITION_DURATION: 120 - PARTITION_DURATION: 600 - PARTITION_INCREMENT: 120 + PARTITION_DURATION: 360 + PARTITION_INCREMENT: 60 NETEM_CONFIG_FILE: "system-test/netem-configs/complete-loss-two-partitions" CUSTOM_SCRIPT: "system-test/partition-testcases/measure-partition-recovery.sh" agents: diff --git a/system-test/partition-testcases/measure-partition-recovery.sh b/system-test/partition-testcases/measure-partition-recovery.sh index 03ff1995ac06f5..3c1df03bdef9c2 100755 --- a/system-test/partition-testcases/measure-partition-recovery.sh +++ b/system-test/partition-testcases/measure-partition-recovery.sh @@ -42,6 +42,7 @@ 
target=$mean_confirmation_ms while true; do execution_step "Applying partition config $NETEM_CONFIG_FILE for $PARTITION_DURATION seconds" + echo "Partitioning for $PARTITION_DURATION seconds" >> "$RESULT_FILE" "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" -n $num_online_nodes sleep "$PARTITION_DURATION" @@ -49,21 +50,33 @@ while true; do "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup -n $num_online_nodes get_validator_confirmation_time 10 - time=0 - echo "Validator confirmation is $mean_confirmation_ms ms immediately after resolving the partition" + SECONDS=0 - while [[ $mean_confirmation_ms == "expected" || $mean_confirmation_ms -gt $target ]]; do - sleep 1 - time=$(( time + 1 )) + # This happens when we haven't confirmed anything recently so the query returns an empty string + while [[ -z $mean_confirmation_ms ]]; do + sleep 5 + get_validator_confirmation_time 10 + if [[ $SECONDS -gt $PARTITION_DURATION ]]; then + echo " No confirmations seen after $SECONDS seconds" >> "$RESULT_FILE" + exit 0 + fi + done + echo " Validator confirmation is $mean_confirmation_ms ms $SECONDS seconds after resolving the partition" >> "$RESULT_FILE" + + last="" + while [[ -z $mean_confirmation_ms || $mean_confirmation_ms -gt $target ]]; do + sleep 5 - if [[ $time -gt $PARTITION_DURATION ]]; then - echo "Partition Duration: $PARTITION_DURATION: Unable to make progress after $time seconds. Confirmation time did not fall below pre partition confirmation time" >> "$RESULT_FILE" + # Give up when the latest sample exceeds 120% of the previous one; integer math is used because bc prints a decimal that the integer-only -gt test cannot parse + if [[ -n $mean_confirmation_ms && -n $last && $mean_confirmation_ms -gt $(( last * 12 / 10 )) || $SECONDS -gt $PARTITION_DURATION ]]; then + echo " Unable to make progress after $SECONDS seconds. 
Last confirmation time was $mean_confirmation_ms ms" >> "$RESULT_FILE" exit 0 fi + last=$mean_confirmation_ms get_validator_confirmation_time 10 done - echo "Partition Duration: $PARTITION: $time seconds for validator confirmation to fall to $mean_confirmation_ms ms" >> "$RESULT_FILE" + echo " Recovered in $SECONDS seconds: validator confirmation to fall to $mean_confirmation_ms ms" >> "$RESULT_FILE" PARTITION_DURATION=$(( PARTITION_DURATION + PARTITION_INCREMENT )) done diff --git a/system-test/testnet-automation-json-parser.py b/system-test/testnet-automation-json-parser.py index 10f82b13e60df7..37959b025855b0 100755 --- a/system-test/testnet-automation-json-parser.py +++ b/system-test/testnet-automation-json-parser.py @@ -1,5 +1,9 @@ #!/usr/bin/env python3 -import sys, json +import sys, json, argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--empty_error", action="store_true", help="If present, do not print error message") +args = parser.parse_args() data=json.load(sys.stdin) @@ -7,7 +11,7 @@ for result in data['results']: if 'series' in result: print(result['series'][0]['columns'][1] + ': ' + str(result['series'][0]['values'][0][1])) - else: + elif not args.empty_error: print("An expected result from CURL request is missing") -else: +elif not args.empty_error: print("No results returned from CURL request")