From d1933ab72f09b068dfd20780f81c77db4cbba007 Mon Sep 17 00:00:00 2001
From: Jason Nochlin <91577+hundredwatt@users.noreply.github.com>
Date: Sat, 13 Jan 2024 04:19:29 -0700
Subject: [PATCH] evaluate2.sh: Check output of warmup run and abort early if failed (#333)

* refactor: replace xtrace with "print_and_execute" function

* nit: stylize error messages

* replace out_expected.txt with measurements_1B.out

* print

* prevent errors on cleanup

* run tests and check warmup run output before running benchmark

* move "git diff" pretty diff output to test.sh

* Ensure "set -e" is re-enabled if we followed a "continue" branch

* add timeouts to test.sh invocations

* use diff with tocsv.sh to show differences on failed test

* add --quiet mode to test.sh

* move prepare_$fork.sh invocation to right below hyperfine since test.sh also invokes it

* Revert "add --quiet mode to test.sh"

This reverts commit 13e9fb7f395c1bd64a62528b8349803bc1366941.

* use tee to capture test output to a temp file and print contents on failure

---------

Co-authored-by: Jason Nochlin
---
 evaluate2.sh | 137 ++++++++++++++++++++++++++++++---------------------
 test.sh      |   2 +-
 2 files changed, 83 insertions(+), 56 deletions(-)

diff --git a/evaluate2.sh b/evaluate2.sh
index 04eebb6332..8a9e42f875 100755
--- a/evaluate2.sh
+++ b/evaluate2.sh
@@ -33,9 +33,22 @@ RED='\033[0;31m'
 BOLD_YELLOW='\033[1;33m'
 RESET='\033[0m' # No Color
 
+MEASUREMENTS_FILE="measurements_1B.txt"
+RUNS=5
 DEFAULT_JAVA_VERSION="21.0.1-open"
 RUN_TIME_LIMIT=300 # seconds
 
+TIMEOUT=""
+if [ "$(uname -s)" == "Linux" ]; then
+  TIMEOUT="timeout -v $RUN_TIME_LIMIT"
+else # MacOs
+  if [ -x "$(command -v gtimeout)" ]; then
+    TIMEOUT="gtimeout -v $RUN_TIME_LIMIT" # from `brew install coreutils`
+  else
+    echo -e "${BOLD_YELLOW}WARNING${RESET} gtimeout not available, benchmark runs may take indefinitely long."
+  fi
+fi
+
 function check_command_installed {
   if ! [ -x "$(command -v $1)" ]; then
     echo "Error: $1 is not installed." >&2
@@ -43,6 +56,11 @@ function check_command_installed {
   fi
 }
 
+function print_and_execute() {
+  echo "+ $@" >&2
+  "$@"
+}
+
 check_command_installed java
 check_command_installed hyperfine
 check_command_installed jq
@@ -51,7 +69,7 @@ check_command_installed bc
 # Validate that ./calculate_average_<fork>.sh exists for each fork
 for fork in "$@"; do
   if [ ! -f "./calculate_average_$fork.sh" ]; then
-    echo "Error: ./calculate_average_$fork.sh does not exist." >&2
+    echo -e "${BOLD_RED}ERROR${RESET}: ./calculate_average_$fork.sh does not exist." >&2
     exit 1
   fi
 done
@@ -59,7 +77,7 @@ done
 ## SDKMAN Setup
 # 1. Custom check for sdkman installed; not sure why check_command_installed doesn't detect it properly
 if [ ! -f "$HOME/.sdkman/bin/sdkman-init.sh" ]; then
-  echo "Error: sdkman is not installed." >&2
+  echo -e "${BOLD_RED}ERROR${RESET}: sdkman is not installed." >&2
   exit 1
 fi
 
@@ -68,8 +86,7 @@ source "$HOME/.sdkman/bin/sdkman-init.sh"
 
 # 3. make sure the default java version is installed
 if [ ! -d "$HOME/.sdkman/candidates/java/$DEFAULT_JAVA_VERSION" ]; then
-  echo "+ sdk install java $DEFAULT_JAVA_VERSION"
-  sdk install java $DEFAULT_JAVA_VERSION
+  print_and_execute sdk install java $DEFAULT_JAVA_VERSION
 fi
 
 # 4. Install missing SDK java versions in any of the prepare_*.sh scripts for the provided forks
@@ -77,8 +94,7 @@ for fork in "$@"; do
   if [ -f "./prepare_$fork.sh" ]; then
     grep -h "^sdk use" "./prepare_$fork.sh" | cut -d' ' -f4 | while read -r version; do
       if [ ! -d "$HOME/.sdkman/candidates/java/$version" ]; then
-        echo "+ sdk install java $version"
-        sdk install java $version
+        print_and_execute sdk install java $version
       fi
     done || true # grep returns exit code 1 when no match, `|| true` prevents the script from exiting early
   fi
@@ -99,44 +115,68 @@ if [ -f "/sys/devices/system/cpu/cpufreq/boost" ]; then
   fi
 fi
 
-set -o xtrace
+print_and_execute java --version
+print_and_execute ./mvnw --quiet clean verify
 
-java --version
-
-./mvnw --quiet clean verify
-
-rm -f measurements.txt
-ln -s measurements_1B.txt measurements.txt
-
-set +o xtrace
+print_and_execute rm -f measurements.txt
+print_and_execute ln -s $MEASUREMENTS_FILE measurements.txt
 
 echo ""
 
-# check if out_expected.txt exists
-if [ ! -f "out_expected.txt" ]; then
-  echo "Error: out_expected.txt does not exist." >&2
+# check if measurements_xxx.out exists
+if [ ! -f "${MEASUREMENTS_FILE%.txt}.out" ]; then
+  echo -e "${BOLD_RED}ERROR${RESET}: ${MEASUREMENTS_FILE%.txt}.out does not exist." >&2
   echo "Please create it with:"
-  echo "  ./calculate_average_baseline.sh > out_expected.txt"
+  echo ""
+  echo "  ./calculate_average_baseline.sh > ${MEASUREMENTS_FILE%.txt}.out"
+  echo ""
   exit 1
 fi
 
-# Prepare commands for running benchmarks for each of the forks
+# Run tests and benchmark for each fork
 filetimestamp=$(date +"%Y%m%d%H%M%S")  # same for all fork.out files from this run
 failed=()
+test_output=$(mktemp)
 for fork in "$@"; do
-  # Use prepare script to invoke SDKMAN
+  set +e # we don't want prepare.sh, test.sh or hyperfine failing on 1 fork to exit the script early
+
+  # Run the test suite
+  print_and_execute $TIMEOUT ./test.sh $fork | tee $test_output > /dev/null 2>&1
+  if [ $? -ne 0 ]; then
+    failed+=("$fork")
+    echo ""
+    echo -e "${BOLD_RED}FAILURE${RESET}: ./test.sh $fork failed"
+    cat $test_output
+    echo ""
+
+    continue
+  fi
+
+  # Run the test on $MEASUREMENTS_FILE; this serves as the warmup
+  print_and_execute $TIMEOUT ./test.sh $fork $MEASUREMENTS_FILE | tee $test_output > /dev/null 2>&1
+  if [ $? -ne 0 ]; then
+    failed+=("$fork")
+    echo ""
+    echo -e "${BOLD_RED}FAILURE${RESET}: ./test.sh $fork $MEASUREMENTS_FILE failed"
+    cat $test_output
+    echo ""
+
+    continue
+  fi
+
+  # re-link measurements.txt since test.sh deleted it
+  print_and_execute rm -f measurements.txt
+  print_and_execute ln -s $MEASUREMENTS_FILE measurements.txt
+
+  # Run prepare script
   if [ -f "./prepare_$fork.sh" ]; then
-    echo "+ source ./prepare_$fork.sh"
-    source "./prepare_$fork.sh"
+    print_and_execute source "./prepare_$fork.sh"
   else
-    echo "+ sdk use java $DEFAULT_JAVA_VERSION"
-    sdk use java $DEFAULT_JAVA_VERSION
+    print_and_execute sdk use java $DEFAULT_JAVA_VERSION
   fi
 
-  # Use hyperfine to run the benchmarks for each fork
-  HYPERFINE_OPTS="--warmup 1 --runs 5 --export-json $fork-$filetimestamp-timing.json --output ./$fork-$filetimestamp.out"
-
-  set +e # we don't want hyperfine or diff failing on 1 fork to exit the script early
+  # Use hyperfine to run the benchmark for each fork
+  HYPERFINE_OPTS="--warmup 0 --runs $RUNS --export-json $fork-$filetimestamp-timing.json --output ./$fork-$filetimestamp.out"
 
   # check if this script is running on a Linux box
   if [ "$(uname -s)" == "Linux" ]; then
@@ -144,36 +184,20 @@ for fork in "$@"; do
     # Linux platform
     # prepend this with numactl --physcpubind=0-7 for running it only with 8 cores
-    numactl --physcpubind=0-7 hyperfine $HYPERFINE_OPTS "timeout -v $RUN_TIME_LIMIT ./calculate_average_$fork.sh 2>&1"
+    numactl --physcpubind=0-7 hyperfine $HYPERFINE_OPTS "$TIMEOUT ./calculate_average_$fork.sh 2>&1"
   else # MacOS
-    timeout=""
-    if [ -x "$(command -v gtimeout)" ]; then
-      timeout="gtimeout -v $RUN_TIME_LIMIT" # from `brew install coreutils`
-    else
-      echo -e "${BOLD_YELLOW}WARNING${RESET} gtimeout not available, benchmark runs may take indefinitely long."
-    fi
-    hyperfine $HYPERFINE_OPTS "$timeout ./calculate_average_$fork.sh 2>&1"
+    hyperfine $HYPERFINE_OPTS "$TIMEOUT ./calculate_average_$fork.sh 2>&1"
   fi
 
   # Catch hyperfine command failed
   if [ $? -ne 0 ]; then
     failed+=("$fork")
+    # Hyperfine already prints the error message
     echo ""
+    continue
   fi
-
-  # Verify output
-  diff <(grep Hamburg $fork-$filetimestamp.out) <(grep Hamburg out_expected.txt) > /dev/null
-  if [ $? -ne 0 ]; then
-    echo ""
-    echo -e "${BOLD_RED}FAILURE${RESET}: output of ${BOLD_WHITE}$fork-$filetimestamp.out${RESET} does not match ${BOLD_WHITE}out_expected.txt${RESET}"
-    echo ""
-
-    git diff --no-index --word-diff out_expected.txt $fork-$filetimestamp.out
-
-    # add $fork to $failed array
-    failed+=("$fork")
-  fi
-  set -e
 done
+set -e
+rm $test_output
 
 # Summary
 echo -e "${BOLD_WHITE}Summary${RESET}"
@@ -285,9 +309,12 @@ rm $leaderboard_temp_file
 # Finalize .out files
 echo "Raw results saved to file(s):"
 for fork in "$@"; do
-  # Append $fork-$filetimestamp-timing.json to $fork-$filetimestamp.out and rm $fork-$filetimestamp-timing.json
-  cat $fork-$filetimestamp-timing.json >> $fork-$filetimestamp.out
-  rm $fork-$filetimestamp-timing.json
+  if [ -f "$fork-$filetimestamp-timing.json" ]; then
+    cat $fork-$filetimestamp-timing.json >> $fork-$filetimestamp.out
+    rm $fork-$filetimestamp-timing.json
+  fi
 
-  echo "  $fork-$filetimestamp.out"
+  if [ -f "$fork-$filetimestamp.out" ]; then
+    echo "  $fork-$filetimestamp.out"
+  fi
 done
diff --git a/test.sh b/test.sh
index c45ca755c7..faf85f4df9 100755
--- a/test.sh
+++ b/test.sh
@@ -45,7 +45,7 @@ for sample in $(ls $INPUT); do
   rm -f measurements.txt
   ln -s $sample measurements.txt
 
-  diff <("./calculate_average_$FORK.sh") ${sample%.txt}.out
+  diff <("./calculate_average_$FORK.sh" | ./tocsv.sh) <(./tocsv.sh < ${sample%.txt}.out)
 done
 
 rm measurements.txt
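
Note: the central pattern this patch introduces is "run the test/warmup, capture its output to a temp file via tee, and print that output only if the run failed, then continue to the next fork". Below is a minimal standalone sketch of that pattern; the command name ./run_fork.sh and the use of pipefail are illustrative assumptions, not taken from the repository scripts.

    #!/usr/bin/env bash
    # Sketch: capture a command's combined output to a temp file and show it only on failure.
    # ./run_fork.sh stands in for whichever fork script is under test (assumption).
    # pipefail makes the pipeline's exit status reflect the tested command rather than tee.
    set -o pipefail

    test_output=$(mktemp)

    if ! ./run_fork.sh 2>&1 | tee "$test_output" > /dev/null; then
        echo "FAILURE: ./run_fork.sh failed; captured output follows:"
        cat "$test_output"
    fi

    rm "$test_output"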