Skip to content

Commit 9d985ca

Browse files
authored
[NPU] add unit test retry for NPU UT, test=develop (#34443)
1 parent b9d6c98 commit 9d985ca

File tree

1 file changed

+65
-5
lines changed

1 file changed

+65
-5
lines changed

paddle/scripts/paddle_build.sh

Lines changed: 65 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1627,7 +1627,6 @@ function parallel_test_base_npu() {
16271627
EOF
16281628

16291629
set +x
1630-
ut_startTime_s=`date +%s`
16311630
test_cases=$(ctest -N -V) # get all test cases
16321631
get_quickly_disable_ut||disable_ut_quickly='' # indicate whether the case was in quickly disable list
16331632
while read -r line; do
@@ -1643,12 +1642,73 @@ set +x
16431642
done <<< "$test_cases";
16441643
card_test "$single_card_tests" 1
16451644
collect_failed_tests
1646-
set -x
1647-
ut_endTime_s=`date +%s`
1648-
echo "NPU testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s"
1645+
# add unit test retry for NPU
1646+
rm -f $tmp_dir/*
1647+
exec_times=0
1648+
retry_unittests_record=''
1649+
retry_time=3
1650+
exec_time_array=('first' 'second' 'third')
1651+
exec_retry_threshold=10
1652+
is_retry_execuate=0
1653+
if [ -n "$failed_test_lists" ];then
1654+
if [ ${TIMEOUT_DEBUG_HELP:-OFF} == "ON" ];then
1655+
bash $PADDLE_ROOT/tools/timeout_debug_help.sh "$failed_test_lists" # cat logs for timed-out uts which were killed by ctest
1656+
fi
1657+
read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
1658+
need_retry_ut_arr=(${need_retry_ut_str})
1659+
need_retry_ut_count=${#need_retry_ut_arr[@]}
1660+
read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
1661+
if [ $need_retry_ut_count -lt $exec_retry_threshold ];then
1662+
while ( [ $exec_times -lt $retry_time ] )
1663+
do
1664+
set +e
1665+
retry_unittests_record="$retry_unittests_record$failed_test_lists"
1666+
failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
1667+
set -e
1668+
if [[ "${exec_times}" == "1" ]];then
1669+
if [[ "${failed_test_lists}" == "" ]];then
1670+
break
1671+
else
1672+
read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
1673+
fi
1674+
fi
1675+
echo "========================================="
1676+
echo "This is the ${exec_time_array[$exec_times]} time to re-run"
1677+
echo "========================================="
1678+
echo "The following unittest will be re-run:"
1679+
echo "${retry_unittests}"
1680+
1681+
for line in ${retry_unittests[@]} ;
1682+
do
1683+
read tmp_one_tmp <<< "$( echo $single_card_tests | grep -oEi $line )"
1684+
if [[ "$tmp_one_tmp" != "" ]]; then
1685+
if [[ "$one_card_retry" == "" ]]; then
1686+
one_card_retry="^$line$"
1687+
else
1688+
one_card_retry="$one_card_retry|^$line$"
1689+
fi
1690+
fi
1691+
done
1692+
1693+
if [[ "$one_card_retry" != "" ]]; then
1694+
card_test "$one_card_retry" 1
1695+
fi
1696+
1697+
exec_times=$[$exec_times+1]
1698+
failed_test_lists=''
1699+
collect_failed_tests
1700+
rm -f $tmp_dir/*
1701+
one_card_retry=''
1702+
done
1703+
else
1704+
# There are 10 or more failed unit tests (exec_retry_threshold), so no unit test retry
1705+
is_retry_execuate=1
1706+
fi
1707+
fi
16491708
if [[ "$EXIT_CODE" != "0" ]]; then
1650-
exit 8;
1709+
show_ut_retry_result
16511710
fi
1711+
set -ex
16521712
fi
16531713
}
16541714

0 commit comments

Comments
 (0)