@@ -1627,7 +1627,6 @@ function parallel_test_base_npu() {
16271627EOF
16281628
16291629set +x
1630- ut_startTime_s=` date +%s`
16311630 test_cases=$( ctest -N -V) # get all test cases
16321631 get_quickly_disable_ut|| disable_ut_quickly=' ' # indicate whether the case was in quickly disable list
16331632 while read -r line; do
@@ -1643,12 +1642,73 @@ set +x
16431642 done <<< " $test_cases" ;
16441643 card_test " $single_card_tests " 1
16451644 collect_failed_tests
1646- set -x
1647- ut_endTime_s=` date +%s`
1648- echo " NPU testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s"
1645+ # add unit test retry for NPU
1646+ rm -f $tmp_dir /*
1647+ exec_times=0
1648+ retry_unittests_record=' '
1649+ retry_time=3
1650+ exec_time_array=(' first' ' second' ' third' )
1651+ exec_retry_threshold=10
1652+ is_retry_execuate=0
1653+ if [ -n " $failed_test_lists " ]; then
1654+ if [ ${TIMEOUT_DEBUG_HELP:- OFF} == " ON" ]; then
1655+ bash $PADDLE_ROOT /tools/timeout_debug_help.sh " $failed_test_lists " # cat logs for timed-out UTs that were killed by ctest
1656+ fi
1657+ read need_retry_ut_str <<< $( echo " $failed_test_lists " | grep -oEi " \-.+\(.+\)" | sed ' s/(.\+)//' | sed ' s/- //' )
1658+ need_retry_ut_arr=(${need_retry_ut_str} )
1659+ need_retry_ut_count=${# need_retry_ut_arr[@]}
1660+ read retry_unittests <<< $( echo " $failed_test_lists " | grep -oEi " \-.+\(.+\)" | sed ' s/(.\+)//' | sed ' s/- //' )
1661+ if [ $need_retry_ut_count -lt $exec_retry_threshold ]; then
1662+ while ( [ $exec_times -lt $retry_time ] )
1663+ do
1664+ set +e
1665+ retry_unittests_record=" $retry_unittests_record$failed_test_lists "
1666+ failed_test_lists_ult=` echo " ${failed_test_lists} " | grep -Po ' [^ ].*$' `
1667+ set -e
1668+ if [[ " ${exec_times} " == " 1" ]]; then
1669+ if [[ " ${failed_test_lists} " == " " ]]; then
1670+ break
1671+ else
1672+ read retry_unittests <<< $( echo " $failed_test_lists " | grep -oEi " \-.+\(.+\)" | sed ' s/(.\+)//' | sed ' s/- //' )
1673+ fi
1674+ fi
1675+ echo " ========================================="
1676+ echo " This is the ${exec_time_array[$exec_times]} time to re-run"
1677+ echo " ========================================="
1678+ echo " The following unittest will be re-run:"
1679+ echo " ${retry_unittests} "
1680+
1681+ for line in ${retry_unittests[@]} ;
1682+ do
1683+ read tmp_one_tmp <<< " $( echo $single_card_tests | grep -oEi $line )"
1684+ if [[ " $tmp_one_tmp " != " " ]]; then
1685+ if [[ " $one_card_retry " == " " ]]; then
1686+ one_card_retry=" ^$line $"
1687+ else
1688+ one_card_retry=" $one_card_retry |^$line $"
1689+ fi
1690+ fi
1691+ done
1692+
1693+ if [[ " $one_card_retry " != " " ]]; then
1694+ card_test " $one_card_retry " 1
1695+ fi
1696+
1697+ exec_times=$[$exec_times +1]
1698+ failed_test_lists=' '
1699+ collect_failed_tests
1700+ rm -f $tmp_dir /*
1701+ one_card_retry=' '
1702+ done
1703+ else
1704+ # There are more than 10 failed unit tests, so no unit test retry
1705+ is_retry_execuate=1
1706+ fi
1707+ fi
16491708 if [[ " $EXIT_CODE " != " 0" ]]; then
1650- exit 8 ;
1709+ show_ut_retry_result
16511710 fi
1711+ set -ex
16521712 fi
16531713}
16541714
0 commit comments