Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update a100 loss #8708

Merged
merged 4 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ uvicorn
typer
rich
safetensors
-tool_helpers ; platform_system == "Linux"
+tool_helpers==0.1.1 ; platform_system == "Linux"
aistudio-sdk>=0.1.3
jinja2
regex
Expand Down
22 changes: 11 additions & 11 deletions scripts/distribute/ci_case_auto.sh
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ function llama_static_auto_recompute_bs16_fp16_DP2-MP2-PP2-VPP2-Sharding2_stage2
echo "result: loss=$loss ips=$ips mem=$mem"
loss_base=10.0859375
if [ $IS_A100 -ne 0 ];then
-loss_base=10.125
+loss_base=10.390625
fi
ips_base=-1
mem_base=-1
Expand Down Expand Up @@ -551,7 +551,7 @@ function llama_dygraph_auto_bs8_fp32_DP2() {
echo "result: loss=$loss ips=$ips mem=$mem"
loss_base=9.51876831
if [ $IS_A100 -ne 0 ];then
-loss_base=9.54253578
+loss_base=9.53083992
fi
ips_base=-1
mem_base=-1
Expand Down Expand Up @@ -621,7 +621,7 @@ function llama_dygraph_auto_bs8_fp32_DP2-MP2() {
echo "result: loss=$loss ips=$ips mem=$mem"
loss_base=9.35078526
if [ $IS_A100 -ne 0 ];then
-loss_base=9.41613197
+loss_base=9.38577652
fi
ips_base=-1
mem_base=-1
Expand Down Expand Up @@ -691,7 +691,7 @@ function llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
echo "result: loss=$loss ips=$ips mem=$mem"
loss_base=9.35139465
if [ $IS_A100 -ne 0 ];then
-loss_base=9.4053154
+loss_base=9.39356422
fi
ips_base=-1
mem_base=-1
Expand Down Expand Up @@ -762,7 +762,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
echo "result: loss=$loss ips=$ips mem=$mem"
loss_base=9.41604424
if [ $IS_A100 -ne 0 ];then
-loss_base=9.4055109
+loss_base=9.46169376
fi
ips_base=-1
mem_base=-1
Expand Down Expand Up @@ -862,7 +862,7 @@ function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2() {
ips=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
mem=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
echo "result: loss=$loss ips=$ips mem=$mem"
-loss_base=7.5364624
+loss_base=7.54158936
ips_base=5442.5208
mem_base=22.387750148773193
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
Expand Down Expand Up @@ -961,7 +961,7 @@ function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2-VPP3_split_bw() {
ips=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
mem=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
echo "result: loss=$loss ips=$ips mem=$mem"
-loss_base=7.5364624
+loss_base=7.54158936
ips_base=5864.2898
mem_base=23.745134115219116
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
Expand Down Expand Up @@ -1033,7 +1033,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
ips_base=-1
mem_base=-1
if [ $IS_A100 -ne 0 ];then
-loss_base=10.58541679
+loss_base=10.60499191
fi
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
Expand Down Expand Up @@ -1106,7 +1106,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2() {
ips_base=-1
mem_base=-1
if [ $IS_A100 -ne 0 ];then
-loss_base=10.58452606
+loss_base=10.59338379
fi
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
Expand Down Expand Up @@ -1179,7 +1179,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
ips_base=-1
mem_base=-1
if [ $IS_A100 -ne 0 ];then
-loss_base=10.57996178
+loss_base=10.59612274
fi
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
Expand Down Expand Up @@ -1252,7 +1252,7 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
ips_base=-1
mem_base=-1
if [ $IS_A100 -ne 0 ];then
-loss_base=10.58061218
+loss_base=10.58141422
fi
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
Expand Down
1 change: 1 addition & 0 deletions scripts/distribute/run_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ export log_path=/workspace/case_logs
export case_list=()

target_lists_for_gpt=(
+"legacy/model_zoo/gpt-3"
"llm/auto_parallel/gpt-3"
"paddlenlp/transformers/gpt/modeling.py"
"paddlenlp/transformers/gpt/modeling_pp.py"
Expand Down
Loading