Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add weekly tests for memory growth #3101

Merged
merged 26 commits into from
Jul 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions qa/L0_client_memory_growth/client_memory_mail.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -35,9 +35,11 @@

if __name__ == '__main__':
today = date.today().strftime("%Y-%m-%d")
subject = "Triton Client Memory Growth Summary: " + today
subject = "Triton Client Memory Growth " + sys.argv[1] + " Summary: " + today
memory_graphs = glob.glob("client_memory_growth*.log")
html_content = "<html><head></head><body><pre style=\"font-size:11pt;font-family:Consolas;\">"
write_up = "<p>This test is run for both HTTP and GRPC protocols using C++ and Python test scripts. The max-allowed difference between mean and maximum memory usage is set to 10MB and 1MB for C++ and Python tests respectively.</p>"
write_up += "<p><b>&#8226; What to look for</b><br>A linear memory growth in the beginning of the graph is acceptable only when it is followed by a flat memory usage. If a linear memory growth is observed during the entire test then there is possibly a memory leak.</p>"
html_content = "<html><head></head><body><pre style=\"font-size:11pt;font-family:Arial, sans-serif;\">" + write_up + "</pre><pre style=\"font-size:11pt;font-family:Consolas;\">"
for mem_graph in sorted(memory_graphs):
html_content += "\n" + mem_graph + "\n"
with open(mem_graph, "r") as f:
Expand Down
20 changes: 16 additions & 4 deletions qa/L0_client_memory_growth/test.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -53,6 +53,18 @@ SERVER=/opt/tritonserver/bin/tritonserver
SERVER_ARGS="--model-repository=$DATADIR"
source ../common/util.sh

# Set the number of repetitions in nightly and weekly tests
# Set the email subject for nightly and weekly tests
if [ "$TRITON_PERF_WEEKLY" == 1 ]; then
REPETITION_CPP=2000000
REPETITION_PY=2400000
EMAIL_SUBJECT="Weekly"
else
REPETITION_CPP=100000
REPETITION_PY=10000
EMAIL_SUBJECT="Nightly"
fi

mkdir -p $DATADIR/custom_identity_int32/1

RET=0
Expand All @@ -77,11 +89,11 @@ for PROTOCOL in http grpc; do
if [ "$LANG" == "c++" ]; then
MEMORY_GROWTH_TEST=$MEMORY_GROWTH_TEST_CPP
MAX_ALLOWED_ALLOC="10"
EXTRA_ARGS="-r 100000 -i ${PROTOCOL}"
EXTRA_ARGS="-r ${REPETITION_CPP} -i ${PROTOCOL}"
else
MEMORY_GROWTH_TEST="python $MEMORY_GROWTH_TEST_PY"
MAX_ALLOWED_ALLOC="1"
EXTRA_ARGS="-r 10000 -i ${PROTOCOL}"
EXTRA_ARGS="-r ${REPETITION_PY} -i ${PROTOCOL}"
fi

$LEAKCHECK $LEAKCHECK_ARGS $MEMORY_GROWTH_TEST $EXTRA_ARGS >> ${CLIENT_LOG} 2>&1
Expand Down Expand Up @@ -125,7 +137,7 @@ fi

# Run only if either TRITON_FROM or TRITON_TO_DL is set (condition uses ||)
if [[ ! -z "$TRITON_FROM" ]] || [[ ! -z "$TRITON_TO_DL" ]]; then
python client_memory_mail.py
python client_memory_mail.py $EMAIL_SUBJECT
fi

exit $RET
23 changes: 18 additions & 5 deletions qa/L0_memory_growth/server_memory_mail.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -35,12 +35,25 @@

CoderHam marked this conversation as resolved.
Show resolved Hide resolved
if __name__ == '__main__':
today = date.today().strftime("%Y-%m-%d")
subject = "Triton Server Memory Growth Summary: " + today
memory_graphs = glob.glob("memory_growth*.log")
html_content = "<html><head></head><body><pre style=\"font-size:11pt;font-family:Consolas;\">"
for mem_graph in sorted(memory_graphs):
subject = "Triton Server Memory Growth " + sys.argv[1] + " Summary: " + today
memory_graphs_resnet = glob.glob("memory_growth_resnet*.log")
memory_graphs_busyop = glob.glob("memory_growth_busyop.log")
write_up = "<p>This test uses perf_analyzer as clients running on 4 different models. The max allowed difference between mean and maximum memory usage is set to 150MB.</p>"
write_up += "<p><b>&#8226; What to look for</b><br>A linear memory growth in the beginning of the graph is acceptable only when it is followed by a flat memory usage. If a linear memory growth is observed during the entire test then there is possibly a memory leak.</p>"
html_content = "<html><head></head><body><pre style=\"font-size:11pt;font-family:Arial, sans-serif;\">" + write_up + "</pre><pre style=\"font-size:11pt;font-family:Consolas;\">"
for mem_graph in sorted(memory_graphs_resnet):
html_content += "\n" + mem_graph + "\n"
with open(mem_graph, "r") as f:
html_content += f.read() + "\n"
# The busy op model causes PTX failures when running the CI.
# Should be uncommented when it's ready for merging.
# TODO Uncomment after PTX issues are resolved.
# write_up = "<p>The busyop test is by design to show that actual memory growth is correctly detected and displayed.</p>"
# write_up += "<p><b>&#8226; What to look for</b><br>The memory usage should increase continually over time, and a linear growth should be observed in the graph below.</p>"
# html_content += "</pre><pre style=\"font-size:11pt;font-family:Arial, sans-serif;\">" + write_up + "</pre><pre style=\"font-size:11pt;font-family:Consolas;\">"
# for mem_graph in sorted(memory_graphs_busyop):
# html_content += "\n" + mem_graph + "\n"
# with open(mem_graph, "r") as f:
# html_content += f.read() + "\n"
html_content += "</pre></body></html>"
nightly_email_helper.send(subject, html_content, is_html=True)
129 changes: 71 additions & 58 deletions qa/L0_memory_growth/test.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -70,6 +70,16 @@ INSTANCE_CNT=2
CONCURRENCY=32
CLIENT_BS=8

# Set the number of repetitions in nightly and weekly tests
# Set the email subject for nightly and weekly tests
if [ "$TRITON_PERF_WEEKLY" == 1 ]; then
REPETITION=200
EMAIL_SUBJECT="Weekly"
else
REPETITION=3
EMAIL_SUBJECT="Nightly"
fi

# Threshold memory growth in MB
MAX_ALLOWED_ALLOC="150"
export MAX_ALLOWED_ALLOC
Expand Down Expand Up @@ -131,8 +141,8 @@ for MODEL in $(ls models); do

set +e

# Run the perf analyzer 3 times
for i in {1..3}; do
# Run the perf analyzer 'REPETITION' times
for ((i=1; i<=$REPETITION; i++)); do
$PERF_ANALYZER -v -m $MODEL -i grpc --concurrency-range $CONCURRENCY -b $CLIENT_BS >> $CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
Expand Down Expand Up @@ -164,60 +174,63 @@ done
# Next perform a test that has unbound memory growth. Use the busy op model
# with a high delay in order to force requests to sit in the queue, and result
# in memory growth.
BUSY_OP_TEST=busy_op_test.py
DELAY_CYCLES=2100000000
NUM_REQUESTS=100

rm -rf test_repo && mkdir test_repo
cp -r ${DATADIR}/qa_custom_ops/tf_custom_ops/graphdef_busyop test_repo/

# Explicitly set library path so custom ops can find TF
LD_LIBRARY_PATH=/opt/tritonserver/backends/tensorflow1
SERVER_ARGS="--model-repository=`pwd`/test_repo"
SERVER_LD_PRELOAD="${DATADIR}/qa_custom_ops/tf_custom_ops/libbusyop.so"

LEAKCHECK_LOG="test_busyop.valgrind.log"
MASSIF_LOG="test_busyop.massif"
GRAPH_LOG="memory_growth_busyop.log"
LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --massif-out-file=$MASSIF_LOG --max-threads=3000 --log-file=$LEAKCHECK_LOG"
SERVER_LOG="test_busyop.server.log"
CLIENT_LOG="test_busyop.client.log"

# Run server
run_server_leakcheck
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

set +e

# Run the busy_op test
python $BUSY_OP_TEST -v -m graphdef_busyop -d $DELAY_CYCLES -n $NUM_REQUESTS > $CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test graphdef_busyop Failed\n***"
RET=1
fi
set -e

# Stop Server
kill $SERVER_PID
wait $SERVER_PID

set +e

ms_print ${MASSIF_LOG} | head -n35 >> ${GRAPH_LOG}
cat ${GRAPH_LOG}
# Check the massif output
python $MASSIF_TEST $MASSIF_LOG $MAX_ALLOWED_ALLOC --start-from-middle >> $CLIENT_LOG 2>&1
if [ $? -ne 1 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test for graphdef_busyop Failed\n***"
RET=1
fi
set -e
# The busy op model causes PTX failures when running the CI.
# Should be uncommented when it's ready for merging.
# TODO Re-enable after PTX issues are resolved.
# BUSY_OP_TEST=busy_op_test.py
# DELAY_CYCLES=2100000000
# NUM_REQUESTS=100

# rm -rf test_repo && mkdir test_repo
# cp -r ${DATADIR}/qa_custom_ops/tf_custom_ops/graphdef_busyop test_repo/

# # Explicitly set library path so custom ops can find TF
# LD_LIBRARY_PATH=/opt/tritonserver/backends/tensorflow1
# SERVER_ARGS="--model-repository=`pwd`/test_repo"
# SERVER_LD_PRELOAD="${DATADIR}/qa_custom_ops/tf_custom_ops/libbusyop.so"

# LEAKCHECK_LOG="test_busyop.valgrind.log"
# MASSIF_LOG="test_busyop.massif"
# GRAPH_LOG="memory_growth_busyop.log"
# LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --massif-out-file=$MASSIF_LOG --max-threads=3000 --log-file=$LEAKCHECK_LOG"
# SERVER_LOG="test_busyop.server.log"
# CLIENT_LOG="test_busyop.client.log"

# # Run server
# run_server_leakcheck
# if [ "$SERVER_PID" == "0" ]; then
# echo -e "\n***\n*** Failed to start $SERVER\n***"
# cat $SERVER_LOG
# exit 1
# fi

# set +e

# # Run the busy_op test
# python $BUSY_OP_TEST -v -m graphdef_busyop -d $DELAY_CYCLES -n $NUM_REQUESTS > $CLIENT_LOG 2>&1
# if [ $? -ne 0 ]; then
# cat $CLIENT_LOG
# echo -e "\n***\n*** Test graphdef_busyop Failed\n***"
# RET=1
# fi
# set -e

# # Stop Server
# kill $SERVER_PID
# wait $SERVER_PID

# set +e

# ms_print ${MASSIF_LOG} | head -n35 >> ${GRAPH_LOG}
# cat ${GRAPH_LOG}
# # Check the massif output
# python $MASSIF_TEST $MASSIF_LOG $MAX_ALLOWED_ALLOC --start-from-middle >> $CLIENT_LOG 2>&1
# if [ $? -ne 1 ]; then
# cat $CLIENT_LOG
# echo -e "\n***\n*** Test for graphdef_busyop Failed\n***"
# RET=1
# fi
# set -e

if [ $RET -eq 0 ]; then
echo -e "\n***\n*** Test Passed\n***"
Expand All @@ -227,7 +240,7 @@ fi

# Run only if either TRITON_FROM or TRITON_TO_DL is set (condition uses ||)
if [[ ! -z "$TRITON_FROM" ]] || [[ ! -z "$TRITON_TO_DL" ]]; then
python server_memory_mail.py
python server_memory_mail.py $EMAIL_SUBJECT
fi

exit $RET