Skip to content

Commit f0b089e

Browse files
authored
Merge pull request #94 from kusumachalasani/aiworkloads
Include accelerate benchmarks
2 parents 31264f6 + 7e0f3d5 commit f0b089e

File tree

10 files changed

+421
-192
lines changed

10 files changed

+421
-192
lines changed

common/common_helper.sh

Lines changed: 127 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -292,21 +292,56 @@ function prometheus_install() {
292292
###########################################
293293
function benchmarks_install() {
294294
NAMESPACE="${1:-default}"
295-
MANIFESTS="${2:-default_manifests}"
295+
BENCHMARK="${2:-tfb}"
296+
MANIFESTS="${3:-default_manifests}"
297+
296298
echo
297299
echo "#######################################"
298300
pushd benchmarks >/dev/null
299-
echo "5. Installing TechEmpower (Quarkus REST EASY) benchmark into cluster"
300-
pushd techempower >/dev/null
301-
# Reduce the requests to 1core-512Mi to accommodate the benchmark in resourcehub
302-
sed -i '/requests:/ {n; s/\(cpu: \)\([0-9]*\.[0-9]*\|\([0-9]*\)\)/\10.5/}' ./manifests/${MANIFESTS}/postgres.yaml
303-
sed -i '/requests:/ {n; n; s/\(memory: \)\"[^\"]*\"/\1\"512Mi\"/}' ./manifests/${MANIFESTS}/postgres.yaml
304-
sed -i '/requests:/ {n; s/\(cpu: \)\([0-9]*\.[0-9]*\|\([0-9]*\)\)/\11.5/}' ./manifests/${MANIFESTS}/quarkus-resteasy-hibernate.yaml
305-
sed -i '/requests:/ {n; n; s/\(memory: \)\"[^\"]*\"/\1\"512Mi\"/}' ./manifests/${MANIFESTS}/quarkus-resteasy-hibernate.yaml
306-
307-
kubectl apply -f manifests/${MANIFESTS} -n ${NAMESPACE}
308-
check_err "ERROR: TechEmpower app failed to start, exiting"
309-
popd >/dev/null
301+
if [ ${BENCHMARK} == "tfb" ]; then
302+
echo "5. Installing TechEmpower (Quarkus REST EASY) benchmark into cluster"
303+
pushd techempower >/dev/null
304+
# Reduce the requests to 1core-512Mi to accommodate the benchmark in resourcehub
305+
sed -i '/requests:/ {n; s/\(cpu: \)\([0-9]*\.[0-9]*\|\([0-9]*\)\)/\10.5/}' ./manifests/${MANIFESTS}/postgres.yaml
306+
sed -i '/requests:/ {n; n; s/\(memory: \)\"[^\"]*\"/\1\"512Mi\"/}' ./manifests/${MANIFESTS}/postgres.yaml
307+
sed -i '/requests:/ {n; s/\(cpu: \)\([0-9]*\.[0-9]*\|\([0-9]*\)\)/\11.5/}' ./manifests/${MANIFESTS}/quarkus-resteasy-hibernate.yaml
308+
sed -i '/requests:/ {n; n; s/\(memory: \)\"[^\"]*\"/\1\"512Mi\"/}' ./manifests/${MANIFESTS}/quarkus-resteasy-hibernate.yaml
309+
kubectl apply -f manifests/${MANIFESTS} -n ${NAMESPACE}
310+
check_err "ERROR: TechEmpower app failed to start, exiting"
311+
popd >/dev/null
312+
fi
313+
if [ ${BENCHMARK} == "human-eval" ]; then
314+
echo "#######################################"
315+
echo "Running HumanEval benchmark job in background"
316+
echo
317+
pushd human-eval-benchmark/manifests >/dev/null
318+
sed -i 's/namespace: kruize-hackathon/namespace: "'"${NAMESPACE}"'"/' pvc.yaml
319+
sed -i 's/namespace: kruize-hackathon/namespace: "'"${NAMESPACE}"'"/' job.yaml
320+
# Update num_prompts value to 150 to run the benchmark for at least 15 mins
321+
sed -i "s/value: '10'/value: '150'/" job.yaml
322+
oc apply -f pvc.yaml -n ${NAMESPACE}
323+
oc apply -f job.yaml -n ${NAMESPACE}
324+
check_err "ERROR: Human eval job failed to start, exiting"
325+
popd >/dev/null
326+
fi
327+
if [ ${BENCHMARK} == "ttm" ]; then
328+
echo "#######################################"
329+
echo "Running Training TTM benchmark job in background"
330+
pushd AI-MLbenchmarks/ttm >/dev/null
331+
echo ""
332+
./run_ttm.sh ${NAMESPACE} >> ${LOG_FILE} &
333+
check_err "ERROR: Training ttm jobs failed to start, exiting"
334+
popd >/dev/null
335+
fi
336+
if [ ${BENCHMARK} == "llm-rag" ]; then
337+
echo "#######################################"
338+
echo "Installing LLM-RAG benchmark into cluster"
339+
pushd AI-MLbenchmarks/llm-rag >/dev/null
340+
./deploy.sh ${NAMESPACE}
341+
check_err "ERROR: llm-rag benchmark failed to start, exiting"
342+
popd >/dev/null
343+
fi
344+
310345
popd >/dev/null
311346
echo "#######################################"
312347
echo
@@ -317,15 +352,35 @@ function benchmarks_install() {
317352
###########################################
318353
function benchmarks_uninstall() {
319354
NAMESPACE="${1:-default}"
355+
BENCHMARK="${1:-tfb}"
320356
MANIFESTS="${2:-default_manifests}"
321357
echo
322358
echo "#######################################"
323359
pushd benchmarks >/dev/null
324-
echo "Uninstalling TechEmpower (Quarkus REST EASY) benchmark in cluster"
325-
pushd techempower >/dev/null
326-
kubectl delete -f manifests/${MANIFESTS} -n ${NAMESPACE}
327-
check_err "ERROR: TechEmpower app failed to delete, exiting"
328-
popd >/dev/null
360+
if [ ${BENCHMARK} == "tfb" ]; then
361+
echo "Uninstalling TechEmpower (Quarkus REST EASY) benchmark in cluster"
362+
pushd techempower >/dev/null
363+
kubectl delete -f manifests/${MANIFESTS} -n ${NAMESPACE}
364+
check_err "ERROR: TechEmpower app failed to delete, exiting"
365+
popd >/dev/null
366+
fi
367+
if [ ${BENCHMARK} == "human-eval" ]; then
368+
echo "Uninstalling humanEval benchmark job in cluster"
369+
pushd human-eval-benchmark >/dev/null
370+
oc delete -f job.yaml
371+
oc delete -f pvc.yaml
372+
check_err "ERROR: human-eval benchmark failed to delete, exiting"
373+
popd >/dev/null
374+
fi
375+
if [ ${BENCHMARK} == "ttm" ] || [${BENCHMARK} == "llm-rag"]; then
376+
377+
echo "Uninstalling ${BENCHMARK} benchmark in cluster"
378+
pushd AI-MLbenchmarks/ttm >/dev/null
379+
./cleanup.sh ${NAMESPACE}
380+
check_err "ERROR: ${BENCHMARK} benchmark failed to delete, exiting"
381+
popd >/dev/null
382+
fi
383+
329384
popd >/dev/null
330385
echo "#######################################"
331386
echo
@@ -337,23 +392,39 @@ function benchmarks_uninstall() {
337392
function apply_benchmark_load() {
338393
TECHEMPOWER_LOAD_IMAGE="quay.io/kruizehub/tfb_hyperfoil_load:0.25.2"
339394
APP_NAMESPACE="${1:-default}"
340-
LOAD_DURATION="${2:-1200}"
395+
BENCHMARK="${2:-tfb}"
396+
LOAD_DURATION="${3:-1200}"
341397

342-
echo
343-
echo "################################################################################################################"
344-
echo " Starting ${LOAD_DURATION} secs background load against the techempower benchmark in ${APP_NAMESPACE} namespace "
345-
echo "################################################################################################################"
346-
echo
398+
if [ ${BENCHMARK} == "tfb" ]; then
399+
if kubectl get pods --namespace ${APP_NAMESPACE} -o jsonpath='{.items[*].metadata.name}' | grep -q "tfb"; then
400+
echo
401+
echo "################################################################################################################"
402+
echo " Starting ${LOAD_DURATION} secs background load against the techempower benchmark in ${APP_NAMESPACE} namespace "
403+
echo "################################################################################################################"
404+
echo
405+
if [ ${CLUSTER_TYPE} == "kind" ] || [ ${CLUSTER_TYPE} == "minikube" ]; then
406+
TECHEMPOWER_ROUTE=${TECHEMPOWER_URL}
407+
elif [ ${CLUSTER_TYPE} == "aks" ]; then
408+
TECHEMPOWER_ROUTE=${TECHEMPOWER_URL}
409+
elif [ ${CLUSTER_TYPE} == "openshift" ]; then
410+
TECHEMPOWER_ROUTE=$(oc get route -n ${APP_NAMESPACE} --template='{{range .items}}{{.spec.host}}{{"\n"}}{{end}}')
411+
fi
412+
# docker run -d --rm --network="host" ${TECHEMPOWER_LOAD_IMAGE} /opt/run_hyperfoil_load.sh ${TECHEMPOWER_ROUTE} <END_POINT> <DURATION> <THREADS> <CONNECTIONS>
413+
docker run -d --rm --network="host" ${TECHEMPOWER_LOAD_IMAGE} /opt/run_hyperfoil_load.sh ${TECHEMPOWER_ROUTE} queries?queries=20 ${LOAD_DURATION} 512 4096 #1024 8096
414+
fi
415+
fi
347416

348-
if [ ${CLUSTER_TYPE} == "kind" ] || [ ${CLUSTER_TYPE} == "minikube" ]; then
349-
TECHEMPOWER_ROUTE=${TECHEMPOWER_URL}
350-
elif [ ${CLUSTER_TYPE} == "aks" ]; then
351-
TECHEMPOWER_ROUTE=${TECHEMPOWER_URL}
352-
elif [ ${CLUSTER_TYPE} == "openshift" ]; then
353-
TECHEMPOWER_ROUTE=$(oc get route -n ${APP_NAMESPACE} --template='{{range .items}}{{.spec.host}}{{"\n"}}{{end}}')
417+
if [ ${BENCHMARK} == "llm-rag" ]; then
418+
if kubectl get pods --namespace ${APP_NAMESPACE} -o jsonpath='{.items[*].metadata.name}' | grep -q "llm"; then
419+
pushd benchmarks/AI-MLbenchmarks/llm-rag >/dev/null
420+
echo
421+
echo "################################################################################################################"
422+
echo " Starting background load against the llm-rag benchmark in ${APP_NAMESPACE} namespace "
423+
echo "################################################################################################################"
424+
./run_load.sh ${APP_NAMESPACE} >> ${LOG_FILE} &
425+
popd >/dev/null
426+
fi
354427
fi
355-
# docker run -d --rm --network="host" ${TECHEMPOWER_LOAD_IMAGE} /opt/run_hyperfoil_load.sh ${TECHEMPOWER_ROUTE} <END_POINT> <DURATION> <THREADS> <CONNECTIONS>
356-
docker run -d --rm --network="host" ${TECHEMPOWER_LOAD_IMAGE} /opt/run_hyperfoil_load.sh ${TECHEMPOWER_ROUTE} queries?queries=20 ${LOAD_DURATION} 512 4096 #1024 8096
357428

358429
}
359430

@@ -379,10 +450,10 @@ function check_minikube() {
379450
}
380451

381452
###########################################
382-
# Deploy TFB Benchmarks - multiple import
453+
# Create Namespace
383454
###########################################
384455
function create_namespace() {
385-
CAPP_NAMESPACE="${1:-test-multiple-import}"
456+
CAPP_NAMESPACE=$1
386457
echo
387458
echo "#########################################"
388459
if kubectl get namespace "${CAPP_NAMESPACE}" &> /dev/null; then
@@ -613,12 +684,14 @@ function get_urls() {
613684
###########################################
614685
function show_urls() {
615686
if [ ${demo} == "local" ]; then
687+
{
616688
echo
617689
echo "#######################################"
618690
echo "# Quarkus App #"
619691
echo "#######################################"
620692
echo "Info: Access techempower app at http://${TECHEMPOWER_URL}/db"
621693
echo "Info: Access techempower app metrics at http://${TECHEMPOWER_URL}/q/metrics"
694+
} >> "${LOG_FILE}" 2>&1
622695
fi
623696

624697
echo
@@ -659,6 +732,7 @@ function setup_workload() {
659732
#
660733
#
661734
function kruize_local_demo_setup() {
735+
bench=$1
662736
# Start all the installs
663737
start_time=$(get_date)
664738
echo
@@ -667,6 +741,8 @@ function kruize_local_demo_setup() {
667741
echo "#######################################"
668742
echo
669743

744+
{
745+
670746
if [ ${kruize_restart} -eq 0 ]; then
671747
clone_repos autotune
672748
clone_repos benchmarks
@@ -685,7 +761,13 @@ function kruize_local_demo_setup() {
685761
prometheus_install
686762
fi
687763
if [ ${demo} == "local" ]; then
688-
benchmarks_install
764+
create_namespace ${APP_NAMESPACE}
765+
if [ ${#EXPERIMENTS[@]} -ne 0 ]; then
766+
benchmarks_install ${APP_NAMESPACE} ${bench}
767+
fi
768+
echo ""
769+
elif [ ${demo} == "bulk" ]; then
770+
setup_workload
689771
fi
690772
fi
691773
kruize_local_patch
@@ -698,9 +780,13 @@ function kruize_local_demo_setup() {
698780

699781
get_urls
700782

783+
} >> "${LOG_FILE}" 2>&1
784+
701785
if [ ${demo} == "local" ]; then
702-
# Run the Kruize Local experiments
703786
kruize_local
787+
if [ ${#EXPERIMENTS[@]} -ne 0 ]; then
788+
kruize_local_experiments
789+
fi
704790
show_urls
705791
elif [ ${demo} == "bulk" ]; then
706792
kruize_bulk
@@ -718,26 +804,18 @@ function kruize_local_demo_setup() {
718804
function kruize_local_demo_update() {
719805
# Start all the installs
720806
start_time=$(get_date)
721-
807+
bench=$1
722808
if [ ${demo} == "local" ]; then
723809
if [ ${benchmark} -eq 1 ]; then
724-
echo
725-
echo "############################################"
726-
echo "# Deploy TFB on ${APP_NAMESPACE} "
727-
echo "############################################"
728810
echo
729811
create_namespace ${APP_NAMESPACE}
730-
benchmarks_install ${APP_NAMESPACE} "resource_provisioning_manifests"
812+
benchmarks_install ${APP_NAMESPACE} ${bench} "resource_provisioning_manifests"
731813
echo "Success! Running the benchmark in ${APP_NAMESPACE}"
732814
echo
733815
fi
734816
if [ ${benchmark_load} -eq 1 ]; then
735817
echo
736-
echo "#######################################"
737-
echo "# Apply the benchmark load #"
738-
echo "#######################################"
739-
echo
740-
apply_benchmark_load ${APP_NAMESPACE} ${LOAD_DURATION}
818+
apply_benchmark_load ${APP_NAMESPACE} ${bench} ${LOAD_DURATION}
741819
echo "Success! Running the benchmark load for ${LOAD_DURATION} seconds"
742820
echo
743821
fi
@@ -759,6 +837,7 @@ function kruize_local_demo_terminate() {
759837
echo "# Kruize Demo Terminate #"
760838
echo "#######################################"
761839
echo
840+
{
762841
if [ ${CLUSTER_TYPE} == "minikube" ]; then
763842
minikube_delete
764843
elif [ ${CLUSTER_TYPE} == "kind" ]; then
@@ -767,7 +846,7 @@ function kruize_local_demo_terminate() {
767846
kruize_uninstall
768847
fi
769848
if [ ${demo} == "local" ]; then
770-
delete_namespace "test-multiple-import"
849+
delete_namespace ${APP_NAMESPACE}
771850
elif [ ${demo} == "bulk" ]; then
772851
ns_name="tfb"
773852
count=3
@@ -781,6 +860,7 @@ function kruize_local_demo_terminate() {
781860
fi
782861
delete_repos autotune
783862
delete_repos "benchmarks"
863+
} >> "${LOG_FILE}" 2>&1
784864
end_time=$(get_date)
785865
elapsed_time=$(time_diff "${start_time}" "${end_time}")
786866
echo "Success! Kruize demo cleanup took ${elapsed_time} seconds"

monitoring/local_monitoring/ReadMe.md

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@ By default, it runs on the `Kind` cluster.
2828
```
2929

3030
```
31-
Usage: ./local_monitoring_demo.sh [-s|-t] [-c cluster-type] [-l] [-p] [-r] [-i kruize-image] [-u kruize-ui-image] [-b] [-n namespace] [-d load-duration] [-m benchmark-manifests]
31+
Usage: ./local_monitoring_demo.sh [-s|-t] [-c cluster-type] [-e recommendation_experiment] [-l] [-p] [-r] [-i kruize-image] [-u kruize-ui-image] [-b] [-n namespace] [-d load-duration] [-m benchmark-manifests]
3232
c = supports minikube, kind and openshift cluster-type
33+
e = supports container, namespace and gpu. Default - none.
3334
i = kruize image. Default - quay.io/kruize/autotune_operator:<version as in pom.xml>
3435
l = Run a load against the benchmark
3536
p = expose prometheus port
@@ -44,13 +45,23 @@ m = manifests of the benchmark
4445

4546
## Understanding the Demo
4647

47-
This demo focuses on using the TFB (TechEmpower Framework Benchmarks) benchmark to simulate different load conditions and observe how Kruize-Autotune reacts with its recommendations. Here’s a breakdown of what happens during the demo:
48-
49-
- TFB deployment in default Namespace
50-
- The TFB benchmark is initially deployed in the default namespace, comprising two key deployments
51-
- tfb-qrh: Serving as the application server.
52-
- tfb-database: Database to the server.
53-
- Load is applied to the server for 20 mins within this namespace to simulate real-world usage scenarios
48+
This demo installs Kruize and, when requested through the `-e` parameter, also installs the benchmarks.
49+
- By default, it installs kruize and provides the URL to access the kruize UI service where the user can create experiments and generate recommendations.
50+
- To use the demo benchmarks to create experiments and generate recommendations through the script, pass `-e` with one of `container`, `namespace` or `gpu`.
51+
- For container and namespace type, benchmark 'TFB' is deployed in a namespace.
52+
- For gpu type, benchmark 'human-eval' is deployed.
53+
54+
Here’s a breakdown of what happens during the demo:
55+
56+
- Deploys benchmarks in a namespace (if -e is passed)
57+
- If -e is container/namespace
58+
- The TFB benchmark is initially deployed in the namespace, comprising two key deployments
59+
- tfb-qrh: Serving as the application server.
60+
- tfb-database: Database to the server.
61+
- Load is applied to the server for 20 mins within this namespace to simulate real-world usage scenarios
62+
- If -e is gpu
63+
- The human-eval benchmark is deployed as a job in the namespace.
64+
- The job is set to run for at least 20 mins to generate the recommendations.
5465
- Install Kruize
5566
- Installs Kruize in the openshift-tuning namespace.
5667
- Metadata Collection and Experiment Creation
@@ -60,6 +71,9 @@ This demo focuses on using the TFB (TechEmpower Framework Benchmarks) benchmark
6071
- Generates Recommendations for all the experiments created.
6172

6273
## Recommendations for different load Simulations observed on Openshift
74+
75+
The TFB (TechEmpower Framework Benchmarks) benchmark is run under different load conditions; the recommendations observed from Kruize-Autotune for each condition are listed below.
76+
6377
### IDLE
6478
- Experiment: `monitor_tfb-db_benchmark`
6579
- Shows an IDLE scenario where CPU recommendations are not generated due to minimal CPU usage (less than a millicore).

0 commit comments

Comments
 (0)