Skip to content

Commit

Permalink
Updated baremetal DLRMv2 readme (#1910)
Browse files Browse the repository at this point in the history
* Updated baremetal DLRMv2 readme

Co-authored-by: Srikanth Ramakrishna <srikanth.ramakrishna@intel.com>
  • Loading branch information
Mahathi-Vatsal and sramakintel committed Jun 24, 2024
1 parent c598c81 commit 303de0c
Show file tree
Hide file tree
Showing 12 changed files with 224 additions and 73 deletions.
7 changes: 6 additions & 1 deletion docker/max-gpu/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,15 @@ services:
dockerfile: docker/max-gpu/tf-3d-unet-training/tf-max-series-3d-unet-training.Dockerfile
extends: tf-resnet50v1-5-training
image: ${REGISTRY}/aiops/mlops-ci:b-${GITHUB_RUN_NUMBER:-0}-image-segmentation-tf-max-gpu-3d-unet-training
pytorch-dlrmv2-inference:
build:
dockerfile: docker/max-gpu/pytorch-dlrmv2-inference/pytorch-max-series-dlrmv2-inference.Dockerfile
extends: pytorch-resnet50v1-5-inference
image: ${REGISTRY}/aiops/mlops-ci:b-${GITHUB_RUN_NUMBER:-0}-recommendation-pytorch-max-gpu-dlrmv2-inference
pytorch-dlrmv2-training:
build:
dockerfile: docker/max-gpu/pytorch-dlrmv2-training/pytorch-max-series-dlrmv2-training.Dockerfile
extends: pytorch-resnet50v1-5-training
extends: pytorch-resnet50v1-5-inference
image: ${REGISTRY}/aiops/mlops-ci:b-${GITHUB_RUN_NUMBER:-0}-recommendation-pytorch-max-gpu-dlrmv2-training
pytorch-distilbert-inference:
build:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

# Container image for the DLRMv2 PyTorch inference workload.
# The base image name and tag are build arguments so they can be overridden
# with --build-arg without editing this file.
ARG PYT_BASE_IMAGE="intel/intel-extension-for-pytorch"
ARG PYT_BASE_TAG="2.1.10-xpu-pip-base"

FROM ${PYT_BASE_IMAGE}:${PYT_BASE_TAG}

# Keep apt from prompting during the package installation below.
ENV DEBIAN_FRONTEND=noninteractive

WORKDIR /workspace/pytorch-max-series-dlrmv2-inference/models

# Install the pinned oneAPI MPI and oneCCL packages, then remove the apt
# package lists in the same layer so they do not add to the image size.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        intel-oneapi-mpi-devel=2021.11.0-49493 \
        intel-oneapi-ccl=2021.11.2-5 && \
    rm -rf /var/lib/apt/lists/*

# Model scripts go into the working directory; the shared helpers are copied
# into a local "common" directory next to them.
COPY models_v2/pytorch/torchrec_dlrm/inference/gpu .
COPY models_v2/common common

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN python -m pip install --no-cache-dir -r requirements.txt

# Expose the oneCCL / MPI 2021.11 installation to the dynamic loader, the
# linker and the shell PATH.
ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/2021.11/lib/:/opt/intel/oneapi/mpi/2021.11/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/2021.11/lib:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/2021.11/lib:/opt/intel/oneapi/ccl/2021.11/lib/
ENV PATH=/opt/intel/oneapi/mpi/2021.11/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/2021.11/bin:$PATH
ENV CCL_ROOT=/opt/intel/oneapi/ccl/2021.11
ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/2021.11
ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/2021.11/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric

# Ship the license texts inside the image.
COPY LICENSE licenses/LICENSE
COPY third_party licenses/third_party
22 changes: 22 additions & 0 deletions docker/max-gpu/pytorch-dlrmv2-inference/tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Test case: runs run_model.sh in the DLRMv2 inference image with FP16
# precision and MULTI_TILE enabled, reading the dataset and weights from
# the mounted directories.
fp16-distributed-real-data-inference:
  img: ${REGISTRY}/aiops/mlops-ci:b-${GITHUB_RUN_NUMBER:-0}-recommendation-pytorch-max-gpu-dlrmv2-inference
  cmd: bash run_model.sh
  ipc: host
  device: /dev/dri
  # Environment passed to run_model.sh; every value is kept a string.
  env:
    PRECISION: "FP16"
    BATCH_SIZE: "65536"
    OUTPUT_DIR: "/tmp"
    MULTI_TILE: "True"
    PLATFORM: "Max"
    DATASET_DIR: "/var/torchrec-dlrm-v2"
    WEIGHT_DIR: "/var/torchrec-dlrm-v2-weights"
  # Each entry maps src to the same path as dst (presumably host path to
  # container path -- confirm against the test runner's schema).
  volumes:
    - src: "/var/torchrec-dlrm-v2"
      dst: "/var/torchrec-dlrm-v2"
    - src: "/dev/dri"
      dst: "/dev/dri"
    - src: "/var/torchrec-dlrm-v2-weights"
      dst: "/var/torchrec-dlrm-v2-weights"
    - src: "/tmp"
      dst: "/tmp"
18 changes: 9 additions & 9 deletions docs/general/MAX_DEVCATALOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,23 @@ This document provides links to step-by-step instructions on how to leverage ref

The table below provides links to run each workload in a docker container. The containers were validated on a host running Linux*.

| Model | Framework | Mode | Precisions |
| Model | Framework | Mode | Precisions |
| ----------------------------| ---------- | ------------------- | ------------ |
| [3D-UNet](https://arxiv.org/abs/1606.06650) | TensorFlow | [Training](../../models_v2/tensorflow/3d_unet/training/gpu/CONTAINER.md) | BF16 |
| [BERT Large](https://arxiv.org/pdf/1810.04805.pdf) | PyTorch | [Inference](../../models_v2/pytorch/bert_large/inference/gpu/CONTAINER.md) | FP16, BF16 and FP32 |
| [BERT Large](https://arxiv.org/pdf/1810.04805.pdf) | PyTorch | [Inference](../../models_v2/pytorch/bert_large/inference/gpu/CONTAINER.md) | FP16, BF16 and FP32 |
| [BERT Large](https://arxiv.org/pdf/1810.04805.pdf) | PyTorch | [Training](../../models_v2/pytorch/bert_large/training/gpu/CONTAINER.md) | BF16,TF32 and FP32 |
| [BERT Large](https://arxiv.org/pdf/1810.04805.pdf) | TensorFlow | [Training](../../models_v2/tensorflow/bert_large/training/gpu/CONTAINER.md) | BF16 |
| [DistilBERT](https://arxiv.org/abs/1910.01108) | PyTorch | [Inference](../../models_v2/pytorch/distilbert/inference/gpu/CONTAINER_MAX.md) | FP16,BF16 and FP32 |
| [DLRM](https://arxiv.org/abs/1906.00091) | PyTorch | [Inference](../../quickstart/recommendation/pytorch/torchrec_dlrm/inference/gpu/DEVCATALOG.md) | FP16 |
| [DLRM](https://arxiv.org/abs/1906.00091) | PyTorch | [Training](../../models_v2/pytorch/torchrec_dlrm/training/gpu/CONTAINER.md) | FP32,TF32 and BF16 |
| [DLRM](https://arxiv.org/abs/1906.00091) | PyTorch | [Inference](../../quickstart/recommendation/pytorch/torchrec_dlrm/inference/gpu/DEVCATALOG.md) | FP16 |
| [DLRM](https://arxiv.org/abs/1906.00091) | PyTorch | [Training](../../models_v2/pytorch/torchrec_dlrm/training/gpu/CONTAINER.md) | FP32,TF32 and BF16 |
| [Mask R-CNN](https://arxiv.org/abs/1703.06870) | TensorFlow | [Training](../../models_v2/tensorflow/maskrcnn/training/gpu/CONTAINER.md) | BF16 |
| [ResNet50 v1.5](https://arxiv.org/pdf/1512.03385.pdf) | PyTorch | [Inference](../../models_v2/pytorch/resnet50v1_5/inference/gpu/CONTAINER_MAX.md) | INT8,FP16,BF16,FP32 and TF32 |
| [ResNet50 v1.5](https://arxiv.org/pdf/1512.03385.pdf) | PyTorch | [Training](../../models_v2/pytorch/resnet50v1_5/training/gpu/CONTAINER.md) | BF16,FP32 and TF32 |
| [ResNet50 v1.5](https://arxiv.org/pdf/1512.03385.pdf) | TensorFlow | [Training](../../models_v2/tensorflow/resnet50v1_5/training/gpu/CONTAINER.md) | BF16 |
| [ResNet50 v1.5](https://arxiv.org/pdf/1512.03385.pdf) | PyTorch | [Inference](../../models_v2/pytorch/resnet50v1_5/inference/gpu/CONTAINER_MAX.md) | INT8,FP16,BF16,FP32 and TF32 |
| [ResNet50 v1.5](https://arxiv.org/pdf/1512.03385.pdf) | PyTorch | [Training](../../models_v2/pytorch/resnet50v1_5/training/gpu/CONTAINER.md) | BF16,FP32 and TF32 |
| [ResNet50 v1.5](https://arxiv.org/pdf/1512.03385.pdf) | TensorFlow | [Training](../../models_v2/tensorflow/resnet50v1_5/training/gpu/CONTAINER.md) | BF16 |
| [RNN-T](https://arxiv.org/abs/1211.3711) | PyTorch | [Inference](../../models_v2/pytorch/rnnt/inference/gpu/CONTAINER.md) |
| [RNN-T](https://arxiv.org/abs/1211.3711) | PyTorch | [Training](../../models_v2/pytorch/rnnt/training/gpu/CONTAINER.md) |
| [Stable Diffusion](https://arxiv.org/abs/2112.10752) | PyTorch | [Inference](../../models_v2/pytorch/stable_diffusion/inference/gpu/CONTAINER_MAX.md) | FP16 |

**Note**:
* DLRM(PyTorch) inference workload is supported on older Intel® Extension for TensorFlow* v2.13 and Intel® Extension for PyTorch* 2.0.110+xpu versions.
**Note**:
* DLRM(PyTorch) inference, BERT-Large(TensorFlow) inference and ResNet50v1.5(TensorFlow) inference workloads are supported on older Intel® Extension for TensorFlow* v2.13 and Intel® Extension for PyTorch* 2.0.110+xpu versions.
* The other models in the list are validated on Intel® Extension for TensorFlow* v2.14 and Intel® Extension for PyTorch* 2.1.10+xpu versions.
44 changes: 35 additions & 9 deletions models_v2/pytorch/torchrec_dlrm/inference/gpu/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,15 @@ DLRM v2 Inference best known configurations with Intel® Extension for PyTorch.
# Pre-Requisite
* Host has 4 Intel® Data Center GPU Max devices, each with two tiles.
* Host has installed latest Intel® Data Center GPU Max Series Drivers https://dgpu-docs.intel.com/driver/installation.html
* Host has installed [Intel® Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/)
* The following Intel® oneAPI Base Toolkit components are required:
- Intel® oneAPI DPC++ Compiler (Placeholder DPCPPROOT as its installation path)
- Intel® oneAPI Math Kernel Library (oneMKL) (Placeholder MKLROOT as its installation path)
- Intel® oneAPI MPI Library
- Intel® oneAPI TBB Library

# prepare Dataset
Follow instructions at [Intel® oneAPI Base Toolkit Download page](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html?operatingsystem=linux) to setup the package manager repository.

# Prepare Dataset
After downloading and uncompressing the Criteo 1TB Click Logs dataset (consisting of 24 files, from day 0 to day 23), process the raw tsv files into the proper format for training by running `./scripts/process_Criteo_1TB_Click_Logs_dataset.sh` with the necessary command line arguments.

Example usage:
Expand All @@ -40,20 +46,40 @@ and the folder will be used as the parameter WEIGHT_DIR later
## Inference
1. `git clone https://github.com/IntelAI/models.git`
2. `cd models/models_v2/pytorch/torchrec_dlrm/inference/gpu`
3. Run `setup.sh` this will install all the required dependencies & create virtual environment `venv`.
4. Activate virtual env: `. ./venv/bin/activate`
5. Setup required environment paramaters
3. Create virtual environment `venv` and activate it:
```
python3 -m venv venv
. ./venv/bin/activate
```
4. Run setup.sh
```
./setup.sh
```
5. Install the latest GPU versions of [torch, torchvision and intel_extension_for_pytorch](https://intel.github.io/intel-extension-for-pytorch/index.html#installation):
```
python -m pip install torch==<torch_version> torchvision==<torchvision_version> intel-extension-for-pytorch==<ipex_version> --extra-index-url https://pytorch-extension.intel.com/release-whl-aitools/
```
6. Set environment variables for Intel® oneAPI Base Toolkit:
Default installation location `{ONEAPI_ROOT}` is `/opt/intel/oneapi` for root account, `${HOME}/intel/oneapi` for other accounts
```bash
source {ONEAPI_ROOT}/compiler/latest/env/vars.sh
source {ONEAPI_ROOT}/mkl/latest/env/vars.sh
source {ONEAPI_ROOT}/tbb/latest/env/vars.sh
source {ONEAPI_ROOT}/mpi/latest/env/vars.sh
source {ONEAPI_ROOT}/ccl/latest/env/vars.sh
```
7. Set up the required environment parameters

| **Parameter** | **export command** |
|:---------------------------:|:------------------------------------------------------------------------------------:|
| **MULTI_TILE** | `export MULTI_TILE=True` (True or False) |
| **PLATFORM** | `export PLATFORM=PVC` (PVC) |
| **MULTI_TILE** | `export MULTI_TILE=True` (True) |
| **PLATFORM** | `export PLATFORM=Max` (Max) |
| **WEIGHT_DIR** | `export WEIGHT_DIR=` |
| **DATASET_DIR** | `export DATASET_DIR=` |
| **BATCH_SIZE** (optional) | `export BATCH_SIZE=32768` |
| **PRECISION** (optional) | `export PRECISION=FP16` (FP16 and FP32 are supported for PVC) |
| **PRECISION** (optional) | `export PRECISION=FP16` (FP16 and FP32 are supported for Max) |
| **OUTPUT_DIR** (optional) | `export OUTPUT_DIR=$PWD` |
6. Run `run_model.sh`
8. Run `run_model.sh`

## Output

Expand Down
37 changes: 37 additions & 0 deletions models_v2/pytorch/torchrec_dlrm/inference/gpu/ddp-dlrm-terabyte.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Consts for ddp-dlrm-terabyte inference."""

# Accuracy-extraction settings for the ddp-dlrm-terabyte inference log.
# NOTE(review): presumably consumed by common/parse_result.py (run_model.sh
# passes "-m ddp-dlrm-terabyte"); the meaning of "type" and
# "number_of_partials" is defined by that consumer -- confirm there.
ACC = {
    "type": "total",
    # Capture group 1 is the decimal AUROC value. The dot is escaped so only
    # a literal decimal point matches, not any character.
    "pattern": r"AUROC over test set: (\d+\.\d+)",
    "unit": "AUROC",
    "number_of_partials": 2,
}


# Performance-extraction settings for the ddp-dlrm-terabyte inference log.
# NOTE(review): how "type", "inverse", "multiply" and "use_batch_size" are
# combined into the reported "samples/s" figure is decided by the consumer
# (presumably common/parse_result.py) -- confirm there.
PERF = {
    "type": "max",
    # Capture group 1 is the per-iteration eval time in seconds as logged at
    # iteration 45. The dot is escaped so only a literal decimal point
    # matches, not any character.
    "pattern": r"avg eval time per iter at ITER: 45, (\d+\.\d+) s",
    "inverse": True,
    "multiply": False,
    "use_batch_size": True,
    "unit": "samples/s",
}

# Functional-check settings: the pattern looks for the AUROC line in the log.
# NOTE(review): presumably a match means the run completed -- confirm against
# the consumer of this dict.
FUNCTIONAL = {
    # The dot is escaped so only a literal decimal point matches.
    "pattern": r"AUROC over test set: (\d+\.\d+)",
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ fbgemm-gpu==0.3.2
torchmetrics==0.11.0
torchrec==0.3.2
torchsnapshot
typing-extensions!=4.7.0
37 changes: 14 additions & 23 deletions models_v2/pytorch/torchrec_dlrm/inference/gpu/run_model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ input_envs[DATASET_DIR]=${DATASET_DIR}
input_envs[WEIGHT_DIR]=${WEIGHT_DIR}
input_envs[MULTI_TILE]=${MULTI_TILE}
input_envs[PLATFORM]=${PLATFORM}
input_envs[OUTPUT_DIR]=${OUTPUT_DIR}

for i in "${!input_envs[@]}"; do
var_name=$i
Expand All @@ -36,33 +37,23 @@ for i in "${!input_envs[@]}"; do
fi
done

OUTPUT_DIR=${OUTPUT_DIR:-$PWD}

if [[ "${PLATFORM}" == "PVC" ]]; then
if [[ "${PLATFORM}" == "Max" ]]; then
BATCH_SIZE=${BATCH_SIZE:-65536}
PRECISION=${PRECISION:-FP16}
elif [[ "${PLATFORM}" == "ATS-M" ]]; then
echo "Only support PVC for platform"
elif [[ "${PLATFORM}" == "Flex" ]]; then
echo "Only support Max for platform"
fi



if [[ -z "${DATASET_DIR}" ]]; then
echo "Using Dummy data since environment variable DATASET_DIR has not been set"
DATASET_DIR="--dummy"
else
if [[ ! -d "${DATASET_DIR}" ]]; then
if [[ ! -d "${DATASET_DIR}" ]]; then
echo "The DATASET_DIR '${DATASET_DIR}' does not exist"
exit 1
fi
fi

# known issue
if [[ "${MULTI_TILE}" == "True" ]]; then
export ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE
fi


echo 'Running with parameters:'
echo " PLATFORM: ${PLATFORM}"
echo " WEIGHT_DIR: ${WEIGHT_DIR}"
Expand Down Expand Up @@ -101,18 +92,18 @@ sum_log_analysis() {
cat ${1}"_t0.log" ${1}"_t1.log" |grep "Error" |awk '{if(a[$1]){a[$1]=a[$1]";"$2}else{a[$1]=$2}}END{for(i in a)print $1" " a[i]}' >> $2
}

modelname=dlrm-terabyte
modelname=ddp-dlrm-terabyte
if [[ ${MULTI_TILE} == "False" ]]; then
echo -e "do not support MULTI_TILE=False"
exit 1
else
rm ${OUTPUT_DIR}/ddp-${modelname}_${PRECISION}_inf_raw.log
bash cmd_distributed_terabyte_test.sh -d ${DATASET_DIR} -m ${WEIGHT_DIR} ${flag} 2>&1 | tee ${OUTPUT_DIR}/ddp-${modelname}_${PRECISION}_inf_raw.log
python ../../../../../models/common/pytorch/parse_result.py -t ddp -l ${OUTPUT_DIR}/ddp-${modelname}_${PRECISION}_inf_raw.log -b ${BATCH_SIZE}
throughput=$(cat ${OUTPUT_DIR}/ddp-${modelname}_${PRECISION}_inf.log | grep "Sum Performance" | awk -F ' ' '{print $3}')
throughput_unit=$(cat ${OUTPUT_DIR}/ddp-${modelname}_${PRECISION}_inf.log | grep "Sum Performance" | awk -F ' ' '{print $4}')
acc=$(cat ${OUTPUT_DIR}/ddp-${modelname}_${PRECISION}_inf.log | grep Accuracy | awk -F ' ' '{print $3}')
acc_unit=$(cat ${OUTPUT_DIR}/ddp-${modelname}_${PRECISION}_inf.log | grep Accuracy | awk -F ' ' '{print $2}')
rm ${OUTPUT_DIR}/${modelname}_${PRECISION}_inf_raw.log
bash cmd_distributed_terabyte_test.sh -d ${DATASET_DIR} -m ${WEIGHT_DIR} ${flag} 2>&1 | tee ${OUTPUT_DIR}/${modelname}_${PRECISION}_inf_raw.log
python common/parse_result.py -m $modelname --ddp -l ${OUTPUT_DIR}/${modelname}_${PRECISION}_inf_raw.log -b ${BATCH_SIZE}
throughput=$(cat ${OUTPUT_DIR}/${modelname}_${PRECISION}_inf.log | grep "Sum Performance" | awk -F ' ' '{print $3}')
throughput_unit=$(cat ${OUTPUT_DIR}/${modelname}_${PRECISION}_inf.log | grep "Sum Performance" | awk -F ' ' '{print $4}')
acc=$(cat ${OUTPUT_DIR}/${modelname}_${PRECISION}_inf.log | grep Accuracy | awk -F ' ' '{print $3}')
acc_unit=$(cat ${OUTPUT_DIR}/${modelname}_${PRECISION}_inf.log | grep Accuracy | awk -F ' ' '{print $2}')
fi

yaml_content=$(cat <<EOF
Expand All @@ -127,5 +118,5 @@ EOF
)

# Write the content to a YAML file
echo "$yaml_content" > ./results.yaml
echo "$yaml_content" > ${OUTPUT_DIR}/results.yaml
echo "YAML file created."
5 changes: 3 additions & 2 deletions models_v2/pytorch/torchrec_dlrm/inference/gpu/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

set -e
apt-get update && apt-get install -y python3-venv protobuf-compiler
python3 -m venv $PWD/venv
. ./venv/bin/activate

pip install -r requirements.txt

cp -r ../../../../common .
Loading

0 comments on commit 303de0c

Please sign in to comment.