Updated baremetal DLRMv2 readme (#1910)

* Updated baremetal DLRMv2 readme Co-authored-by: Srikanth Ramakrishna <srikanth.ramakrishna@intel.com>
intel · Jun 24, 2024 · 303de0c · 303de0c
1 parent c598c81
commit 303de0c
Show file tree

Hide file tree

Showing 12 changed files with 224 additions and 73 deletions.
diff --git a/docker/max-gpu/docker-compose.yml b/docker/max-gpu/docker-compose.yml
@@ -90,10 +90,15 @@ services:
       dockerfile: docker/max-gpu/tf-3d-unet-training/tf-max-series-3d-unet-training.Dockerfile
     extends: tf-resnet50v1-5-training
     image: ${REGISTRY}/aiops/mlops-ci:b-${GITHUB_RUN_NUMBER:-0}-image-segmentation-tf-max-gpu-3d-unet-training
+  pytorch-dlrmv2-inference:
+    build:
+      dockerfile: docker/max-gpu/pytorch-dlrmv2-inference/pytorch-max-series-dlrmv2-inference.Dockerfile
+    extends: pytorch-resnet50v1-5-inference
+    image: ${REGISTRY}/aiops/mlops-ci:b-${GITHUB_RUN_NUMBER:-0}-recommendation-pytorch-max-gpu-dlrmv2-inference
   pytorch-dlrmv2-training:
     build:
       dockerfile: docker/max-gpu/pytorch-dlrmv2-training/pytorch-max-series-dlrmv2-training.Dockerfile
-    extends: pytorch-resnet50v1-5-training
+    extends: pytorch-resnet50v1-5-inference
     image: ${REGISTRY}/aiops/mlops-ci:b-${GITHUB_RUN_NUMBER:-0}-recommendation-pytorch-max-gpu-dlrmv2-training
   pytorch-distilbert-inference:
     build:

diff --git a/docker/max-gpu/pytorch-dlrmv2-inference/pytorch-max-series-dlrmv2-inference.Dockerfile b/docker/max-gpu/pytorch-dlrmv2-inference/pytorch-max-series-dlrmv2-inference.Dockerfile
@@ -0,0 +1,51 @@
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+ARG PYT_BASE_IMAGE="intel/intel-extension-for-pytorch"
+ARG PYT_BASE_TAG="2.1.10-xpu-pip-base"
+
+FROM ${PYT_BASE_IMAGE}:${PYT_BASE_TAG}
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+WORKDIR /workspace/pytorch-max-series-dlrmv2-inference/models
+
+# Install the pinned oneAPI MPI and oneCCL packages, then remove the apt
+# package lists in the same layer so they are not kept in the image.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        intel-oneapi-mpi-devel=2021.11.0-49493 \
+        intel-oneapi-ccl=2021.11.2-5 && \
+    rm -rf /var/lib/apt/lists/*
+
+# Model sources land in the WORKDIR root; the shared helpers land in ./common.
+COPY models_v2/pytorch/torchrec_dlrm/inference/gpu .
+COPY models_v2/common common
+
+# --no-cache-dir keeps pip's download cache out of the image layer.
+RUN python -m pip install --no-cache-dir -r requirements.txt
+
+# Expose the oneAPI 2021.11 MPI and CCL install locations to the runtime.
+ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/2021.11/lib/:/opt/intel/oneapi/mpi/2021.11/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/2021.11/lib:$LD_LIBRARY_PATH
+ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/2021.11/lib:/opt/intel/oneapi/ccl/2021.11/lib/
+ENV PATH=/opt/intel/oneapi/mpi/2021.11/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/2021.11/bin:$PATH
+ENV CCL_ROOT=/opt/intel/oneapi/ccl/2021.11
+ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/2021.11
+ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/2021.11/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
+
+COPY LICENSE licenses/LICENSE
+COPY third_party licenses/third_party
diff --git a/docker/max-gpu/pytorch-dlrmv2-inference/tests.yaml b/docker/max-gpu/pytorch-dlrmv2-inference/tests.yaml
@@ -0,0 +1,24 @@
+# FP16 multi-tile DLRMv2 inference run against the mounted dataset and weights.
+fp16-distributed-real-data-inference:
+  img: ${REGISTRY}/aiops/mlops-ci:b-${GITHUB_RUN_NUMBER:-0}-recommendation-pytorch-max-gpu-dlrmv2-inference
+  cmd: bash run_model.sh
+  ipc: host
+  device: /dev/dri
+  env:
+    PLATFORM: Max
+    MULTI_TILE: 'True'
+    PRECISION: FP16
+    BATCH_SIZE: '65536'
+    DATASET_DIR: /var/torchrec-dlrm-v2
+    WEIGHT_DIR: /var/torchrec-dlrm-v2-weights
+    OUTPUT_DIR: /tmp
+  # Each host path is mounted at the same path inside the container.
+  volumes:
+    - src: /var/torchrec-dlrm-v2
+      dst: /var/torchrec-dlrm-v2
+    - src: /dev/dri
+      dst: /dev/dri
+    - src: /var/torchrec-dlrm-v2-weights
+      dst: /var/torchrec-dlrm-v2-weights
+    - src: /tmp
+      dst: /tmp
diff --git a/docs/general/MAX_DEVCATALOG.md b/docs/general/MAX_DEVCATALOG.md
@@ -13,23 +13,23 @@ This document provides links to step-by-step instructions on how to leverage ref
 
 The table below provides links to run each workload in a docker container. The containers were validated on a host running Linux*.
 
-| Model                            | Framework                  | Mode | Precisions | 
+| Model                            | Framework                  | Mode | Precisions |
 | ----------------------------|     ---------- | ------------------- | ------------ |
 | [3D-UNet](https://arxiv.org/abs/1606.06650) | TensorFlow | [Training](../../models_v2/tensorflow/3d_unet/training/gpu/CONTAINER.md) | BF16 |
-| [BERT Large](https://arxiv.org/pdf/1810.04805.pdf)                                           | PyTorch | [Inference](../../models_v2/pytorch/bert_large/inference/gpu/CONTAINER.md) | FP16, BF16 and FP32 | 
+| [BERT Large](https://arxiv.org/pdf/1810.04805.pdf)                                           | PyTorch | [Inference](../../models_v2/pytorch/bert_large/inference/gpu/CONTAINER.md) | FP16, BF16 and FP32 |
 | [BERT Large](https://arxiv.org/pdf/1810.04805.pdf)                                           | PyTorch | [Training](../../models_v2/pytorch/bert_large/training/gpu/CONTAINER.md) | BF16,TF32 and FP32 |
 | [BERT Large](https://arxiv.org/pdf/1810.04805.pdf)                                           | TensorFlow | [Training](../../models_v2/tensorflow/bert_large/training/gpu/CONTAINER.md) | BF16 |
 | [DistilBERT](https://arxiv.org/abs/1910.01108) | PyTorch | [Inference](../../models_v2/pytorch/distilbert/inference/gpu/CONTAINER_MAX.md) | FP16,BF16 and FP32 |
-| [DLRM](https://arxiv.org/abs/1906.00091) | PyTorch | [Inference](../../quickstart/recommendation/pytorch/torchrec_dlrm/inference/gpu/DEVCATALOG.md) | FP16 | 
-| [DLRM](https://arxiv.org/abs/1906.00091) | PyTorch | [Training](../../models_v2/pytorch/torchrec_dlrm/training/gpu/CONTAINER.md) | FP32,TF32 and BF16 | 
+| [DLRM](https://arxiv.org/abs/1906.00091) | PyTorch | [Inference](../../quickstart/recommendation/pytorch/torchrec_dlrm/inference/gpu/DEVCATALOG.md) | FP16 |
+| [DLRM](https://arxiv.org/abs/1906.00091) | PyTorch | [Training](../../models_v2/pytorch/torchrec_dlrm/training/gpu/CONTAINER.md) | FP32,TF32 and BF16 |
 | [Mask R-CNN](https://arxiv.org/abs/1703.06870) | TensorFlow | [Training](../../models_v2/tensorflow/maskrcnn/training/gpu/CONTAINER.md) | BF16 |
-| [ResNet50 v1.5](https://arxiv.org/pdf/1512.03385.pdf) | PyTorch | [Inference](../../models_v2/pytorch/resnet50v1_5/inference/gpu/CONTAINER_MAX.md) | INT8,FP16,BF16,FP32 and TF32 | 
-| [ResNet50 v1.5](https://arxiv.org/pdf/1512.03385.pdf) | PyTorch | [Training](../../models_v2/pytorch/resnet50v1_5/training/gpu/CONTAINER.md) | BF16,FP32 and TF32 | 
-| [ResNet50 v1.5](https://arxiv.org/pdf/1512.03385.pdf) | TensorFlow | [Training](../../models_v2/tensorflow/resnet50v1_5/training/gpu/CONTAINER.md) |  BF16 | 
+| [ResNet50 v1.5](https://arxiv.org/pdf/1512.03385.pdf) | PyTorch | [Inference](../../models_v2/pytorch/resnet50v1_5/inference/gpu/CONTAINER_MAX.md) | INT8,FP16,BF16,FP32 and TF32 |
+| [ResNet50 v1.5](https://arxiv.org/pdf/1512.03385.pdf) | PyTorch | [Training](../../models_v2/pytorch/resnet50v1_5/training/gpu/CONTAINER.md) | BF16,FP32 and TF32 |
+| [ResNet50 v1.5](https://arxiv.org/pdf/1512.03385.pdf) | TensorFlow | [Training](../../models_v2/tensorflow/resnet50v1_5/training/gpu/CONTAINER.md) |  BF16 |
 | [RNN-T](https://arxiv.org/abs/1211.3711) | PyTorch | [Inference](../../models_v2/pytorch/rnnt/inference/gpu/CONTAINER.md) |
 | [RNN-T](https://arxiv.org/abs/1211.3711) | PyTorch | [Training](../../models_v2/pytorch/rnnt/training/gpu/CONTAINER.md) |
 | [Stable Diffusion](https://arxiv.org/abs/2112.10752) | PyTorch | [Inference](../../models_v2/pytorch/stable_diffusion/inference/gpu/CONTAINER_MAX.md) | FP16 |
 
-**Note**: 
-* DLRM(PyTorch) inference workload is supported on older Intel® Extension for TensorFlow* v2.13 and Intel® Extension for PyTorch* 2.0.110+xpu versions. 
+**Note**:
+* DLRM(PyTorch) inference, BERT-Large(TensorFlow) inference and ResNet50v1.5(TensorFlow) inference workloads are supported on older Intel® Extension for TensorFlow* v2.13 and Intel® Extension for PyTorch* 2.0.110+xpu versions.
 * The other models in the list are validated on Intel® Extension for TensorFlow* v2.14 and Intel® Extension for PyTorch* 2.1.10+xpu versions.
diff --git a/models_v2/pytorch/torchrec_dlrm/inference/gpu/README.md b/models_v2/pytorch/torchrec_dlrm/inference/gpu/README.md
@@ -11,9 +11,15 @@ DLRM v2 Inference best known configurations with Intel® Extension for PyTorch.
 # Pre-Requisite
 * Host has 4 Intel® Data Center GPU Max and two tiles for each.
 * Host has installed latest Intel® Data Center GPU Max Series Drivers https://dgpu-docs.intel.com/driver/installation.html
-* Host has installed [Intel® Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/)
+* The following Intel® oneAPI Base Toolkit components are required:
+  - Intel® oneAPI DPC++ Compiler (`DPCPPROOT` is a placeholder for its installation path)
+  - Intel® oneAPI Math Kernel Library (oneMKL) (`MKLROOT` is a placeholder for its installation path)
+  - Intel® oneAPI MPI Library
+  - Intel® oneAPI TBB Library
 
-# prepare Dataset
+  Follow the instructions at the [Intel® oneAPI Base Toolkit Download page](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html?operatingsystem=linux) to set up the package manager repository.
+
+# Prepare Dataset
 After downloading and uncompressing the [Criteo 1TB Click Logs dataset](consisting of 24 files from day 0 to day 23), process the raw tsv files into the proper format for training by running ./scripts/process_Criteo_1TB_Click_Logs_dataset.sh with necessary command line arguments.
 
 Example usage:
@@ -40,20 +46,40 @@ and the folder will be used as the parameter WEIGHT_DIR later
 ## Inference
 1. `git clone https://github.com/IntelAI/models.git`
 2. `cd models/models_v2/pytorch/torchrec_dlrm/inference/gpu`
-3. Run `setup.sh` this will install all the required dependencies & create virtual environment `venv`.
-4. Activate virtual env: `. ./venv/bin/activate`
-5. Setup required environment paramaters
+3. Create virtual environment `venv` and activate it:
+    ```
+    python3 -m venv venv
+    . ./venv/bin/activate
+    ```
+4. Run setup.sh
+    ```
+    ./setup.sh
+    ```
+5. Install the latest GPU versions of [torch, torchvision and intel_extension_for_pytorch](https://intel.github.io/intel-extension-for-pytorch/index.html#installation):
+    ```
+    python -m pip install torch==<torch_version> torchvision==<torchvision_version> intel-extension-for-pytorch==<ipex_version> --extra-index-url https://pytorch-extension.intel.com/release-whl-aitools/
+    ```
+6. Set environment variables for Intel® oneAPI Base Toolkit: 
+    Default installation location `{ONEAPI_ROOT}` is `/opt/intel/oneapi` for root account, `${HOME}/intel/oneapi` for other accounts
+    ```bash
+    source {ONEAPI_ROOT}/compiler/latest/env/vars.sh
+    source {ONEAPI_ROOT}/mkl/latest/env/vars.sh
+    source {ONEAPI_ROOT}/tbb/latest/env/vars.sh
+    source {ONEAPI_ROOT}/mpi/latest/env/vars.sh
+    source {ONEAPI_ROOT}/ccl/latest/env/vars.sh
+    ```
+7. Set up the required environment parameters:
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
-| **MULTI_TILE**               | `export MULTI_TILE=True` (True or False)                                             |
-| **PLATFORM**                 | `export PLATFORM=PVC` (PVC)                                                 |
+| **MULTI_TILE**               | `export MULTI_TILE=True` (True)                                             |
+| **PLATFORM**                 | `export PLATFORM=Max` (Max)                                                 |
 | **WEIGHT_DIR**               | `export WEIGHT_DIR=`                                                                 |
 | **DATASET_DIR**              |                               `export DATASET_DIR=`                                  |
 | **BATCH_SIZE** (optional)    |                               `export BATCH_SIZE=32768`                              |
-| **PRECISION** (optional)     |        `export PRECISION=FP16` (FP16 and FP32 are supported for PVC)                 |
+| **PRECISION** (optional)     |        `export PRECISION=FP16` (FP16 and FP32 are supported for Max)                 |
 | **OUTPUT_DIR** (optional)    |                               `export OUTPUT_DIR=$PWD`                               |
-6. Run `run_model.sh`
+8. Run `run_model.sh`
 
 ## Output
 

diff --git a/models_v2/pytorch/torchrec_dlrm/inference/gpu/ddp-dlrm-terabyte.py b/models_v2/pytorch/torchrec_dlrm/inference/gpu/ddp-dlrm-terabyte.py
@@ -0,0 +1,44 @@
+#
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Consts for ddp-dlrm-terabyte inference."""
+
+# NOTE(review): the keys in these dicts are presumably consumed by the shared
+# result parser, which is not visible here; confirm their semantics there.
+
+# Accuracy: captures the AUROC value reported for the test set.
+ACC = {
+    "type": "total",
+    # "\." matches only a literal decimal point; a bare "." matches any character.
+    "pattern": r"AUROC over test set: (\d+\.\d+)",
+    "unit": "AUROC",
+    "number_of_partials": 2,
+}
+
+
+# Performance: captures the average eval time per iteration logged at ITER 45.
+PERF = {
+    "type": "max",
+    "pattern": r"avg eval time per iter at ITER: 45, (\d+\.\d+) s",
+    "inverse": True,
+    "multiply": False,
+    "use_batch_size": True,
+    "unit": "samples/s",
+}
+
+# Functional check: matches the same AUROC line as ACC.
+FUNCTIONAL = {
+    "pattern": r"AUROC over test set: (\d+\.\d+)",
+}
diff --git a/models_v2/pytorch/torchrec_dlrm/inference/gpu/requirements.txt b/models_v2/pytorch/torchrec_dlrm/inference/gpu/requirements.txt
@@ -2,3 +2,4 @@ fbgemm-gpu==0.3.2
 torchmetrics==0.11.0
 torchrec==0.3.2
 torchsnapshot
+typing-extensions!=4.7.0
diff --git a/models_v2/pytorch/torchrec_dlrm/inference/gpu/run_model.sh b/models_v2/pytorch/torchrec_dlrm/inference/gpu/run_model.sh
@@ -25,6 +25,7 @@ input_envs[DATASET_DIR]=${DATASET_DIR}
 input_envs[WEIGHT_DIR]=${WEIGHT_DIR}
 input_envs[MULTI_TILE]=${MULTI_TILE}
 input_envs[PLATFORM]=${PLATFORM}
+input_envs[OUTPUT_DIR]=${OUTPUT_DIR}
 
 for i in "${!input_envs[@]}"; do
   var_name=$i
@@ -36,33 +37,25 @@ for i in "${!input_envs[@]}"; do
  fi
 done
 
-OUTPUT_DIR=${OUTPUT_DIR:-$PWD}
-
-if [[ "${PLATFORM}" == "PVC" ]]; then
+if [[ "${PLATFORM}" == "Max" ]]; then
     BATCH_SIZE=${BATCH_SIZE:-65536}
     PRECISION=${PRECISION:-FP16}
-elif [[ "${PLATFORM}" == "ATS-M" ]]; then
-    echo "Only support PVC for platform"
+elif [[ "${PLATFORM}" == "Flex" ]]; then
+    # Flex is rejected: stop here instead of continuing without the defaults above.
+    echo "Only support Max for platform"
+    exit 1
 fi
 
-
-
-if [[ -z "${DATASET_DIR}" ]]; then
-  echo "Using Dummy data since environment variable DATASET_DIR has not been set"
-  DATASET_DIR="--dummy"
-else
-  if [[ ! -d "${DATASET_DIR}" ]]; then
+if [[ ! -d "${DATASET_DIR}" ]]; then
     echo "The DATASET_DIR '${DATASET_DIR}' does not exist"
     exit 1
-  fi
 fi
 
 # known issue
 if [[ "${MULTI_TILE}" == "True" ]]; then
     export ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE
 fi
 
-
 echo 'Running with parameters:'
 echo " PLATFORM: ${PLATFORM}"
 echo " WEIGHT_DIR: ${WEIGHT_DIR}"
@@ -101,18 +92,19 @@ sum_log_analysis() {
     cat ${1}"_t0.log" ${1}"_t1.log" |grep "Error" |awk '{if(a[$1]){a[$1]=a[$1]";"$2}else{a[$1]=$2}}END{for(i in a)print $1" " a[i]}' >> $2
 }
 
-modelname=dlrm-terabyte
+modelname=ddp-dlrm-terabyte
 if [[ ${MULTI_TILE} == "False" ]]; then
 	echo -e "do not support MULTI_TILE=False"
 	exit 1
 else
-    rm ${OUTPUT_DIR}/ddp-${modelname}_${PRECISION}_inf_raw.log
-    bash cmd_distributed_terabyte_test.sh -d ${DATASET_DIR} -m ${WEIGHT_DIR} ${flag} 2>&1 | tee ${OUTPUT_DIR}/ddp-${modelname}_${PRECISION}_inf_raw.log
-    python ../../../../../models/common/pytorch/parse_result.py -t ddp -l ${OUTPUT_DIR}/ddp-${modelname}_${PRECISION}_inf_raw.log -b ${BATCH_SIZE}
-    throughput=$(cat ${OUTPUT_DIR}/ddp-${modelname}_${PRECISION}_inf.log | grep "Sum Performance" | awk -F ' ' '{print $3}')
-    throughput_unit=$(cat ${OUTPUT_DIR}/ddp-${modelname}_${PRECISION}_inf.log | grep "Sum Performance" | awk -F ' ' '{print $4}')
-    acc=$(cat ${OUTPUT_DIR}/ddp-${modelname}_${PRECISION}_inf.log | grep Accuracy | awk -F ' ' '{print $3}')
-    acc_unit=$(cat ${OUTPUT_DIR}/ddp-${modelname}_${PRECISION}_inf.log | grep Accuracy | awk -F ' ' '{print $2}')
+    # -f: a missing raw log from an earlier run is not an error. Paths are quoted so OUTPUT_DIR may contain spaces.
+    rm -f "${OUTPUT_DIR}/${modelname}_${PRECISION}_inf_raw.log"
+    bash cmd_distributed_terabyte_test.sh -d ${DATASET_DIR} -m ${WEIGHT_DIR} ${flag} 2>&1 | tee "${OUTPUT_DIR}/${modelname}_${PRECISION}_inf_raw.log"
+    python common/parse_result.py -m $modelname --ddp -l "${OUTPUT_DIR}/${modelname}_${PRECISION}_inf_raw.log" -b ${BATCH_SIZE}
+    throughput=$(cat "${OUTPUT_DIR}/${modelname}_${PRECISION}_inf.log" | grep "Sum Performance" | awk -F ' ' '{print $3}')
+    throughput_unit=$(cat "${OUTPUT_DIR}/${modelname}_${PRECISION}_inf.log" | grep "Sum Performance" | awk -F ' ' '{print $4}')
+    acc=$(cat "${OUTPUT_DIR}/${modelname}_${PRECISION}_inf.log" | grep Accuracy | awk -F ' ' '{print $3}')
+    acc_unit=$(cat "${OUTPUT_DIR}/${modelname}_${PRECISION}_inf.log" | grep Accuracy | awk -F ' ' '{print $2}')
 fi
 
 yaml_content=$(cat <<EOF
@@ -127,5 +118,5 @@ EOF
 )
 
 # Write the content to a YAML file
-echo "$yaml_content" >  ./results.yaml
+echo "$yaml_content" > "${OUTPUT_DIR}/results.yaml"
 echo "YAML file created."
diff --git a/models_v2/pytorch/torchrec_dlrm/inference/gpu/setup.sh b/models_v2/pytorch/torchrec_dlrm/inference/gpu/setup.sh
@@ -22,6 +22,7 @@
 
 set -e
 apt-get update && apt-get install -y python3-venv protobuf-compiler
-python3 -m venv $PWD/venv
-. ./venv/bin/activate
+
 pip install -r requirements.txt
+
+cp -r ../../../../common .