Update Nemo Megatron (#138)
* Updating nemo example

This change updates the NeMo example as follows:
- Fixed a bug in the seq command that launched one process too many, causing the job to fail (the snippet below illustrates the off-by-one)
- Updated run_name to have an informative default
- Updated the base image to the latest NeMo image, 22.11
- Added comments to better document the setup
- Moved the apt install into the integrations section
- Updated a few hyperparameters to ensure a successful run
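
For context on the seq fix: `seq` is inclusive on both ends, so `seq 0 8` emits nine indices (0 through 8) on a node with only eight GPUs, spawning a ninth process with no GPU to bind to. A quick illustration:

```bash
# `seq FIRST LAST` includes both endpoints
seq 0 8 | wc -l   # 9 -> one process too many on an 8-GPU node
seq 0 7 | wc -l   # 8 -> one process per GPU, as intended
```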

* Update multi_node.yaml

* Update examples/nemo/single_node.yaml

Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com>

* Update examples/nemo/multi_node.yaml

Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com>

* Fixing linting

* Change defaults

---------

Co-authored-by: Hagay Lupesko <hagay@mosaicml.com>
Co-authored-by: Hagay Lupesko <lupesko@users.noreply.github.com>
Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com>
4 people authored Feb 9, 2023
1 parent 2fae0c8 commit 3088344
Showing 2 changed files with 36 additions and 16 deletions.
30 changes: 21 additions & 9 deletions examples/nemo/multi_node.yaml
@@ -1,20 +1,32 @@
-run_name: # run-name-here
-cluster: # mcloud-cluster-name-here
+run_name: nemo-megatron-gpt-124m-gpu-16
+cluster: r0z0 # Update with your cluster here!
 gpu_num: 16
-image: nvcr.io/nvidia/nemo:22.09
+
+# For the latest NeMo container version, see https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo
+image: nvcr.io/nvidia/nemo:22.11
 
 env_variables:
-  - key: PYTHONUNBUFFERED
-    value: '1'
-command: |
+  # Configure Python to not buffer stdout and stderr, so output shows up in console immediately
+  - key: PYTHONUNBUFFERED
+    value: '1'
+
+integrations:
+  - integration_type: apt_packages
+    # Install parallel to launch multiple processes per node with rank per process
+    packages:
+      - parallel
+
+command: |
   # getting the vocab, merge files for the tokenizer
   wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
   wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-  apt update
-  apt install -y parallel
-  seq 0 8 | parallel -u \
-  'CUDA_VISIBLE_DEVICES={} RANK=$(( $NODE_RANK * 8 + {} )) python3 examples/nlp/language_modeling/megatron_gpt_pretraining.py \
+  # Make sure to prepare and download the training data, as defined in NeMo documentation:
+  # https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/nemo_megatron/gpt/gpt_training.html
+  # Make sure to update the training dataset path below
+  seq 0 7 | parallel -u \
+  'CUDA_VISIBLE_DEVICES={} RANK=$(( $NODE_RANK * 8 + {} )) \
+  python3 examples/nlp/language_modeling/megatron_gpt_pretraining.py \
   --config-path=/workspace/nemo/examples/nlp/language_modeling/conf/ \
   --config-name=megatron_gpt_config.yaml \
   model.data.data_prefix=[1.0,/your_dataset_path_here/] \
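
The launch line deserves a note: GNU parallel substitutes `{}` with each index from `seq 0 7`, so every node starts eight processes, one per GPU, and each process derives a unique global rank from `$NODE_RANK`. A minimal sketch of the rank arithmetic (assuming `NODE_RANK` is exported into the run environment, as the command above expects; the value 1 here is hypothetical):

```bash
# Node n owns global ranks n*8 .. n*8+7 when each node has 8 GPUs
export NODE_RANK=1
seq 0 7 | parallel -u 'echo "GPU {} -> RANK=$(( NODE_RANK * 8 + {} ))"'
# prints GPU 0 -> RANK=8 ... GPU 7 -> RANK=15 (order may vary with -u)
```
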
22 changes: 15 additions & 7 deletions examples/nemo/single_node.yaml
@@ -1,16 +1,24 @@
-run_name: # run-name-here
-cluster: # mcloud-cluster-name-here
+run_name: nemo-megatron-gpt-124m-gpu-8
+cluster: r0z0 # Update with your cluster here!
 gpu_num: 8
-image: nvcr.io/nvidia/nemo:22.09
+
+# For the latest NeMo container version, see https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo
+image: nvcr.io/nvidia/nemo:22.11
 
 env_variables:
-  - key: PYTHONUNBUFFERED
-    value: '1'
-command: |
-  # getting the vocab, merge files for the tokenizer
+  # Configure Python to not buffer stdout and stderr, so output shows up in console immediately
+  - key: PYTHONUNBUFFERED
+    value: '1'
+
+command: |
+  # Getting the tokenizer vocab and merge files
   wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
   wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
+  # Make sure to prepare and download the training data, as defined in NeMo documentation:
+  # https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/nemo_megatron/gpt/gpt_training.html
+  # Make sure to update the training dataset path below
   python3 examples/nlp/language_modeling/megatron_gpt_pretraining.py \
   --config-path=/workspace/nemo/examples/nlp/language_modeling/conf/ \
   --config-name=megatron_gpt_config.yaml \
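
To try either config, set the cluster name and dataset path, then submit the YAML through the MosaicML CLI. A sketch, assuming `mcli` is installed and authenticated against your cluster:

```bash
# Submit the single-node run; use multi_node.yaml for the 16-GPU variant
mcli run -f examples/nemo/single_node.yaml
```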
