Merge pull request #77 from nebius/release/soperator
Release soperator 1.15.2
asteny authored Nov 12, 2024
2 parents b7c212a + ef5c57f commit 355430c
Showing 52 changed files with 5,400 additions and 2 deletions.
2 changes: 1 addition & 1 deletion soperator/VERSION
@@ -1 +1 @@
-1.15.1
+1.15.2
2 changes: 1 addition & 1 deletion soperator/installations/example/terraform.tfvars
@@ -168,7 +168,7 @@ slurm_cluster_name = "my-amazing-slurm"

# Version of soperator.
# ---
-slurm_operator_version = "1.15.1"
+slurm_operator_version = "1.15.2"

# Type of the Slurm partition config. Could be either `default` or `custom`.
# By default, "default".
4 changes: 4 additions & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/.gitignore
@@ -0,0 +1,4 @@
gpt3-*.out
result/
api_logs/

129 changes: 129 additions & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile
@@ -0,0 +1,129 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.


ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:24.04-py3
FROM ${FROM_IMAGE_NAME}

# Document build setup
ARG FROM_IMAGE_NAME
ENV CUSTOM_FROM_IMAGE_NAME=${FROM_IMAGE_NAME}

# Custom libraries version
WORKDIR /workspace/

## 1. Apex
ARG APEX_REVISION=SKIP
ENV CUSTOM_APEX_REVISION=${APEX_REVISION}
ARG APEX_MAX_JOBS=4

RUN if [ "${APEX_REVISION}" != SKIP ]; then \
git clone https://github.com/NVIDIA/apex && \
cd apex && \
echo APEX_REVISION=${APEX_REVISION} && \
git checkout ${APEX_REVISION} && \
echo APEX_COMMIT_HASH=$(git rev-parse HEAD) && \
MAX_JOBS=${APEX_MAX_JOBS} NVCC_APPEND_FLAGS="--threads 8" pip install -v --no-build-isolation --no-cache-dir --disable-pip-version-check --config-settings "--build-option=--cpp_ext --cuda_ext --bnp --xentropy --deprecated_fused_adam --deprecated_fused_lamb --fast_multihead_attn --distributed_lamb --fast_layer_norm --transducer --distributed_adam --fmha --fast_bottleneck --nccl_p2p --peer_memory --permutation_search --focal_loss --fused_conv_bias_relu --index_mul_2d --cudnn_gbn --group_norm" . \
; fi

## 2. Transformer Engine
ARG TE_REVISION=v1.6rc1
ENV CUSTOM_TE_REVISION=${TE_REVISION}

RUN if [ "${TE_REVISION}" != SKIP ]; then \
NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install --force-reinstall --no-deps git+https://github.com/NVIDIA/TransformerEngine.git@${TE_REVISION} \
; fi

## 3. NeMo
ARG NEMO_REVISION=v2.0.0.rc0.beta
ENV CUSTOM_NEMO_REVISION=${NEMO_REVISION}
ARG NEMO_BASE_VERSION=r2.0.0
ENV CUSTOM_NEMO_BASE_VERSION=${NEMO_BASE_VERSION}

### Base version
RUN if [ "${NEMO_REVISION}" == SKIP ]; then \
if [ -d /opt/bignlp/NeMo ]; then \
ln -s /opt/bignlp/NeMo \
; else \
echo "Error: NEMO_REVISION=SKIP but there is no BigNLP NeMo installation in base image." && \
exit 1 \
; fi \
; else \
git clone https://github.com/NVIDIA/NeMo.git && \
cd NeMo && \
git config user.email "[email protected]" && \
git config user.name "name name" && \
echo NEMO_REVISION=${NEMO_REVISION} && \
git checkout ${NEMO_REVISION} && \
echo NEMO_COMMIT_HASH=$(git rev-parse HEAD) && \
pip uninstall -y nemo-toolkit && \
pip install "cython<3.0.0" && \
pip install --no-build-isolation -e ".[nlp]" \
; fi

### Make (has to be called after all changes to the repo)
RUN cd NeMo && \
cd nemo/collections/nlp/data/language_modeling/megatron && \
make

## 4. Megatron-core
ARG MEGATRON_REVISION=core_v0.7.0.beta
ENV CUSTOM_MEGATRON_REVISION=${MEGATRON_REVISION}

RUN if [ "${MEGATRON_REVISION}" != SKIP ]; then \
pip uninstall -y megatron-core && \
git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git config user.email "[email protected]" && \
git config user.name "Docker Build" && \
git checkout ${CUSTOM_MEGATRON_REVISION} && \
echo MEGATRON_COMMIT_HASH=$(git rev-parse HEAD) && \
pip install . && \
cd megatron/core/datasets && \
make \
; fi

ENV PYTHONPATH "${PYTHONPATH}:/workspace/Megatron-LM"

## 5. Benchmark dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

## 6. Use nccl-rdma-sharp-plugins from master to pick up a fix made after the HPC-X 2.18 release
RUN rm -rf /opt/hpcx/nccl_rdma_sharp_plugin && \
ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so && \
git clone https://github.com/Mellanox/nccl-rdma-sharp-plugins && \
cd nccl-rdma-sharp-plugins/ && \
./autogen.sh && \
./configure --prefix=/opt/hpcx/nccl_rdma_sharp_plugin --with-cuda=/usr/local/cuda --with-sharp=/opt/hpcx/sharp/ && \
make -j install && \
cd ../ && \
rm -rf nccl-rdma-sharp-plugins/

# Pin dependencies whose newest versions don't work
RUN pip install huggingface_hub==0.23.2
RUN pip install -v "transformers<=4.40.2"

# Benchmark code
WORKDIR /workspace/llm

COPY . .
ENV PYTHONPATH "/workspace/llm:/workspace/NeMo:${PYTHONPATH}"

103 changes: 103 additions & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/README.md
@@ -0,0 +1,103 @@
# 1. Problem
Large Language Model - GPT3 175B

## Requirements
* [PyTorch 24.04-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* Slurm with [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) (multi-node)
* [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) (for container build)

# 2. Directions

## Steps to run the benchmark

### Steps to configure the training setup
Launch configuration and system-specific hyperparameters for the appropriate
NVIDIA DGX submission are in the `config_DGXH100_*.sh` scripts.

Data-related variables (`PREPROC_DATA`, `SPM`, `LOAD_CHECKPOINTS_PATH`) are not
covered in the config files and must be set separately, as shown in the sketch below.
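
A minimal sketch of setting these variables, assuming the data lives under a hypothetical `/data/gpt3` prefix (substitute your actual dataset, tokenizer, and checkpoint locations):
```
export PREPROC_DATA=/data/gpt3/preprocessed_c4_spm       # preprocessed C4 dataset directory
export SPM=/data/gpt3/spm/c4_en_301_5Mexp2_spm.model     # tokenizer model file
export LOAD_CHECKPOINTS_PATH=/data/gpt3/checkpoints      # parent dir of the reference checkpoint
```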

### Steps to launch training

1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:large_language_model-pytorch .
docker push <docker/registry>/mlperf-nvidia:large_language_model-pytorch
```
2. Launch the training:
```
source config_DGXH100_64x8x128x4x8_mbs1.sh # use appropriate config
CONT="<docker/registry>/mlperf-nvidia:large_language_model-pytorch LOGDIR=<path/to/output/dir> PREPROC_DATA=<path/to/dataset> SPM=<path/to/tokenizer/model> LOAD_CHECKPOINTS_PATH=<path/to/checkpoint> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
### Hyperparameter settings

Hyperparameters are recorded in the `config_*.sh` files for each configuration and in `run_and_time.sh`.
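
For orientation, here is a minimal sketch of the kind of values such a config script exports. `DGXNNODES` and `WALLTIME` are consumed by the launch command above, and the parallelism variable names follow the config naming convention described below; the exact values shown here are illustrative:
```
export DGXNNODES=64                # number of nodes, passed to sbatch -N
export WALLTIME=04:00:00           # job time limit, passed to sbatch -t
export MICRO_BATCH_SIZE=1
export TENSOR_MODEL_PARALLEL=4
export PIPELINE_MODEL_PARALLEL=8
```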

# 3. Dataset/Environment
Please refer to the [instructions](https://github.com/mlcommons/training/blob/master/large_language_model/megatron-lm/README.md#3-datasetenvironment) from the reference to download the dataset.
The separate tokenizer download can be skipped - the unpacked dataset archive already contains the required tokenizer `c4_en_301_5Mexp2_spm.model` in the `spm` directory.

The C4 dataset location (the `preprocessed_c4_spm` directory) should be set as the `PREPROC_DATA` variable, and the tokenizer location (the `c4_en_301_5Mexp2_spm.model` **file**) as the `SPM` variable.

# 4. Model
### Publication/Attribution
[Megatron](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/nemo_megatron/intro.html) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository uses [NeMo Megatron](https://github.com/NVIDIA/NeMo). NeMo Megatron GPT has been integrated with [NVIDIA Transformer Engine](https://github.com/NVIDIA/TransformerEngine). Transformer Engine enables FP8 training on NVIDIA Hopper GPUs.

### List of Layers

The model largely follows the GPT3 [paper](https://arxiv.org/abs/2005.14165); refer [here](https://github.com/mlcommons/training/tree/master/large_language_model/megatron-lm#list-of-layers) for model details.

### Model checkpoint
In the benchmarking region, we resume training from a reference checkpoint that was trained with a global batch size of 1536 for 4000 iterations.

Please refer to the [instructions](https://github.com/mlcommons/training/blob/master/large_language_model/megatron-lm/README.md#checkpoint-download) from the reference to download the BF16 model checkpoint.
The postprocessing step can be skipped - the `gpt3/megatron-lm/checkpoint_nemo_bf16.tar` archive is already NeMo-compatible once unpacked.

The `LOAD_CHECKPOINTS_PATH` variable should be set to the **parent** directory of the `ckpt4000-consumed_samples=0` checkpoint.
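
For example, assuming the same hypothetical `/data/gpt3` prefix as above:
```
/data/gpt3/checkpoints/                  # set LOAD_CHECKPOINTS_PATH to this directory
└── ckpt4000-consumed_samples=0/         # the unpacked reference checkpoint
```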

For more details on the checkpoint format, please refer to the reference checkpoint [description](https://github.com/mlcommons/training/blob/master/large_language_model/megatron-lm/README.md#model-checkpoint).

# 5. Quality

### Quality metric
Log Perplexity

### Quality target
2.69

### Evaluation frequency
Evaluate after every 24576 samples (≈50.33M tokens at a sequence length of 2048)

### Evaluation thoroughness
Evaluation is performed on a validation subset consisting of 24567 examples.


# 6. Additional notes

### Config naming convention

`<number of nodes DGXNNODES>x<number of gpus per node>x<mini batch size>x<tensor parallelism TENSOR_MODEL_PARALLEL>x<pipeline parallelism PIPELINE_MODEL_PARALLEL>`

```
MP = TP * PP
DP = WS // MP
miniBS = GBS // DP
```
where:
```
MP = model parallelism
TP = tensor parallelism
PP = pipeline parallelism
DP = data parallelism
WS = world size (number of nodes x number of gpus per node)
GBS = global batch size
```
Note: changing `MICRO_BATCH_SIZE` doesn't affect GBS or any of the above parameters.
Effectively it controls gradient accumulation (`GA = miniBS // microBS`).

An additional requirement for every config is that GBS must be divisible by `DP*PP*MICRO_BATCH_SIZE`; see the worked example below.
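
As a worked example under the stated convention, decoding the `config_DGXH100_64x8x128x4x8_mbs1.sh` config used above (GBS follows from `miniBS = GBS // DP`):
```
nodes = 64, gpus per node = 8   =>  WS  = 64 * 8 = 512
TP = 4, PP = 8                  =>  MP  = 4 * 8 = 32
                                    DP  = 512 // 32 = 16
miniBS = 128                    =>  GBS = 128 * 16 = 2048
MICRO_BATCH_SIZE = 1 (mbs1)     =>  GA  = 128 // 1 = 128
check: 2048 % (DP*PP*MICRO_BATCH_SIZE = 16*8*1 = 128) = 0
```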

### Seeds
NeMo generates the shuffled dataset index on a single process only and embeds the `SEED` value in the index file name.
Thus, all processes must use the same `SEED` value; otherwise they will not be able to read the data.
The `SEED` environment variable can be set prior to launching the job (see the sketch below); otherwise it is set in `run.sub`.
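
A minimal sketch of pinning the seed at launch time; the value `12345` is an arbitrary example, and `sbatch` propagates the exported environment to the job by default:
```
export SEED=12345
sbatch -N $DGXNNODES -t $WALLTIME run.sub
```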
1 change: 1 addition & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION
@@ -0,0 +1 @@
4.0-16
17 changes: 17 additions & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/build_and_push.sh
@@ -0,0 +1,17 @@
#!/bin/bash

# Build the benchmark container image, tag it, and push it to the registry.
set -e

REGISTRY="cr.ai.nebius.cloud/crnbu823dealq64cp1s6"
REPOSITORY="nvidia-megatron"
TAG=$(cat ./VERSION)

echo "Build image"
docker build -f ./Dockerfile -t $REPOSITORY:$TAG --platform linux/amd64 -m 64G .

echo "Tag image"
docker tag $REPOSITORY:$TAG $REGISTRY/$REPOSITORY:$TAG

echo "Push image"
docker push $REGISTRY/$REPOSITORY:$TAG
