Merge pull request #77 from nebius/release/soperator
Release soperator 1.15.2
asteny authored Nov 12, 2024
2 parents b7c212a + ef5c57f commit 355430c
Showing 52 changed files with 5,400 additions and 2 deletions.
2 changes: 1 addition & 1 deletion soperator/VERSION
@@ -1 +1 @@
-1.15.1
+1.15.2
2 changes: 1 addition & 1 deletion soperator/installations/example/terraform.tfvars
@@ -168,7 +168,7 @@ slurm_cluster_name = "my-amazing-slurm"

# Version of soperator.
# ---
-slurm_operator_version = "1.15.1"
+slurm_operator_version = "1.15.2"

# Type of the Slurm partition config. Could be either `default` or `custom`.
# By default, "default".
4 changes: 4 additions & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/.gitignore
@@ -0,0 +1,4 @@
gpt3-*.out
result/
api_logs/

129 changes: 129 additions & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile
@@ -0,0 +1,129 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.


ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:24.04-py3
FROM ${FROM_IMAGE_NAME}

# Document build setup
ARG FROM_IMAGE_NAME
ENV CUSTOM_FROM_IMAGE_NAME=${FROM_IMAGE_NAME}

# Custom libraries version
WORKDIR /workspace/

## 1. Apex
ARG APEX_REVISION=SKIP
ENV CUSTOM_APEX_REVISION=${APEX_REVISION}
ARG APEX_MAX_JOBS=4

RUN if [ "${APEX_REVISION}" != SKIP ]; then \
git clone https://github.com/NVIDIA/apex && \
cd apex && \
echo APEX_REVISION=${APEX_REVISION} && \
git checkout ${APEX_REVISION} && \
echo APEX_COMMIT_HASH=$(git rev-parse HEAD) && \
MAX_JOBS=${APEX_MAX_JOBS} NVCC_APPEND_FLAGS="--threads 8" pip install -v --no-build-isolation --no-cache-dir --disable-pip-version-check --config-settings "--build-option=--cpp_ext --cuda_ext --bnp --xentropy --deprecated_fused_adam --deprecated_fused_lamb --fast_multihead_attn --distributed_lamb --fast_layer_norm --transducer --distributed_adam --fmha --fast_bottleneck --nccl_p2p --peer_memory --permutation_search --focal_loss --fused_conv_bias_relu --index_mul_2d --cudnn_gbn --group_norm" . \
; fi

## 2. Transformer Engine
ARG TE_REVISION=v1.6rc1
ENV CUSTOM_TE_REVISION=${TE_REVISION}

RUN if [ "${TE_REVISION}" != SKIP ]; then \
NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install --force-reinstall --no-deps git+https://github.com/NVIDIA/TransformerEngine.git@${TE_REVISION} \
; fi

## 3. NeMo
ARG NEMO_REVISION=v2.0.0.rc0.beta
ENV CUSTOM_NEMO_REVISION=${NEMO_REVISION}
ARG NEMO_BASE_VERSION=r2.0.0
ENV CUSTOM_NEMO_BASE_VERSION=${NEMO_BASE_VERSION}

### Base version
RUN if [ "${NEMO_REVISION}" == SKIP ]; then \
if [ -d /opt/bignlp/NeMo ]; then \
ln -s /opt/bignlp/NeMo \
; else \
echo "Error: NEMO_REVISION=SKIP but there is no BigNLP NeMo installation in base image." && \
exit 1 \
; fi \
; else \
git clone https://github.com/NVIDIA/NeMo.git && \
cd NeMo && \
git config user.email "[email protected]" && \
git config user.name "name name" && \
echo NEMO_REVISION=${NEMO_REVISION} && \
git checkout ${NEMO_REVISION} && \
echo NEMO_COMMIT_HASH=$(git rev-parse HEAD) && \
pip uninstall -y nemo-toolkit && \
pip install "cython<3.0.0" && \
pip install --no-build-isolation -e ".[nlp]" \
; fi

### Make (has to be called after all changes to the repo)
RUN cd NeMo && \
cd nemo/collections/nlp/data/language_modeling/megatron && \
make

## 4. Megatron-core
ARG MEGATRON_REVISION=core_v0.7.0.beta
ENV CUSTOM_MEGATRON_REVISION=${MEGATRON_REVISION}

RUN if [ "${MEGATRON_REVISION}" != SKIP ]; then \
pip uninstall -y megatron-core && \
git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git config user.email "[email protected]" && \
git config user.name "Docker Build" && \
git checkout ${CUSTOM_MEGATRON_REVISION} && \
echo MEGATRON_COMMIT_HASH=$(git rev-parse HEAD) && \
pip install . && \
cd megatron/core/datasets && \
make \
; fi

ENV PYTHONPATH "${PYTHONPATH}:/workspace/Megatron-LM"

## 5. Benchmark dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

## 6. Use nccl-rdma-sharp-plugins from master to pick up a fix made after the HPC-X 2.18 release
RUN rm -rf /opt/hpcx/nccl_rdma_sharp_plugin && \
ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so && \
git clone https://github.com/Mellanox/nccl-rdma-sharp-plugins && \
cd nccl-rdma-sharp-plugins/ && \
./autogen.sh && \
./configure --prefix=/opt/hpcx/nccl_rdma_sharp_plugin --with-cuda=/usr/local/cuda --with-sharp=/opt/hpcx/sharp/ && \
make -j install && \
cd ../ && \
rm -rf nccl-rdma-sharp-plugins/

# Pin dependencies whose newest versions don't work
RUN pip install huggingface_hub==0.23.2
RUN pip install -v "transformers<=4.40.2"

# Benchmark code
WORKDIR /workspace/llm

COPY . .
ENV PYTHONPATH "/workspace/llm:/workspace/NeMo:${PYTHONPATH}"

103 changes: 103 additions & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/README.md
@@ -0,0 +1,103 @@
# 1. Problem
Large Language Model - GPT3 175B

## Requirements
* [PyTorch 24.04-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* Slurm with [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) (multi-node)
* [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) (for container build)

# 2. Directions

## Steps to run the benchmark

### Steps to configure the training setup
Launch configuration and system-specific hyperparameters for the appropriate
NVIDIA DGX submission are in the `config_DGXH100_*.sh` scripts.

Data-related variables (`PREPROC_DATA`, `SPM`, `LOAD_CHECKPOINTS_PATH`) are not
covered in the config files and must be set separately, as shown in the sketch below.
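
A minimal sketch of setting these variables, assuming the data lives under a hypothetical `/data/gpt3` prefix (substitute your actual dataset, tokenizer, and checkpoint locations):
```
export PREPROC_DATA=/data/gpt3/preprocessed_c4_spm       # preprocessed C4 dataset directory
export SPM=/data/gpt3/spm/c4_en_301_5Mexp2_spm.model     # tokenizer model file
export LOAD_CHECKPOINTS_PATH=/data/gpt3/checkpoints      # parent dir of the reference checkpoint
```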

### Steps to launch training

1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:large_language_model-pytorch .
docker push <docker/registry>/mlperf-nvidia:large_language_model-pytorch
```
2. Launch the training:
```
source config_DGXH100_64x8x128x4x8_mbs1.sh # use appropriate config
CONT="<docker/registry>/mlperf-nvidia:large_language_model-pytorch LOGDIR=<path/to/output/dir> PREPROC_DATA=<path/to/dataset> SPM=<path/to/tokenizer/model> LOAD_CHECKPOINTS_PATH=<path/to/checkpoint> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
### Hyperparameter settings

Hyperparameters are recorded in the `config_*.sh` files for each configuration and in `run_and_time.sh`.
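
For orientation, here is a minimal sketch of the kind of values such a config script exports. `DGXNNODES` and `WALLTIME` are consumed by the launch command above, and the parallelism variable names follow the config naming convention described below; the exact values shown here are illustrative:
```
export DGXNNODES=64                # number of nodes, passed to sbatch -N
export WALLTIME=04:00:00           # job time limit, passed to sbatch -t
export MICRO_BATCH_SIZE=1
export TENSOR_MODEL_PARALLEL=4
export PIPELINE_MODEL_PARALLEL=8
```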

# 3. Dataset/Environment
Please refer to the [instructions](https://github.com/mlcommons/training/blob/master/large_language_model/megatron-lm/README.md#3-datasetenvironment) from the reference to download the dataset.
The separate tokenizer download can be skipped - the unpacked dataset archive already contains the required tokenizer `c4_en_301_5Mexp2_spm.model` in the `spm` directory.

The C4 dataset location (the `preprocessed_c4_spm` directory) should be set as the `PREPROC_DATA` variable, and the tokenizer location (the `c4_en_301_5Mexp2_spm.model` **file**) as the `SPM` variable.

# 4. Model
### Publication/Attribution
[Megatron](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/nemo_megatron/intro.html) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository uses [NeMo Megatron](https://github.com/NVIDIA/NeMo). NeMo Megatron GPT has been integrated with [NVIDIA Transformer Engine](https://github.com/NVIDIA/TransformerEngine). Transformer Engine enables FP8 training on NVIDIA Hopper GPUs.

### List of Layers

The model largely follows the GPT3 [paper](https://arxiv.org/abs/2005.14165); refer [here](https://github.com/mlcommons/training/tree/master/large_language_model/megatron-lm#list-of-layers) for model details.

### Model checkpoint
In the benchmarking region, we resume training from a reference checkpoint that was trained with a global batch size of 1536 for 4000 iterations.

Please refer to the [instructions](https://github.com/mlcommons/training/blob/master/large_language_model/megatron-lm/README.md#checkpoint-download) from the reference to download the BF16 model checkpoint.
The postprocessing step can be skipped - the `gpt3/megatron-lm/checkpoint_nemo_bf16.tar` archive is already NeMo-compatible once unpacked.

The `LOAD_CHECKPOINTS_PATH` variable should be set to the **parent** directory of the `ckpt4000-consumed_samples=0` checkpoint.
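
For example, assuming the same hypothetical `/data/gpt3` prefix as above:
```
/data/gpt3/checkpoints/                  # set LOAD_CHECKPOINTS_PATH to this directory
└── ckpt4000-consumed_samples=0/         # the unpacked reference checkpoint
```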

For more details on the checkpoint format, please refer to the reference checkpoint [description](https://github.com/mlcommons/training/blob/master/large_language_model/megatron-lm/README.md#model-checkpoint).

# 5. Quality

### Quality metric
Log Perplexity

### Quality target
2.69

### Evaluation frequency
Evaluate after every 24576 samples (≈50.33M tokens at a sequence length of 2048)

### Evaluation thoroughness
Evaluation is performed on a validation subset consisting of 24567 examples.


# 6. Additional notes

### Config naming convention

`<number of nodes DGXNNODES>x<number of gpus per node>x<mini batch size>x<tensor parallelism TENSOR_MODEL_PARALLEL>x<pipeline parallelism PIPELINE_MODEL_PARALLEL>`

```
MP = TP * PP
DP = WS // MP
miniBS = GBS // DP
```
where:
```
MP = model parallelism
TP = tensor parallelism
PP = pipeline parallelism
DP = data parallelism
WS = world size (number of nodes x number of gpus per node)
GBS = global batch size
```
Note: changing `MICRO_BATCH_SIZE` doesn't affect GBS or any of the above parameters.
Effectively it controls gradient accumulation (`GA = miniBS // microBS`).

An additional requirement for every config is that GBS must be divisible by `DP*PP*MICRO_BATCH_SIZE`; see the worked example below.
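
As a worked example under the stated convention, decoding the `config_DGXH100_64x8x128x4x8_mbs1.sh` config used above (GBS follows from `miniBS = GBS // DP`):
```
nodes = 64, gpus per node = 8   =>  WS  = 64 * 8 = 512
TP = 4, PP = 8                  =>  MP  = 4 * 8 = 32
                                    DP  = 512 // 32 = 16
miniBS = 128                    =>  GBS = 128 * 16 = 2048
MICRO_BATCH_SIZE = 1 (mbs1)     =>  GA  = 128 // 1 = 128
check: 2048 % (DP*PP*MICRO_BATCH_SIZE = 16*8*1 = 128) = 0
```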

### Seeds
NeMo generates the shuffled dataset index on a single process only and embeds the `SEED` value in the index file name.
Thus, all processes must use the same `SEED` value; otherwise they will not be able to read the data.
The `SEED` environment variable can be set prior to launching the job (see the sketch below); otherwise it is set in `run.sub`.
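
A minimal sketch of pinning the seed at launch time; the value `12345` is an arbitrary example, and `sbatch` propagates the exported environment to the job by default:
```
export SEED=12345
sbatch -N $DGXNNODES -t $WALLTIME run.sub
```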
1 change: 1 addition & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION
@@ -0,0 +1 @@
4.0-16
17 changes: 17 additions & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/build_and_push.sh
@@ -0,0 +1,17 @@
#!/bin/bash

# Build the benchmark container image, tag it, and push it to the registry.
set -e

REGISTRY="cr.ai.nebius.cloud/crnbu823dealq64cp1s6"
REPOSITORY="nvidia-megatron"
TAG=$(cat ./VERSION)

echo "Build image"
docker build -f ./Dockerfile -t $REPOSITORY:$TAG --platform linux/amd64 -m 64G .

echo "Tag image"
docker tag $REPOSITORY:$TAG $REGISTRY/$REPOSITORY:$TAG

echo "Push image"
docker push $REGISTRY/$REPOSITORY:$TAG
