Merge pull request #77 from nebius/release/soperator
Release soperator 1.15.2
Showing 52 changed files with 5,400 additions and 2 deletions.
```
@@ -1 +1 @@
-1.15.1
+1.15.2
```
```
@@ -0,0 +1,4 @@
gpt3-*.out
result/
api_logs/
```
```
@@ -0,0 +1,129 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:24.04-py3
FROM ${FROM_IMAGE_NAME}

# Document build setup
ARG FROM_IMAGE_NAME
ENV CUSTOM_FROM_IMAGE_NAME ${FROM_IMAGE_NAME}

# Custom libraries version
WORKDIR /workspace/

## 1. Apex
ARG APEX_REVISION=SKIP
ENV CUSTOM_APEX_REVISION ${APEX_REVISION}
ARG APEX_MAX_JOBS=4

RUN if [ "${APEX_REVISION}" != SKIP ]; then \
        git clone https://github.com/NVIDIA/apex && \
        cd apex && \
        echo APEX_REVISION=${APEX_REVISION} && \
        git checkout ${APEX_REVISION} && \
        echo APEX_COMMIT_HASH=$(git rev-parse HEAD) && \
        MAX_JOBS=${APEX_MAX_JOBS} NVCC_APPEND_FLAGS="--threads 8" pip install -v --no-build-isolation --no-cache-dir --disable-pip-version-check --config-settings "--build-option=--cpp_ext --cuda_ext --bnp --xentropy --deprecated_fused_adam --deprecated_fused_lamb --fast_multihead_attn --distributed_lamb --fast_layer_norm --transducer --distributed_adam --fmha --fast_bottleneck --nccl_p2p --peer_memory --permutation_search --focal_loss --fused_conv_bias_relu --index_mul_2d --cudnn_gbn --group_norm" . \
    ; fi

## 2. Transformer Engine
ARG TE_REVISION=v1.6rc1
ENV CUSTOM_TE_REVISION ${TE_REVISION}

RUN if [ "${TE_REVISION}" != SKIP ]; then \
        NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install --force-reinstall --no-deps git+https://github.com/NVIDIA/TransformerEngine.git@${TE_REVISION} \
    ; fi

## 3. NeMo
ARG NEMO_REVISION=v2.0.0.rc0.beta
ENV CUSTOM_NEMO_REVISION ${NEMO_REVISION}
ARG NEMO_BASE_VERSION=r2.0.0
ENV CUSTOM_NEMO_BASE_VERSION ${NEMO_BASE_VERSION}

### Base version
RUN if [ "${NEMO_REVISION}" == SKIP ]; then \
        if [ -d /opt/bignlp/NeMo ]; then \
            ln -s /opt/bignlp/NeMo \
        ; else \
            echo "Error: NEMO_REVISION=SKIP but there is no BigNLP NeMo installation in base image." && \
            exit 1 \
        ; fi \
    ; else \
        git clone https://github.com/NVIDIA/NeMo.git && \
        cd NeMo && \
        git config user.email "[email protected]" && \
        git config user.name "name name" && \
        echo NEMO_REVISION=${NEMO_REVISION} && \
        git checkout ${NEMO_REVISION} && \
        echo NEMO_COMMIT_HASH=$(git rev-parse HEAD) && \
        pip uninstall -y nemo-toolkit && \
        pip install "cython<3.0.0" && \
        pip install --no-build-isolation -e ".[nlp]" \
    ; fi

### Make (has to be called after all changes to repo)
RUN cd NeMo && \
    cd nemo/collections/nlp/data/language_modeling/megatron && \
    make

# 4. Megatron-core
ARG MEGATRON_REVISION=core_v0.7.0.beta
ENV CUSTOM_MEGATRON_REVISION ${MEGATRON_REVISION}

RUN if [ "${MEGATRON_REVISION}" != SKIP ]; then \
        pip uninstall -y megatron-core && \
        git clone https://github.com/NVIDIA/Megatron-LM.git && \
        cd Megatron-LM && \
        git config user.email "[email protected]" && \
        git config user.name "Docker Build" && \
        git checkout ${CUSTOM_MEGATRON_REVISION} && \
        echo MEGATRON_COMMIT_HASH=$(git rev-parse HEAD) && \
        pip install . && \
        cd megatron/core/datasets && \
        make \
    ; fi

ENV PYTHONPATH "${PYTHONPATH}:/workspace/Megatron-LM"

## 5. Benchmark dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

## 6. Use nccl-rdma-sharp-plugins from master to pick a fix after HPCX2.18 release
RUN rm -rf /opt/hpcx/nccl_rdma_sharp_plugin && \
    ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so && \
    git clone https://github.com/Mellanox/nccl-rdma-sharp-plugins && \
    cd nccl-rdma-sharp-plugins/ && \
    ./autogen.sh && \
    ./configure --prefix=/opt/hpcx/nccl_rdma_sharp_plugin --with-cuda=/usr/local/cuda --with-sharp=/opt/hpcx/sharp/ && \
    make -j install && \
    cd ../ && \
    rm -rf nccl-rdma-sharp-plugins/

# Fix dependencies that don't work from the newest versions
RUN pip install huggingface_hub==0.23.2
RUN pip install -v "transformers<=4.40.2"

# Benchmark code
WORKDIR /workspace/llm

COPY . .
ENV PYTHONPATH "/workspace/llm:/workspace/NeMo:${PYTHONPATH}"
```
@@ -0,0 +1,103 @@

# 1. Problem
Large Language Model - GPT3 175B

## Requirements
* [PyTorch 24.04-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* Slurm with [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) (multi-node)
* [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) (for container build)

# 2. Directions

## Steps to run the benchmark

### Steps to configure the training setup
Launch configuration and system-specific hyperparameters for the appropriate
NVIDIA DGX submission are in the `config_DGXH100_*.sh` scripts.

Data-related variables (`PREPROC_DATA`, `SPM`, `LOAD_CHECKPOINTS_PATH`) are not
covered in the config files and must be set separately, for example as in the sketch below.
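A minimal sketch of exporting these variables before launch; the paths here are hypothetical placeholders, not locations shipped with the benchmark:
```
# Hypothetical example paths - point these at your actual data locations.
export PREPROC_DATA=/data/gpt3/preprocessed_c4_spm        # preprocessed C4 dataset directory
export SPM=/data/gpt3/spm/c4_en_301_5Mexp2_spm.model      # SentencePiece tokenizer model file
export LOAD_CHECKPOINTS_PATH=/data/gpt3/checkpoints       # parent dir of ckpt4000-consumed_samples=0
```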

### Steps to launch training

1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:large_language_model-pytorch .
docker push <docker/registry>/mlperf-nvidia:large_language_model-pytorch
```
2. Launch the training:
```
source config_DGXH100_64x8x128x4x8_mbs1.sh  # use appropriate config
CONT="<docker/registry>/mlperf-nvidia:large_language_model-pytorch" LOGDIR=<path/to/output/dir> PREPROC_DATA=<path/to/dataset> SPM=<path/to/tokenizer/model> LOAD_CHECKPOINTS_PATH=<path/to/checkpoint> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```

### Hyperparameter settings

Hyperparameters are recorded in the `config_*.sh` files for each configuration and in `run_and_time.sh`.

# 3. Dataset/Environment
Please refer to the [instructions](https://github.com/mlcommons/training/blob/master/large_language_model/megatron-lm/README.md#3-datasetenvironment) from the reference to download the dataset.
Downloading the tokenizer files separately can be skipped - the unpacked dataset archive already contains the required tokenizer `c4_en_301_5Mexp2_spm.model` in the `spm` directory.

The C4 dataset location (the `preprocessed_c4_spm` directory) should be set as the `PREPROC_DATA` variable and the tokenizer location (the `c4_en_301_5Mexp2_spm.model` **file**) as the `SPM` variable.

# 4. Model
### Publication/Attribution
[Megatron](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/nemo_megatron/intro.html) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository uses [NeMo Megatron](https://github.com/NVIDIA/NeMo). NeMo Megatron GPT has been integrated with [NVIDIA Transformer Engine](https://github.com/NVIDIA/TransformerEngine). Transformer Engine enables FP8 training on NVIDIA Hopper GPUs.

### List of Layers

The model largely follows the GPT3 [paper](https://arxiv.org/abs/2005.14165); refer [here](https://github.com/mlcommons/training/tree/master/large_language_model/megatron-lm#list-of-layers) for model details.

### Model checkpoint
In the benchmarking region, we resume training from a reference checkpoint that was trained with a global batch size of 1536 for 4000 iterations.

Please refer to the [instructions](https://github.com/mlcommons/training/blob/master/large_language_model/megatron-lm/README.md#checkpoint-download) from the reference to download the BF16 model checkpoint.
The postprocessing step can be skipped - `gpt3/megatron-lm/checkpoint_nemo_bf16.tar` is already NeMo-compatible after unpacking.

The `LOAD_CHECKPOINTS_PATH` variable should be set to the **parent** directory of the `ckpt4000-consumed_samples=0` checkpoint, as in the sketch below.
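A hypothetical layout (only the checkpoint directory name comes from the reference; the parent path is an arbitrary example):
```
/data/gpt3/checkpoints/              <- set LOAD_CHECKPOINTS_PATH to this directory
└── ckpt4000-consumed_samples=0/     <- the unpacked reference checkpoint
```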

For more details on the checkpoint format, please refer to the reference checkpoint [description](https://github.com/mlcommons/training/blob/master/large_language_model/megatron-lm/README.md#model-checkpoint).

# 5. Quality

### Quality metric
Log Perplexity

### Quality target
2.69

### Evaluation frequency
Evaluate after every 24576 samples (=50.33M tokens at a sequence length of 2048)

### Evaluation thoroughness
Evaluation on the validation subset that consists of 24567 examples.

# 6. Additional notes

### Config naming convention

`<number of nodes DGXNNODES>x<number of gpus per node>x<mini batch size>x<tensor parallelism TENSOR_MODEL_PARALLEL>x<pipeline parallelism PIPELINE_MODEL_PARALLEL>`

```
MP = TP * PP
DP = WS // MP
miniBS = GBS // DP
```
where:
```
MP = model parallelism
TP = tensor parallelism
PP = pipeline parallelism
DP = data parallelism
WS = world size (number of nodes x number of gpus per node)
GBS = global batch size
```
Note: changing `MICRO_BATCH_SIZE` doesn't affect GBS or any of the above parameters.
Effectively it controls gradient accumulation (`GA = miniBS // microBS`).

An additional requirement for every config is that GBS must be divisible by `DP*PP*MICRO_BATCH_SIZE`; the sketch below works through these relations for one of the shipped configs.
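As an illustration, a minimal shell sketch for `config_DGXH100_64x8x128x4x8_mbs1.sh`: the values are read off the config name per the convention above (64 nodes, 8 GPUs per node, mini batch size 128, TP=4, PP=8), and `_mbs1` is assumed to mean `MICRO_BATCH_SIZE=1`:
```
# Values taken from the config name config_DGXH100_64x8x128x4x8_mbs1.sh
NODES=64; GPUS_PER_NODE=8; MINIBS=128; TP=4; PP=8; MICRO_BS=1

WS=$((NODES * GPUS_PER_NODE))   # world size: 512
MP=$((TP * PP))                 # model parallelism: 32
DP=$((WS / MP))                 # data parallelism: 16
GBS=$((MINIBS * DP))            # global batch size: 2048 (from miniBS = GBS // DP)
GA=$((MINIBS / MICRO_BS))       # gradient accumulation steps: 128

# Divisibility requirement: GBS % (DP*PP*MICRO_BATCH_SIZE) == 0, here 2048 % 128 == 0
(( GBS % (DP * PP * MICRO_BS) == 0 )) && echo "config OK: WS=$WS MP=$MP DP=$DP GBS=$GBS GA=$GA"
```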

### Seeds
NeMo generates the dataset index shuffling on only one process and embeds the `SEED` value in the file name.
Thus, all processes must use the same `SEED` value; otherwise they will not be able to read the data.
The `SEED` environment variable can be set prior to launching the job; otherwise it is set in `run.sub`.
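For example, the seed could be pinned at submission time like this (a minimal sketch; `12345` is an arbitrary value, and it relies on sbatch's default behavior of exporting the submission environment to the job):
```
export SEED=12345   # every process must see the same value, or the shuffled index file won't be found
sbatch -N $DGXNNODES -t $WALLTIME run.sub
```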
```
@@ -0,0 +1 @@
4.0-16
```
```
@@ -0,0 +1,17 @@
#!/bin/bash

set -e

REGISTRY="cr.ai.nebius.cloud/crnbu823dealq64cp1s6"
REPOSITORY="nvidia-megatron"
TAG=$(cat ./VERSION)

echo "Build image"
docker build -f ./Dockerfile -t $REPOSITORY:$TAG --platform linux/amd64 -m 64G .

echo "Tag image"
docker tag $REPOSITORY:$TAG $REGISTRY/$REPOSITORY:$TAG

echo "Push image"
docker push $REGISTRY/$REPOSITORY:$TAG
```