
Commit 024d188

Merge remote-tracking branch 'xinhao/xinhao_candle' into xinhao_inference

2 parents d54e4b6 + f65044d
xinhaoc committed May 17, 2024
Showing 55 changed files with 1,377 additions and 310 deletions.
94 changes: 82 additions & 12 deletions CMakeLists.txt
@@ -168,6 +168,17 @@ if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
include(cudnn)
endif()

# NCCL
if(FF_USE_NCCL)
if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda")
include(nccl)
endif()
list(APPEND FF_CC_FLAGS
-DFF_USE_NCCL)
list(APPEND FF_NVCC_FLAGS
-DFF_USE_NCCL)
endif()

# Legion
include(legion)

@@ -383,19 +394,78 @@ if(NOT BUILD_LEGION_ONLY)
add_dependencies(flexflow ${NCCL_NAME})
endif()

target_include_directories(flexflow PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
# LEGION_URL is defined if we found a precompiled Legion library to download
if(LEGION_URL)
# Legion builds produce two library files: one for the Legion runtime and one for the Realm runtime.
# When linking FlexFlow to a precompiled version of Legion, we need to manually link to both library files.
target_link_libraries(flexflow ${LEGION_LIBRARY} ${REALM_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
add_dependencies(flexflow ${LEGION_NAME})
else()
# When building Legion from source, we do so by calling add_subdirectory(), and obtain a library with both the
# Legion and Realm runtimes. The library's name is saved into the LEGION_LIBRARY variable. Hence, we only need
# to link FlexFlow to ${LEGION_LIBRARY}
target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/hip ${ROCM_PATH})

find_package(hip REQUIRED)

if (FF_GPU_BACKEND STREQUAL "hip_cuda")
# The targets defined by the hip cmake config only target amd devices.
# For targeting nvidia devices, we'll make our own interface target,
# hip_device_nvidia, that includes the rocm and hip headers.
add_library(hip_device_nvidia INTERFACE)

if (NOT FF_CUDA_ARCH STREQUAL "")
target_compile_options(hip_device_nvidia INTERFACE -arch=compute_${FF_CUDA_ARCH})
endif()

target_include_directories(hip_device_nvidia SYSTEM INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include)
target_include_directories(hip_device_nvidia INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include)

add_compile_definitions(FF_USE_HIP_CUDA)

# Linking cuda:
# We do not explicitly link cuda. hipcc when targeting nvidia will
# use nvcc under the hood. nvcc when used for linking will handle
# linking cuda dependencies
target_link_libraries(flexflow hip_device_nvidia)
elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
find_package(hipblas REQUIRED)
find_package(miopen REQUIRED)
if(FF_USE_NCCL)
find_package(rccl REQUIRED)
endif()
# find_package(rocrand REQUIRED)
find_library(HIP_RAND_LIBRARY hiprand REQUIRED)

add_compile_definitions(FF_USE_HIP_ROCM)
# The hip cmake config module defines three targets,
# hip::amdhip64, hip::host, and hip::device.
#
# hip::host and hip::device are interface targets. hip::amdhip64 is an
# imported target for libamdhip.
#
# You do not directly link to hip::amdhip64. hip::host links to hip::amdhip64
# and hip::device links to hip::host. Link to hip::host to just use hip without
# compiling any GPU code. Link to hip::device to compile the GPU device code.
#
# Docs (outdated):
# https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html
target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY})
if(FF_USE_NCCL)
target_link_libraries(flexflow rccl)
endif()
endif()
else()
message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}")
endif()

if(FF_USE_NCCL AND (FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda"))
add_dependencies(flexflow ${NCCL_NAME})
endif()

target_include_directories(flexflow PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
# LEGION_URL is defined if we found a precompiled Legion library to download
if(LEGION_URL)
# Legion builds produce two library files: one for the Legion runtime and one for the Realm runtime.
# When linking FlexFlow to a precompiled version of Legion, we need to manually link to both library files.
target_link_libraries(flexflow ${LEGION_LIBRARY} ${REALM_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
add_dependencies(flexflow ${LEGION_NAME})
else()
# When building Legion from source, we do so by calling add_subdirectory(), and obtain a library with both the
# Legion and Realm runtimes. The library's name is saved into the LEGION_LIBRARY variable. Hence, we only need
# to link FlexFlow to ${LEGION_LIBRARY}
target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
endif()

#library api version, bump from time to time
set(SOVERSION 1)
5 changes: 1 addition & 4 deletions cmake/json.cmake
@@ -1,4 +1 @@
include(FetchContent)

FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz)
FetchContent_MakeAvailable(json)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/json)
7 changes: 6 additions & 1 deletion config/config.inc
@@ -118,6 +118,11 @@ if [ "$FF_LEGION_NETWORKS" = "gasnet" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=mpi"
elif [ "$FF_GASNET_CONDUIT" = "udp" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=udp"
elif [ "$FF_GASNET_CONDUIT" = "ucx" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ucx"
SET_LEGION_NETWORKS+=" -DFF_UCX_URL=$FF_UCX_URL"
elif [ "$FF_GASNET_CONDUIT" = "ofi" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ofi"
fi
elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then
SET_LEGION_NETWORKS+=" -DFF_LEGION_NETWORKS=ucx"
@@ -235,7 +240,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then
SET_CXX="-DCMAKE_CXX_COMPILER=$(pwd)/nvidia_hipcc -DCMAKE_CXX_LINKER=$(pwd)/nvidia_hipcc"
else
ADD_ROCM_TO_PATH="PATH=${PATH}:${ROCM_PATH}/bin"
#SET_CXX="-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_CXX_LINKER=/opt/rocm/bin/hipcc"
SET_CXX="-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc -DCMAKE_CXX_LINKER=$ROCM_PATH/bin/hipcc -DHIP_PATH=$ROCM_PATH/hip -DCMAKE_CXX_FLAGS='-I${MPICH_DIR}/include' -DCMAKE_EXE_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi' -DCMAKE_SHARED_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi'"
fi
fi
fi
6 changes: 2 additions & 4 deletions config/config.linux
@@ -59,7 +59,7 @@ FF_USE_PYTHON=${FF_USE_PYTHON:-ON}
FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-}

# select GASNET conduit
FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv}
FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ofi}

# set UCX dir if Legion networks is set to ucx
UCX_DIR=${UCX_DIR:-""}
@@ -99,11 +99,9 @@ FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda}
if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then
echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid."
exit 1
elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm" ]]; then
elif [["$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm"]]; then
# enable NCCL
FF_USE_NCCL=${FF_USE_NCCL:-ON}
else
FF_USE_NCCL=OFF
fi

function get_build_configs() {
2 changes: 1 addition & 1 deletion deps/legion
Submodule legion updated from 24e8c4 to 626b55
117 changes: 73 additions & 44 deletions examples/python/pytorch/mt5/mt5_ff.py
@@ -3,16 +3,17 @@
import sys

import numpy as np
import torch
from flexflow.core import *
from flexflow.torch.model import PyTorchModel
from transformers import MT5ForConditionalGeneration, T5Tokenizer

#from transformers import MT5ForConditionalGeneration, T5Tokenizer
from transformers import BertForMaskedLM, BertTokenizer
sys.path.append("./examples/python/pytorch/mt5")
from mt5_torch import DataPreparer, get_dataloaders, set_seed

BASE_DIR = "examples/python/pytorch/mt5"
DATA_DIR = os.path.join(BASE_DIR, "data")
NUMPY_DIR = os.path.join(DATA_DIR, "numpy")
NUMPY_DIR = os.path.join(DATA_DIR, "numpy_candle")


def data_to_numpy() -> None:
@@ -28,15 +29,17 @@ def data_to_numpy() -> None:
"""
model_params = {
"SEED": 42,
"MODEL": "google/mt5-small",
#"MODEL": "google/mt5-small",
"MODEL": "bert-base-uncased",
"TRAIN_BATCH_SIZE": None, # use the full dataset as one batch
"EVAL_BATCH_SIZE": None, # use the full dataset as one batch
"TRAIN_EPOCHS": 1, # unused
"MAX_SOURCE_TEXT_LENGTH": 48,
"MAX_TARGET_TEXT_LENGTH": 48,
}
set_seed(model_params)
tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
#tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
tokenizer = BertTokenizer.from_pretrained(model_params["MODEL"])
print("Getting dataloaders...")
train_loader, eval_loader = get_dataloaders(tokenizer, model_params)
assert len(train_loader) == 1
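
For context, a minimal sketch of what the swapped-in BertTokenizer produces; the sample sentence is made up and max_length=48 is taken from MAX_SOURCE_TEXT_LENGTH above, so the shapes are illustrative rather than those of the real dataset:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Encode one sample to a fixed length, roughly the way the dataloaders would;
# arrays of this kind are what the script later reloads as
# train_input_ids.npy / train_attention_mask.npy.
enc = tokenizer("a sample sentence", max_length=48,
                padding="max_length", truncation=True, return_tensors="np")
print(enc["input_ids"].shape, enc["attention_mask"].shape)  # (1, 48) each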
@@ -61,8 +64,8 @@ def preprocess_train() -> None:
y_shape = y.shape
assert len(y.shape) == 2, \
"`y` should have shape (num examples, sequence length)"
y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long)
lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.long)
y_ids = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32)
lm_labels = np.empty((y_shape[0], y_shape[1] - 1), dtype=np.int32)
y_ids[:, :] = y[:, :-1]
lm_labels[:, :] = y[:, 1:]
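
A standalone illustration of the shift computed above, using the np.int32 dtype this change switches to in place of the deprecated np.long alias; the toy y array is made up:

import numpy as np

# Toy target batch: (num examples, sequence length).
y = np.array([[11, 12, 13, 14],
              [21, 22, 23, 24]], dtype=np.int32)

# Decoder inputs drop the last token and labels drop the first, so
# lm_labels[:, t] is the token to predict after seeing y_ids[:, :t + 1].
y_ids = np.empty((y.shape[0], y.shape[1] - 1), dtype=np.int32)
lm_labels = np.empty((y.shape[0], y.shape[1] - 1), dtype=np.int32)
y_ids[:, :] = y[:, :-1]      # [[11, 12, 13], [21, 22, 23]]
lm_labels[:, :] = y[:, 1:]   # [[12, 13, 14], [22, 23, 24]]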

@@ -81,36 +84,53 @@ def preprocess_train() -> None:
def top_level_task():
ffconfig = FFConfig()
ffmodel = FFModel(ffconfig)
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

#model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
#model = BertModel.from_pretrained("bert-base-uncased")
# Load train data as numpy arrays
print("Loading data...")
ids = np.load(os.path.join(NUMPY_DIR, "train_source_ids.npy"))
mask = np.load(os.path.join(NUMPY_DIR, "train_source_mask.npy"))
y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy"))
lm_labels = np.load(os.path.join(NUMPY_DIR, "train_lm_labels.npy"))
ids = np.load(os.path.join(NUMPY_DIR, "train_input_ids.npy")).astype('int32')
ids = np.pad(ids, ((0,0), (0,17)), 'constant')
#ids = np.random.randint(0, 5, (1000, 512))
#print('ids_shape', ids.shape)
#print('ids', ids)
mask = np.load(os.path.join(NUMPY_DIR, "train_attention_mask.npy")).astype('int32')
mask = np.pad(mask, ((0,0), (0,17)), 'constant')
#mask = np.random.randint(0, 2, (1000, 512))
#y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy"))
lm_labels = np.load(os.path.join(NUMPY_DIR, "train_labels.npy")).astype('int32')
lm_labels = np.pad(lm_labels, ((0,0), (0,17)), 'constant')
#lm_labels = np.random.randint(-1, 5, (1000, 512))
position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy()
token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy()


batch_size = ffconfig.batch_size
input_ids_shape = (batch_size, ids.shape[1])
attention_mask_shape = (batch_size, mask.shape[1])
decoder_input_ids_shape = (batch_size, y_ids.shape[1])
#decoder_input_ids_shape = (batch_size, y_ids.shape[1])
input_tensors = [
ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64), # input_ids
ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT64), # attention_mask
ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids
ffmodel.create_tensor(input_ids_shape, DataType.DT_INT32), # input_ids
ffmodel.create_tensor(attention_mask_shape, DataType.DT_INT32), # attention_mask
#ffmodel.create_tensor(decoder_input_ids_shape, DataType.DT_INT64), # decoder_input_ids
]
encoder_seq_length = ids.shape[1]
decoder_seq_length = y_ids.shape[1]
seq_length = (encoder_seq_length, decoder_seq_length)
input_names = ["input_ids", "attention_mask", "decoder_input_ids"]
#decoder_seq_length = y_ids.shape[1]
#seq_length = (encoder_seq_length, decoder_seq_length)
seq_length = encoder_seq_length
#input_names = ["input_ids", "attention_mask", "decoder_input_ids"]
input_names = ["input_ids", "attention_mask"]

print("Tracing the model...")
hf_model = PyTorchModel(
model, is_hf_model=True, input_names=input_names,
batch_size=batch_size, seq_length=seq_length,
)
output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True)
ffoptimizer = SGDOptimizer(ffmodel, lr=0.01)
#from flexflow.torch.model import file_to_ff
#file_to_ff("mt5.ff", ffmodel, input_tensors)
ffoptimizer = AdamOptimizer(ffmodel, alpha=1e-4, beta1=0.9, beta2=0.98, weight_decay=0.0, epsilon=2e-8)
# ffoptimizer = SGDOptimizer(ffmodel, lr=0.01)

print("Compiling the model...")
ffmodel.compile(
@@ -121,13 +141,21 @@ def top_level_task():
MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY,
],
)

# load weights here
ffmodel.load_bert_pretrained(checkpoint=model)

print("Creating data loaders...")
print('id_dtype', ids.dtype)
print('mask_dtype', mask.dtype)
print('labels_dtype', lm_labels.dtype)
input_ids_dl = ffmodel.create_data_loader(input_tensors[0], ids)
attention_mask_dl = ffmodel.create_data_loader(input_tensors[1], mask)
decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids)
#decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids)
# NOTE: We cast down the label tensor data to 32-bit to accommodate the
# label tensor's required dtype
token_type_ids_dl = ffmodel.create_data_loader(input_tensors[2], token_type_ids)
position_id_dl = ffmodel.create_data_loader(input_tensors[3], position_id)
labels_dl = ffmodel.create_data_loader(
ffmodel.label_tensor, lm_labels.astype("int32")
)
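
As a rough sketch of the arrays the new data loaders consume — the pre-padding width of 495 below is made up (the real width comes from the saved .npy files); only the padding and the auxiliary-id construction mirror the code above:

import numpy as np
import torch

# Hypothetical pre-padding shape: 8 examples, 495 tokens each.
ids = np.zeros((8, 495), dtype=np.int32)
mask = np.ones((8, 495), dtype=np.int32)

# Right-pad the token dimension with zeros, as the script does, so the
# arrays match the width of the FlexFlow input tensors.
ids = np.pad(ids, ((0, 0), (0, 17)), 'constant')
mask = np.pad(mask, ((0, 0), (0, 17)), 'constant')

# BERT also takes position ids (0..L-1) and token type ids (all zeros for
# single-segment input); the script builds a single (1, L) row of each.
position_id = torch.arange(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy()
token_type_ids = torch.zeros(ids.shape[1], dtype=torch.int32).expand((1, -1)).numpy()

print(ids.shape, mask.shape)                    # (8, 512) (8, 512)
print(position_id.shape, token_type_ids.shape)  # (1, 512) (1, 512)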
@@ -138,31 +166,32 @@ def top_level_task():
print("Training...")
epochs = ffconfig.epochs
ffmodel.fit(
x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl],
#x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl],
x=[input_ids_dl, attention_mask_dl, position_id_dl, token_type_ids_dl],
y=labels_dl, batch_size=batch_size, epochs=epochs,
)


if __name__ == "__main__":
# Generate the .tsv files if needed
if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \
not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")):
DataPreparer.data_to_tsv()
# Convert the .tsv files to .npy if needed
if not os.path.exists(NUMPY_DIR):
os.mkdir(NUMPY_DIR)
prefixes = ["train_", "eval_"]
suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"]
npy_filenames = [
pre + suf for pre, suf in itertools.product(prefixes, suffixes)
]
if any(
not os.path.exists(os.path.join(NUMPY_DIR, filename))
for filename in npy_filenames
):
data_to_numpy()
# Preprocess the training data if needed
if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \
not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")):
preprocess_train()
## Generate the .tsv files if needed
#if not os.path.exists(os.path.join(DATA_DIR, "train.tsv")) or \
# not os.path.exists(os.path.join(DATA_DIR, "eval.tsv")):
# DataPreparer.data_to_tsv()
## Convert the .tsv files to .npy if needed
#if not os.path.exists(NUMPY_DIR):
# os.mkdir(NUMPY_DIR)
#prefixes = ["train_", "eval_"]
#suffixes = ["source_ids.npy", "source_mask.npy", "target_ids.npy"]
#npy_filenames = [
# pre + suf for pre, suf in itertools.product(prefixes, suffixes)
#]
#if any(
# not os.path.exists(os.path.join(NUMPY_DIR, filename))
# for filename in npy_filenames
#):
# data_to_numpy()
## Preprocess the training data if needed
#if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \
# not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")):
# preprocess_train()
top_level_task()
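
With the data-preparation steps in __main__ commented out, the script now assumes the pre-generated arrays already exist under numpy_candle. A hypothetical guard (check_inputs is not part of this change) that fails fast when they are missing could look like:

import os
import sys

REQUIRED = ("train_input_ids.npy", "train_attention_mask.npy", "train_labels.npy")

def check_inputs(numpy_dir: str) -> None:
    # Fail fast with a clear message if the pre-generated BERT arrays are
    # missing, since the in-script generation path is commented out above.
    missing = [f for f in REQUIRED
               if not os.path.exists(os.path.join(numpy_dir, f))]
    if missing:
        sys.exit(f"Missing {', '.join(missing)} in {numpy_dir}; "
                 "re-enable data_to_numpy() or copy the arrays in first.")

# e.g. check_inputs(NUMPY_DIR) before calling top_level_task()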
4 changes: 2 additions & 2 deletions examples/python/pytorch/mt5/mt5_torch.py
@@ -7,7 +7,7 @@
import os

import numpy as np
import pandas as pd
#import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import MT5ForConditionalGeneration, T5Tokenizer
@@ -311,5 +311,5 @@ def TorchMT5Trainer(
"MAX_TARGET_TEXT_LENGTH": 48,
"LEARNING_RATE": 1e-4,
}
device = torch.device(0)
device = torch.device('cpu')
TorchMT5Trainer(model_params, device)
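
The reference trainer is now pinned to the CPU. If GPU execution is still wanted when one is available, a common fallback pattern (not part of this change) is:

import torch

# Use the first visible GPU when present, otherwise fall back to the CPU
# that the committed code now uses unconditionally.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')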
(Diff truncated: 48 of the 55 changed files are not shown.)
