Showing 11 changed files with 1,068 additions and 2 deletions.
Empty file.
@@ -0,0 +1,33 @@
defaults:
  - _self_
  - train: train_qwen_2.5_1.5b
  # - train: train_mixtral_1.8b

experiment:
  exp_name: train_qwen_2.5_1.5b
  exp_dir: ./outputs # output path for logs and checkpoints
  task:
    type: train
    backend: megatron
    entrypoint: ./flagscale/train/train_aquila.py
  runner:
    backend: torchrun
    nnodes: 2
    nproc_per_node: 8
    hostfile: torchrun # Please replace with your actual hostfile path
  envs:
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NCCL_SOCKET_IFNAME: eth0
    NCCL_IB_DISABLE: 0
    NCCL_IB_CUDA_SUPPORT: 1
    NCCL_IB_GID_INDEX: 0
    NCCL_DEBUG: INFO
    OMP_NUM_THREADS: 4
    GLOO_SOCKET_IFNAME: eth0
    NCCL_IB_HCA: mlx5_2,mlx5_5
  cmds:
    before_start: "" # activate environment
    after_stop: ""

action: run
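As a quick orientation on the runner settings above (an illustrative sketch with values copied from the config, not part of the commit): the launch topology is 2 nodes with 8 processes each, so torchrun should end up with 16 ranks, and each node is expected to expose as many GPUs as nproc_per_node.

# Illustrative sanity check of the runner/envs values above
# (values assumed from the config; not part of the commit).
nnodes = 2
nproc_per_node = 8
cuda_visible_devices = "0,1,2,3,4,5,6,7"

world_size = nnodes * nproc_per_node                  # 16 ranks in total
gpus_per_node = len(cuda_visible_devices.split(","))  # 8 visible GPUs per node

assert gpus_per_node == nproc_per_node, "expected one process per visible GPU"
print(f"world size = {world_size}")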
@@ -0,0 +1,30 @@
defaults:
  - _self_
  - train: train_qwen_2.5_1.5b
  # - train: train_mixtral_1.8b

experiment:
  exp_name: train_qwen_2.5_1.5b
  exp_dir: ./outputs # output path for logs and checkpoints
  task:
    type: train
    backend: megatron
    entrypoint: ./flagscale/train/train_aquila.py
  runner:
    backend: torchrun
    nnodes: 2
    nproc_per_node: 8
    hostfile: torchrun # Please replace with your actual hostfile path
  envs:
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NCCL_SOCKET_IFNAME: eth0
    NCCL_IB_DISABLE: 0
    NCCL_IB_CUDA_SUPPORT: 1
    NCCL_IB_GID_INDEX: 0
    NCCL_DEBUG: INFO
    OMP_NUM_THREADS: 4
    GLOO_SOCKET_IFNAME: eth0
    NCCL_IB_HCA: mlx5_2,mlx5_5

action: run
@@ -0,0 +1,80 @@
system:
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  make_vocab_size_divisible_by: 128
  disable_bias_linear: True
  sequence_parallel: True
  use_flash_attn: True
  use_distributed_optimizer: True
  distributed_timeout_minutes: 60
  precision:
    bf16: True
    attention_softmax_in_fp32: True
    accumulate_allreduce_grads_in_fp32: True
  logging:
    log_interval: 1
    tensorboard_log_interval: 1
    wandb_project: "train-qwen2.5-1.5B"
    wandb_exp_name: "train-qwen2.5-1.5B"
  checkpoint:
    load: ${megatron_model__path:}
    # For training, comment out ckpt_format, ckpt_convert_format, and ckpt_convert_save; they are only used for checkpoint conversion.
    ckpt_format: torch_dist # ${experiment.ckpt_format}
    ckpt_convert_format: torch # ${experiment.ckpt_convert_format}
    ckpt_convert_save: ${experiment.ckpt_convert_save}
    save_interval: 5000000
    rampup_save_interval: 50000

model:
  use_mcore_models: true
  num_layers: 28
  hidden_size: 1536
  num_attention_heads: 12
  num_query_groups: 2
  group_query_attention: True
  ffn_hidden_size: 8960
  seq_length: 4096
  max_position_embeddings: 4096
  norm_epsilon: 1e-6
  norm_init_weight: 0.02
  use_rotary_position_embeddings: true
  rotary_base: 1000000.0
  no_position_embedding: true
  reset_position_ids: true
  add_qkv_bias: true
  reset_attention_mask: true
  swiglu: true
  normalization: RMSNorm
  untie_embeddings_and_output_weights: false
  init_method_std: 0.02
  attention_dropout: 0.0
  hidden_dropout: 0.0
  weight_decay: 0.0
  clip_grad: 1.0
  train_samples: 1478125
  eval_iters: 0
  eval_interval: 2000000
  micro_batch_size: 1
  global_batch_size: 512
  finetune: true
  transformer_impl: transformer_engine
  seed: 42
  # data_searching_range: [1156,1274]
  optimizer:
    weight_decay: 0.0
    adam_beta1: 0.9
    adam_beta2: 0.95
    lr_scheduler:
      lr: 1e-5
      min_lr: 0
      lr_warmup_samples: 21120
      lr_decay_style: cosine

data:
  data_path: ${data_path:??}
  split: 1
  apply_sft_dataset_separated_loss_mask_if_existed: true
  tokenizer:
    tokenizer_type: HFTokenizerFS
    tokenizer_path: ${HF_model_path:??}
    vocab_size: 151665
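For orientation, the sample counts above imply roughly the following number of optimizer steps. This is back-of-the-envelope arithmetic only and assumes the scheduler simply divides sample counts by the global batch size; Megatron's sample-based scheduler handles the exact rounding and any batch-size ramp-up.

# Rough training-schedule arithmetic from the hyperparameters above
# (illustrative only; not part of the commit).
import math

train_samples = 1_478_125
global_batch_size = 512
lr_warmup_samples = 21_120

total_steps = math.ceil(train_samples / global_batch_size)       # ~2,887 optimizer steps
warmup_steps = math.ceil(lr_warmup_samples / global_batch_size)  # ~42 warmup steps

print(f"total optimizer steps ~= {total_steps}, warmup steps ~= {warmup_steps}")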
Empty file.
@@ -0,0 +1,237 @@
"""GPT style dataset.""" | ||
|
||
import copy | ||
import hashlib | ||
import os | ||
import time | ||
|
||
import numpy as np | ||
import torch | ||
|
||
from megatron import print_rank_0 | ||
from megatron.core import mpu | ||
from megatron.data.data_samplers import RandomSeedDataset | ||
|
||
class ConversationDatasetCPT(torch.utils.data.Dataset): | ||
def __init__(self, conversations, tokenizer, maxlen, seed, num_samples, role_sep="\n\n"): | ||
super(ConversationDatasetCPT, self).__init__() | ||
self.conversations = conversations | ||
self.tokenizer = tokenizer | ||
self.maxlen = maxlen+1 | ||
self.seed = seed | ||
self.num_samples = num_samples | ||
|
||
## TODO convo template | ||
self.sep = role_sep | ||
|
||
# rng state | ||
np_rng = np.random.RandomState(seed=seed) | ||
np_rng.shuffle(self.conversations) | ||
|
||
def __getitem__(self, i): | ||
source = self.conversations[i] | ||
|
||
instruction = source['instruction'] | ||
conversations = source['conversations'] | ||
|
||
BOS_TOKEN = self.tokenizer.cls | ||
EOS_TOKEN = self.tokenizer.eod | ||
example = [BOS_TOKEN] | ||
|
||
# instruction | ||
instruction = self.tokenizer.tokenize(f"{instruction}") | ||
example += instruction | ||
|
||
labels = [-100] * len(example) | ||
|
||
for conversation in conversations: | ||
role = conversation['from'] | ||
content = conversation['value'] | ||
content += self.sep | ||
|
||
content = self.tokenizer.tokenize(f"{content}") | ||
|
||
example += content | ||
if role == 'gpt': | ||
role_labels = copy.deepcopy(content) | ||
else: | ||
# masking | ||
role_labels = [-100] * len(content) | ||
labels += role_labels | ||
|
||
example.append(EOS_TOKEN) | ||
labels.append(EOS_TOKEN) | ||
|
||
# maxlen | ||
example = example[:self.maxlen] | ||
labels = labels[:self.maxlen] | ||
|
||
# padding | ||
delta = self.maxlen - len(example) | ||
if delta > 0: | ||
example.extend([self.tokenizer.pad]*delta) | ||
labels.extend([-100]*delta) | ||
|
||
output = { | ||
"tokens": np.array(example, dtype=np.int64), | ||
"labels": np.array(labels, dtype=np.int64), | ||
} | ||
return output | ||
|
||
def __len__(self): | ||
return len(self.conversations) | ||
|
||
|
||
class ConversationDatasetV2(torch.utils.data.Dataset): | ||
def __init__(self, conversations, tokenizer, maxlen, seed, num_samples): | ||
super(ConversationDatasetV2, self).__init__() | ||
self.conversations = conversations | ||
self.tokenizer = tokenizer | ||
self.maxlen = maxlen+1 | ||
self.seed = seed | ||
self.num_samples = num_samples | ||
|
||
# rng state | ||
np_rng = np.random.RandomState(seed=seed) | ||
np_rng.shuffle(self.conversations) | ||
|
||
|
||
def __getitem__(self, i): | ||
from examples.aquila.utils.convo_prompt import _add_speaker_and_signal | ||
from examples.aquila.utils.convo_prompt import header | ||
|
||
#source = self.conversations[self.sample_idx[i]] | ||
source = self.conversations[i] | ||
_add_speaker_and_signal(source) | ||
|
||
source["chat_desc"] = header | ||
chat_desc = source['chat_desc'] | ||
instruction = source['instruction'] | ||
conversations = source['conversations'] | ||
|
||
BOS_TOKEN = self.tokenizer.cls | ||
EOS_TOKEN = self.tokenizer.eod | ||
example = [BOS_TOKEN] | ||
|
||
# chat_desc | ||
example += self.tokenizer.tokenize(f"{chat_desc}") | ||
|
||
# instruction | ||
instruction = self.tokenizer.tokenize(f"{instruction}") | ||
example += instruction | ||
|
||
labels = copy.deepcopy(example) | ||
# add zero-out | ||
#labels = [-100] * len(example) | ||
|
||
for conversation in conversations: | ||
role = conversation['from'] | ||
content = conversation['value'] | ||
content = self.tokenizer.tokenize(f"{content}") | ||
example += content | ||
if role == 'gpt': | ||
role_labels = copy.deepcopy(content) | ||
else: | ||
# masking | ||
role_labels = [-100] * len(content) | ||
labels += role_labels | ||
|
||
example.append(EOS_TOKEN) | ||
labels.append(EOS_TOKEN) | ||
|
||
# maxlen | ||
example = example[:self.maxlen] | ||
labels = labels[:self.maxlen] | ||
|
||
# padding | ||
delta = self.maxlen - len(example) | ||
if delta > 0: | ||
example.extend([self.tokenizer.pad]*delta) | ||
labels.extend([-100]*delta) | ||
|
||
output = { | ||
"tokens": np.array(example, dtype=np.int64), | ||
"labels": np.array(labels, dtype=np.int64), | ||
} | ||
return output | ||
|
||
def __len__(self): | ||
#return len(self.sample_idx) | ||
return len(self.conversations) | ||
|
||
|
||
def build_train_valid_test_datasets(train_valid_test_num_samples, | ||
seq_length, seed, tokenizer, | ||
train_data_prefix, | ||
valid_data_prefix, | ||
test_data_prefix=None, | ||
finetune_dataset_type=None): | ||
"""Build train, valid, and test datasets.""" | ||
suppored_dataset_types = dict(CPT=ConversationDatasetCPT) | ||
dataset_cls = ConversationDatasetV2 | ||
if finetune_dataset_type in suppored_dataset_types: | ||
dataset_cls = suppored_dataset_types[finetune_dataset_type] | ||
|
||
def read_file(jsonl_file): | ||
import jsonlines | ||
conversations = [] | ||
with jsonlines.open(jsonl_file) as reader: | ||
for line in reader: | ||
conversations.append(line) | ||
return conversations | ||
|
||
train_dataset, valid_dataset, test_dataset = None, None, None | ||
# Single dataset. | ||
if train_data_prefix is not None: | ||
train_conversations = read_file(train_data_prefix[0]) | ||
train_dataset = dataset_cls( | ||
train_conversations, | ||
tokenizer=tokenizer, | ||
maxlen=seq_length, | ||
seed=seed, | ||
num_samples=train_valid_test_num_samples[0]) | ||
train_dataset = RandomSeedDataset(train_dataset) | ||
|
||
if valid_data_prefix is not None: | ||
valid_conversations = read_file(valid_data_prefix[0]) | ||
valid_dataset = dataset_cls( | ||
valid_conversations, | ||
tokenizer=tokenizer, | ||
maxlen=seq_length, | ||
seed=seed, | ||
num_samples=train_valid_test_num_samples[1]) | ||
valid_dataset = RandomSeedDataset(valid_dataset) | ||
|
||
if test_data_prefix is not None: | ||
test_conversations = read_file(test_data_prefix[0]) | ||
test_dataset = dataset_cls( | ||
test_conversations, | ||
tokenizer=tokenizer, | ||
maxlen=seq_length, | ||
seed=seed, | ||
num_samples=train_valid_test_num_samples[2]) | ||
test_dataset = RandomSeedDataset(test_dataset) | ||
|
||
return (train_dataset, valid_dataset, test_dataset) | ||
|
||
if __name__ == "__main__": | ||
train_valid_test_num_samples = [12000,2000,0] | ||
seq_length = 2048 | ||
seed = 1234 | ||
from megatron.tokenizer.tokenizer import _AquilaTokenizer | ||
tokenizer = _AquilaTokenizer( | ||
'../examples/aquila/tokenizer/vocab.json', | ||
'../examples/aquila/tokenizer/merges.txt') | ||
print(f"{dir(tokenizer)}") | ||
train_data_prefix = ['path/to/train/set'] | ||
valid_data_prefix = ['path/to/valid/set'] | ||
train_dataset, valid_dataset, test_dataset = build_train_valid_test_datasets( | ||
train_valid_test_num_samples, | ||
seq_length, seed, tokenizer, | ||
train_data_prefix, | ||
valid_data_prefix, | ||
test_data_prefix=None) | ||
for idx, sample in enumerate(train_dataset): | ||
print(f"idx={idx} sample={type(sample['labels'])}") | ||
break | ||
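For reference, here is a minimal sketch of the JSONL record shape that read_file() and the two dataset classes above expect: each line is a JSON object with an "instruction" string and a "conversations" list of {"from", "value"} turns. Tokens from turns whose "from" field is not "gpt" are written into "labels" as -100, the ignore index used by PyTorch's cross-entropy loss. The record content and file name below are hypothetical.

# Illustrative example of one SFT record in the expected JSONL layout
# (hypothetical content and path; not part of the commit).
import jsonlines

record = {
    "instruction": "You are a helpful assistant.",
    "conversations": [
        {"from": "human", "value": "What is 2 + 2?"},
        {"from": "gpt", "value": "2 + 2 equals 4."},
    ],
}

with jsonlines.open("sft_train.jsonl", mode="w") as writer:
    writer.write(record)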