Commit

Merge branch 'main' into main

cyber-pioneer authored Dec 3, 2024
2 parents f21801d + 39d1775 commit ebcc9de
Showing 11 changed files with 1,068 additions and 2 deletions.
Empty file added examples/qwen/__init__.py
Empty file.
33 changes: 33 additions & 0 deletions examples/qwen/conf/config.yaml
@@ -0,0 +1,33 @@
defaults:
  - _self_
  - train: train_qwen_2.5_1.5b
  # - train: train_mixtral_1.8b

experiment:
  exp_name: train_qwen_2.5_1.5b
  exp_dir: ./outputs  # output path for logs and checkpoints
  task:
    type: train
    backend: megatron
    entrypoint: ./flagscale/train/train_aquila.py
  runner:
    backend: torchrun
    nnodes: 2
    nproc_per_node: 8
    hostfile: torchrun  # Please replace with your actual hostfile path
  envs:
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NCCL_SOCKET_IFNAME: eth0
    NCCL_IB_DISABLE: 0
    NCCL_IB_CUDA_SUPPORT: 1
    NCCL_IB_GID_INDEX: 0
    NCCL_DEBUG: INFO
    OMP_NUM_THREADS: 4
    GLOO_SOCKET_IFNAME: eth0
    NCCL_IB_HCA: mlx5_2,mlx5_5
  cmds:
    before_start: ""  # activate environment
    after_stop: ""

action: run
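This top-level config is composed by Hydra: the defaults list pulls in the train/train_qwen_2.5_1.5b entry, and the experiment block tells the FlagScale runner how to launch (torchrun across 2 nodes x 8 GPUs, exporting the listed NCCL/GLOO environment variables on each rank). It is normally consumed by the FlagScale launcher (something like python run.py --config-path examples/qwen/conf --config-name config action=run, though the exact invocation depends on the FlagScale version). A minimal sketch, assuming omegaconf is installed and the repository root is the working directory, for inspecting the nesting before launching:

# Sketch only: load this YAML with OmegaConf and print a few fields.
# The file path is taken from the diff above; nothing here is FlagScale API.
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/qwen/conf/config.yaml")
print(cfg.experiment.exp_name)                 # train_qwen_2.5_1.5b
print(cfg.experiment.runner.nnodes)            # 2
print(cfg.experiment.envs.NCCL_SOCKET_IFNAME)  # eth0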
30 changes: 30 additions & 0 deletions examples/qwen/conf/config_qwen2.5_1.5b.yaml
@@ -0,0 +1,30 @@
defaults:
  - _self_
  - train: train_qwen_2.5_1.5b
  # - train: train_mixtral_1.8b

experiment:
  exp_name: train_qwen_2.5_1.5b
  exp_dir: ./outputs  # output path for logs and checkpoints
  task:
    type: train
    backend: megatron
    entrypoint: ./flagscale/train/train_aquila.py
  runner:
    backend: torchrun
    nnodes: 2
    nproc_per_node: 8
    hostfile: torchrun  # Please replace with your actual hostfile path
  envs:
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NCCL_SOCKET_IFNAME: eth0
    NCCL_IB_DISABLE: 0
    NCCL_IB_CUDA_SUPPORT: 1
    NCCL_IB_GID_INDEX: 0
    NCCL_DEBUG: INFO
    OMP_NUM_THREADS: 4
    GLOO_SOCKET_IFNAME: eth0
    NCCL_IB_HCA: mlx5_2,mlx5_5

action: run
80 changes: 80 additions & 0 deletions examples/qwen/conf/train/qwen_2.5_1.5b.yaml
@@ -0,0 +1,80 @@
system:
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  make_vocab_size_divisible_by: 128
  disable_bias_linear: True
  sequence_parallel: True
  use_flash_attn: True
  use_distributed_optimizer: True
  distributed_timeout_minutes: 60
  precision:
    bf16: True
    attention_softmax_in_fp32: True
    accumulate_allreduce_grads_in_fp32: True
  logging:
    log_interval: 1
    tensorboard_log_interval: 1
    wandb_project: "train-qwen2.5-1.5B"
    wandb_exp_name: "train-qwen2.5-1.5B"
  checkpoint:
    load: ${megatron_model__path:}
    # To train the model, comment out ckpt_format, ckpt_convert_format, and ckpt_convert_save; they are only used for checkpoint conversion.
    ckpt_format: torch_dist # ${experiment.ckpt_format}
    ckpt_convert_format: torch # ${experiment.ckpt_convert_format}
    ckpt_convert_save: ${experiment.ckpt_convert_save}
    save_interval: 5000000
    rampup_save_interval: 50000

model:
  use_mcore_models: true
  num_layers: 28
  hidden_size: 1536
  num_attention_heads: 12
  num_query_groups: 2
  group_query_attention: True
  ffn_hidden_size: 8960
  seq_length: 4096
  max_position_embeddings: 4096
  norm_epsilon: 1e-6
  norm_init_weight: 0.02
  use_rotary_position_embeddings: true
  rotary_base: 1000000.0
  no_position_embedding: true
  reset_position_ids: true
  add_qkv_bias: true
  reset_attention_mask: true
  swiglu: true
  normalization: RMSNorm
  untie_embeddings_and_output_weights: false
  init_method_std: 0.02
  attention_dropout: 0.0
  hidden_dropout: 0.0
  weight_decay: 0.0
  clip_grad: 1.0
  train_samples: 1478125
  eval_iters: 0
  eval_interval: 2000000
  micro_batch_size: 1
  global_batch_size: 512
  finetune: true
  transformer_impl: transformer_engine
  seed: 42
  # data_searching_range: [1156,1274]
  optimizer:
    weight_decay: 0.0
    adam_beta1: 0.9
    adam_beta2: 0.95
    lr_scheduler:
      lr: 1e-5
      min_lr: 0
      lr_warmup_samples: 21120
      lr_decay_style: cosine

data:
  data_path: ${data_path:??}
  split: 1
  apply_sft_dataset_separated_loss_mask_if_existed: true
  tokenizer:
    tokenizer_type: HFTokenizerFS
    tokenizer_path: ${HF_model_path:??}
    vocab_size: 151665
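The model block matches the published Qwen2.5-1.5B geometry: 28 layers, hidden size 1536, 12 attention heads with 2 KV groups (GQA), a SwiGLU MLP of width 8960, RoPE with base 1e6, and a 151665-token vocabulary with tied embeddings. A quick back-of-the-envelope check using only the numbers in this YAML (a sketch; it assumes three SwiGLU matrices per MLP, RMSNorm weights only, and no extra vocabulary padding):

# Rough parameter count from the values in the YAML above (a sketch, not FlagScale code).
hidden, layers, heads, kv_groups = 1536, 28, 12, 2
ffn, vocab = 8960, 151665
head_dim = hidden // heads                      # 128
kv_dim = kv_groups * head_dim                   # 256 per K and per V projection

attn = hidden * (hidden + 2 * kv_dim)           # QKV weights
attn += hidden + 2 * kv_dim                     # QKV bias (add_qkv_bias: true)
attn += hidden * hidden                         # output projection (no bias)
mlp = 3 * hidden * ffn                          # gate, up, down (swiglu: true)
norms = 2 * hidden                              # two RMSNorms per layer

per_layer = attn + mlp + norms
total = layers * per_layer + vocab * hidden + hidden  # + tied embedding + final norm
print(f"{total / 1e9:.2f}B parameters")         # ~1.54B, consistent with Qwen2.5-1.5B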
Empty file added examples/qwen/utils/__init__.py
Empty file.
237 changes: 237 additions & 0 deletions examples/qwen/utils/convo_dataset.py
@@ -0,0 +1,237 @@
"""GPT style dataset."""

import copy
import hashlib
import os
import time

import numpy as np
import torch

from megatron import print_rank_0
from megatron.core import mpu
from megatron.data.data_samplers import RandomSeedDataset

class ConversationDatasetCPT(torch.utils.data.Dataset):
    """CPT-style SFT dataset: the instruction and all non-gpt turns are masked
    with -100, so the loss only covers assistant tokens."""

    def __init__(self, conversations, tokenizer, maxlen, seed, num_samples, role_sep="\n\n"):
        super(ConversationDatasetCPT, self).__init__()
        self.conversations = conversations
        self.tokenizer = tokenizer
        self.maxlen = maxlen + 1
        self.seed = seed
        self.num_samples = num_samples

        ## TODO convo template
        self.sep = role_sep

        # rng state
        np_rng = np.random.RandomState(seed=seed)
        np_rng.shuffle(self.conversations)

    def __getitem__(self, i):
        source = self.conversations[i]

        instruction = source['instruction']
        conversations = source['conversations']

        BOS_TOKEN = self.tokenizer.cls
        EOS_TOKEN = self.tokenizer.eod
        example = [BOS_TOKEN]

        # instruction
        instruction = self.tokenizer.tokenize(f"{instruction}")
        example += instruction

        labels = [-100] * len(example)

        for conversation in conversations:
            role = conversation['from']
            content = conversation['value']
            content += self.sep

            content = self.tokenizer.tokenize(f"{content}")

            example += content
            if role == 'gpt':
                role_labels = copy.deepcopy(content)
            else:
                # masking
                role_labels = [-100] * len(content)
            labels += role_labels

        example.append(EOS_TOKEN)
        labels.append(EOS_TOKEN)

        # maxlen
        example = example[:self.maxlen]
        labels = labels[:self.maxlen]

        # padding
        delta = self.maxlen - len(example)
        if delta > 0:
            example.extend([self.tokenizer.pad] * delta)
            labels.extend([-100] * delta)

        output = {
            "tokens": np.array(example, dtype=np.int64),
            "labels": np.array(labels, dtype=np.int64),
        }
        return output

    def __len__(self):
        return len(self.conversations)


class ConversationDatasetV2(torch.utils.data.Dataset):
    """Conversation dataset that prepends the Aquila chat header; only non-gpt
    turns are masked with -100, the header and instruction remain targets."""

    def __init__(self, conversations, tokenizer, maxlen, seed, num_samples):
        super(ConversationDatasetV2, self).__init__()
        self.conversations = conversations
        self.tokenizer = tokenizer
        self.maxlen = maxlen + 1
        self.seed = seed
        self.num_samples = num_samples

        # rng state
        np_rng = np.random.RandomState(seed=seed)
        np_rng.shuffle(self.conversations)

    def __getitem__(self, i):
        from examples.aquila.utils.convo_prompt import _add_speaker_and_signal
        from examples.aquila.utils.convo_prompt import header

        # source = self.conversations[self.sample_idx[i]]
        source = self.conversations[i]
        _add_speaker_and_signal(source)

        source["chat_desc"] = header
        chat_desc = source['chat_desc']
        instruction = source['instruction']
        conversations = source['conversations']

        BOS_TOKEN = self.tokenizer.cls
        EOS_TOKEN = self.tokenizer.eod
        example = [BOS_TOKEN]

        # chat_desc
        example += self.tokenizer.tokenize(f"{chat_desc}")

        # instruction
        instruction = self.tokenizer.tokenize(f"{instruction}")
        example += instruction

        labels = copy.deepcopy(example)
        # add zero-out
        # labels = [-100] * len(example)

        for conversation in conversations:
            role = conversation['from']
            content = conversation['value']
            content = self.tokenizer.tokenize(f"{content}")
            example += content
            if role == 'gpt':
                role_labels = copy.deepcopy(content)
            else:
                # masking
                role_labels = [-100] * len(content)
            labels += role_labels

        example.append(EOS_TOKEN)
        labels.append(EOS_TOKEN)

        # maxlen
        example = example[:self.maxlen]
        labels = labels[:self.maxlen]

        # padding
        delta = self.maxlen - len(example)
        if delta > 0:
            example.extend([self.tokenizer.pad] * delta)
            labels.extend([-100] * delta)

        output = {
            "tokens": np.array(example, dtype=np.int64),
            "labels": np.array(labels, dtype=np.int64),
        }
        return output

    def __len__(self):
        # return len(self.sample_idx)
        return len(self.conversations)


def build_train_valid_test_datasets(train_valid_test_num_samples,
                                    seq_length, seed, tokenizer,
                                    train_data_prefix,
                                    valid_data_prefix,
                                    test_data_prefix=None,
                                    finetune_dataset_type=None):
    """Build train, valid, and test datasets."""
    supported_dataset_types = dict(CPT=ConversationDatasetCPT)
    dataset_cls = ConversationDatasetV2
    if finetune_dataset_type in supported_dataset_types:
        dataset_cls = supported_dataset_types[finetune_dataset_type]

    def read_file(jsonl_file):
        import jsonlines
        conversations = []
        with jsonlines.open(jsonl_file) as reader:
            for line in reader:
                conversations.append(line)
        return conversations

    train_dataset, valid_dataset, test_dataset = None, None, None
    # Single dataset.
    if train_data_prefix is not None:
        train_conversations = read_file(train_data_prefix[0])
        train_dataset = dataset_cls(
            train_conversations,
            tokenizer=tokenizer,
            maxlen=seq_length,
            seed=seed,
            num_samples=train_valid_test_num_samples[0])
        train_dataset = RandomSeedDataset(train_dataset)

    if valid_data_prefix is not None:
        valid_conversations = read_file(valid_data_prefix[0])
        valid_dataset = dataset_cls(
            valid_conversations,
            tokenizer=tokenizer,
            maxlen=seq_length,
            seed=seed,
            num_samples=train_valid_test_num_samples[1])
        valid_dataset = RandomSeedDataset(valid_dataset)

    if test_data_prefix is not None:
        test_conversations = read_file(test_data_prefix[0])
        test_dataset = dataset_cls(
            test_conversations,
            tokenizer=tokenizer,
            maxlen=seq_length,
            seed=seed,
            num_samples=train_valid_test_num_samples[2])
        test_dataset = RandomSeedDataset(test_dataset)

    return (train_dataset, valid_dataset, test_dataset)

if __name__ == "__main__":
    train_valid_test_num_samples = [12000, 2000, 0]
    seq_length = 2048
    seed = 1234
    from megatron.tokenizer.tokenizer import _AquilaTokenizer
    tokenizer = _AquilaTokenizer(
        '../examples/aquila/tokenizer/vocab.json',
        '../examples/aquila/tokenizer/merges.txt')
    print(f"{dir(tokenizer)}")
    train_data_prefix = ['path/to/train/set']
    valid_data_prefix = ['path/to/valid/set']
    train_dataset, valid_dataset, test_dataset = build_train_valid_test_datasets(
        train_valid_test_num_samples,
        seq_length, seed, tokenizer,
        train_data_prefix,
        valid_data_prefix,
        test_data_prefix=None)
    for idx, sample in enumerate(train_dataset):
        print(f"idx={idx} sample={type(sample['labels'])}")
        break

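Both dataset classes build one flat token sequence per conversation and mask everything that is not a gpt turn with -100, so the loss covers assistant tokens only; ConversationDatasetCPT also masks the instruction, while ConversationDatasetV2 keeps the chat header and instruction as targets. A standalone sketch of the CPT-style masking with a toy whitespace tokenizer, so the expected JSONL record layout and the output arrays are easy to see; the tokenizer and sample are invented for illustration, and only the masking logic mirrors the code above:

# Standalone sketch of the masking scheme in ConversationDatasetCPT.
import numpy as np


class ToyTokenizer:
    cls, eod, pad = 1, 2, 0

    def tokenize(self, text):
        # Map each whitespace-separated word to a fake id >= 10.
        return [10 + (hash(w) % 1000) for w in text.split()]


def cpt_style_sample(source, tokenizer, maxlen=16, sep="\n\n"):
    example = [tokenizer.cls] + tokenizer.tokenize(source["instruction"])
    labels = [-100] * len(example)                    # instruction is never a target
    for turn in source["conversations"]:
        content = tokenizer.tokenize(turn["value"] + sep)
        example += content
        labels += content if turn["from"] == "gpt" else [-100] * len(content)
    example.append(tokenizer.eod)
    labels.append(tokenizer.eod)
    example, labels = example[:maxlen], labels[:maxlen]
    pad_len = maxlen - len(example)
    example += [tokenizer.pad] * pad_len
    labels += [-100] * pad_len
    return {"tokens": np.array(example, dtype=np.int64),
            "labels": np.array(labels, dtype=np.int64)}


sample = {
    "instruction": "You are a helpful assistant.",
    "conversations": [
        {"from": "human", "value": "What is 2 + 2 ?"},
        {"from": "gpt", "value": "2 + 2 = 4"},
    ],
}
out = cpt_style_sample(sample, ToyTokenizer())
print(out["tokens"].shape, out["labels"].shape)   # (16,) (16,) — only gpt tokens unmasked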