Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev #446

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open

Dev #446

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ jobs/logs/
*ipynb
.history/
*.json
*.sh
# *.sh
.ipynb_common
logs/
results/
Expand Down
199 changes: 199 additions & 0 deletions 10_vicuna7b_qformer_base_ft_mix6_blip2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
#!/bin/bash
#
# Launch script for the "10_vicuna7b_qformer_base_ft_mix6_blip2" training job.
# Steps performed by this file: enter the project directory, install the Python
# requirements, resolve MASTER_ADDR to an IP, write a per-rank YAML config under
# /tmp, and start train.py through torchrun.
#
# Environment read by this script (expected to be provided by the job
# launcher): RANK, MASTER_ADDR, MASTER_PORT, WORLD_SIZE.

# start: lizrun start -c "/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE/10_vicuna7b_qformer_base_ft_mix6_blip2.sh" -n 1 -j mix6-base-5e5-0319 -i reg-ai.chehejia.com/ssai/lizr/cu118/py310/pytorch:2.0.1-multinode-nccl -p sft
# PATH_ORI=${0%/*}
# PROJECT_PATH=$(echo ${PATH_ORI} | sed -r 's/\/{2,}/\//')
# echo "========"
# echo $PROJECT_PATH
PROJECT_PATH=/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE
# Stop right away if the project directory is unreachable: everything below
# (requirements.txt, train.py) is addressed relative to it.
cd "${PROJECT_PATH}" || {
  echo "ERROR: cannot cd to ${PROJECT_PATH}" >&2
  exit 1
}
# pip install -e .

### RDMA Config ###
# export NCCL_IB_HCA=^mlx5_0,mlx5_1,^mlx5_2,mlx5_3,^mlx5_4,mlx5_5,^mlx5_6,mlx5_7,^mlx5_8
export NCCL_IB_GID_INDEX=3
### RDMA Config ###

pip install -r requirements.txt

# DNS to IP
# Resolve MASTER_ADDR to an address and store it in MASTER_IP.
#   - rank 0 parses the address out of ping's first output line and retries
#     until that yields a non-empty value;
#   - every other rank asks getent, retrying for a bounded time and falling
#     back to the unresolved name so MASTER_IP is never left empty.
# sleep 30 # waiting for system init

# Fail fast instead of looping forever (rank 0) or resolving nothing (others).
: "${MASTER_ADDR:?MASTER_ADDR must be set}"

MASTER_IP=""
if [[ "${RANK}" == "0" ]]; then
  while [[ -z "${MASTER_IP}" ]]; do
    # First line of ping looks like "PING host (a.b.c.d) ..."; the sed program
    # keeps only the text between the first pair of parentheses.
    MASTER_IP=$(ping -c 3 "${MASTER_ADDR}" | sed '1{s/[^(]*(//;s/).*//;q}')
    # MASTER_IP=127.0.0.1
    sleep 1
  done
else
  ## Convert DNS to IP for torch
  resolve_attempts=0
  while [[ -z "${MASTER_IP}" ]] && ((resolve_attempts < 60)); do
    # Only the first record is used; getent may print several lines.
    MASTER_IP=$(getent hosts "${MASTER_ADDR}" | awk '{print $1; exit}') # Ethernet
    if [[ -z "${MASTER_IP}" ]]; then
      resolve_attempts=$((resolve_attempts + 1))
      sleep 1
    fi
  done
  if [[ -z "${MASTER_IP}" ]]; then
    echo "WARN: could not resolve ${MASTER_ADDR}; using it unresolved" >&2
    MASTER_IP=${MASTER_ADDR}
  fi
fi

# training configuration
# One config file per rank; its path is handed to train.py via --cfg-path.
CONFIG_FILE=/tmp/blip2_config_${RANK}.yaml
# WORLD_SIZE=`expr ${WORLD_SIZE} \* 8`
DIST_URL="env://${MASTER_IP}:${MASTER_PORT}"
# Generate the config.
# The heredoc delimiter is unquoted on purpose: ${WORLD_SIZE} and ${DIST_URL}
# are expanded by the shell while the file is written. The leading spaces
# inside the heredoc are significant: they carry the YAML nesting of the
# model / datasets / run sections.
cat <<EOT > "${CONFIG_FILE}"
model:
  arch: blip2_vicuna_instruct
  model_type: vicuna7b_pretrain
  load_pretrained: True
  load_finetuned: False
  vit_model: eva_clip_g
  pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
  # finetuned: ""
  q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"

  # Q-Former
  num_query_token: 32
  qformer_text_input: True

  # vicuna7b
  llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
  prompt: ""
  max_txt_len: 256
  max_output_txt_len: 256

  # freeze
  freeze_vit: True
  freeze_llm: True
  freeze_qformer: False
  freeze_t5_proj: False

  # moe
  general_version: 'base'

datasets:
  gqa: # train: 943000, 12578, 12578)
    type: balanced_sft_raw
    batch_size: 16
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 224
      eval:
        name: "blip2_image_eval"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"
    sample_ratio: 10

  ok_vqa: # train, valid (9009, 5046)
    batch_size: 16
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 224
      eval:
        name: "blip2_image_eval"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"
    sample_ratio: 1

  coco_vqa: # 658104
    batch_size: 16
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 224
      eval:
        name: "blip2_image_eval"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"
    sample_ratio: 9

  coco_caption: # 414113 train
    batch_size: 16
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 7

  aok_vqa: # train: 17056, val: 1145
    batch_size: 16
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 224
      eval:
        name: "blip2_image_eval"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
      eval:
        name: "blip_caption"
    sample_ratio: 2

  textcaps_caption: # train: 109765, val: 15830
    batch_size: 16
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 4

run:
  task: instruction_tuning
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 5e-5
  min_lr: 1e-6
  warmup_lr: 1e-6
  log_freq: 5
  save_freq: 1500

  weight_decay: 0.05
  max_epoch: 10
  num_workers: 4
  warmup_steps: 600
  iters_per_epoch: 5000

  seed: 42
  output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_uni_route/mix_coco_gqa_ao_cocap_tcap_raw_Qformer_base_lr5e5_10epo_0319/"

  amp: True
  resume_ckpt_path: null

  evaluate: False
  train_splits: ["train"]
  valid_splits: ["val"]

  wandb_log: False

  device: "cuda"
  world_size: ${WORLD_SIZE}
  dist_url: ${DIST_URL}
  distributed: True
EOT


# Start training: WORLD_SIZE nodes, 8 worker processes on each, rendezvous via
# the c10d backend at MASTER_IP. Expansions are quoted so an unexpected value
# cannot be word-split into extra torchrun arguments.
# NOTE(review): the endpoint carries no port, so MASTER_PORT is not passed to
# torchrun here; presumably torchrun's default rendezvous port is intended.
# Confirm.
torchrun \
  --nnodes="${WORLD_SIZE}" \
  --nproc_per_node=8 \
  --rdzv_id=100 \
  --rdzv_backend=c10d \
  --rdzv_endpoint="${MASTER_IP}" \
  train.py \
  --cfg-path "${CONFIG_FILE}"
Loading