fix llava bench (#819)

PaddlePaddle · Nov 21, 2024 · 7fa1f2e
1 parent cad8092
commit 7fa1f2e
Show file tree

Hide file tree

Showing 13 changed files with 164 additions and 131 deletions.
diff --git a/build_paddle_env.sh b/build_paddle_env.sh
@@ -57,23 +57,23 @@ if command -v nvcc > /dev/null 2>&1; then
     case $cuda_version in
         "11.2")
             echo "安装CUDA 11.2版本的paddlepaddle..."
-            $PYTHON_CMD -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu112/
+            $PYTHON_CMD -m pip install paddlepaddle-gpu==3.0.0b2 -i https://www.paddlepaddle.org.cn/packages/stable/cu112/
             ;;
         "11.6")
             echo "安装CUDA 11.6版本的paddlepaddle..."
-            $PYTHON_CMD -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu116/
+            $PYTHON_CMD -m pip install paddlepaddle-gpu==3.0.0b2 -i https://www.paddlepaddle.org.cn/packages/stable/cu116/
             ;;
         "11.7")
             echo "安装CUDA 11.7版本的paddlepaddle..."
-            $PYTHON_CMD -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu117/
+            $PYTHON_CMD -m pip install paddlepaddle-gpu==3.0.0b2 -i https://www.paddlepaddle.org.cn/packages/stable/cu117/
             ;;
         "11.8")
             echo "安装CUDA 11.8版本的paddlepaddle..."
-            $PYTHON_CMD -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
+            $PYTHON_CMD -m pip install paddlepaddle-gpu==3.0.0b2 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
             ;;
         "12.3")
             echo "安装CUDA 12.3版本的paddlepaddle..."
-            $PYTHON_CMD -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
+            $PYTHON_CMD -m pip install paddlepaddle-gpu==3.0.0b2 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
             ;;
         *)
             echo "警告: 不支持的CUDA版本 ($cuda_version)"
@@ -83,14 +83,14 @@ if command -v nvcc > /dev/null 2>&1; then
     esac
 else
     echo "未检测到CUDA。安装CPU版本的paddlepaddle..."
-    $PYTHON_CMD -m pip install paddlepaddle==3.0.0b1
+    $PYTHON_CMD -m pip install paddlepaddle==3.0.0b2
 fi
 
 # 验证安装
-echo "验证PaddlePaddle 3.0.0b1安装..."
+echo "验证PaddlePaddle 3.0.0b2安装..."
 if $PYTHON_CMD -c "import paddle; paddle.utils.run_check()"; then
-    echo "PaddlePaddle 3.0.0b1安装成功！"
+    echo "PaddlePaddle 3.0.0b2安装成功！"
 else
-    echo "PaddlePaddle 3.0.0b1安装验证失败，请检查安装日志"
+    echo "PaddlePaddle 3.0.0b2安装验证失败，请检查安装日志"
     exit 1
 fi
diff --git a/paddlemix/examples/llava/pretrain.py b/paddlemix/examples/llava/pretrain.py
@@ -156,11 +156,11 @@ def main():
         if training_args.benchmark:
             total_effective_samples = total_samples * training_args.num_train_epochs
             effective_samples_per_second = total_effective_samples / train_result.metrics["train_runtime"]
-            mem_gpu = (
-                train_result.metrics["train_mem_gpu_peaked_delta"] + train_result.metrics["train_mem_gpu_alloc_delta"]
-            )
+            # mem_gpu = (
+            #     train_result.metrics["train_mem_gpu_peaked_delta"] + train_result.metrics["train_mem_gpu_alloc_delta"]
+            # )
             logger.info(f"Effective_samples_per_second: {effective_samples_per_second} ")
-            logger.info(f"train_mem_gpu_peaked: {int(mem_gpu/ (2**20))} MB")
+            # logger.info(f"train_mem_gpu_peaked: {int(mem_gpu/ (2**20))} MB")
             logger.info("Benchmark done.")
         else:
             trainer.save_model(merge_tensor_parallel=training_args.tensor_parallel_degree > 1)

diff --git a/paddlemix/tools/supervised_finetune.py b/paddlemix/tools/supervised_finetune.py
@@ -182,11 +182,11 @@ def main():
         if training_args.benchmark:
             total_effective_samples = total_samples * training_args.num_train_epochs
             effective_samples_per_second = total_effective_samples / train_result.metrics["train_runtime"]
-            mem_gpu = (
-                train_result.metrics["train_mem_gpu_peaked_delta"] + train_result.metrics["train_mem_gpu_alloc_delta"]
-            )
+            # mem_gpu = (
+            #     train_result.metrics["train_mem_gpu_peaked_delta"] + train_result.metrics["train_mem_gpu_alloc_delta"]
+            # )
             logger.info(f"Effective_samples_per_second: {effective_samples_per_second} ")
-            logger.info(f"train_mem_gpu_peaked: {int(mem_gpu/ (2**20))} MB")
+            # logger.info(f"train_mem_gpu_peaked: {int(mem_gpu/ (2**20))} MB")
             logger.info("Benchmark done.")
         else:
             trainer.save_model(merge_tensor_parallel=training_args.tensor_parallel_degree > 1)

diff --git a/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-7b_sft_bs16_bf16_DP.sh b/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-13b-lora_sft_bs16_bf16_DP.sh
@@ -13,16 +13,15 @@
 # limitations under the License.
 
 model=llava
-model_item=llava-v1.6-vicuna-7b
+model_item=llava-v1.6-vicuna-13b-lora_sft
 bs_item=16
 fp_item=bf16
 run_mode=DP
 device_num=N1C8
 max_epochs=3
 num_workers=0
-train_stage=sft
 
 # get data
-bash tests/test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh
+bash ./test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh
 # run
-bash tests/test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} ${train_stage} 2>&1;
+bash ./test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
diff --git a/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-13b_pretrain_bs16_bf16_DP.sh b/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-13b-pretrain_bs16_bf16_DP.sh
@@ -13,16 +13,15 @@
 # limitations under the License.
 
 model=llava
-model_item=vicuna-13b-v1.5
+model_item=llava-v1.6-vicuna-13b-pretrain
 bs_item=16
 fp_item=bf16
 run_mode=DP
 device_num=N1C8
 max_epochs=3
 num_workers=0
-train_stage=pretrain
 
 # get data
-bash tests/test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh
+bash ./test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh
 # run
-bash tests/test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} ${train_stage} 2>&1;
+bash ./test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
diff --git a/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-13b_sft_bs16_bf16_DP.sh b/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-13b-sft_bs16_bf16_DP.sh
@@ -13,16 +13,15 @@
 # limitations under the License.
 
 model=llava
-model_item=llava-v1.6-vicuna-13b
+model_item=llava-v1.6-vicuna-13b-sft
 bs_item=16
 fp_item=bf16
 run_mode=DP
 device_num=N1C8
 max_epochs=3
 num_workers=0
-train_stage=sft
 
 # get data
-bash tests/test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh
+bash ./test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh
 # run
-bash tests/test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} ${train_stage} 2>&1;
+bash ./test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
diff --git a/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-13b_lora_sft_bs16_bf16_DP.sh b/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-13b_lora_sft_bs16_bf16_DP.sh
diff --git a/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-7b_lora_sft_bs16_bf16_DP.sh b/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-7b-lora_sft_bs16_bf16_DP.sh
@@ -13,16 +13,15 @@
 # limitations under the License.
 
 model=llava
-model_item=llava-v1.6-vicuna-7b
+model_item=llava-v1.6-vicuna-7b-lora_sft
 bs_item=16
 fp_item=bf16
 run_mode=DP
 device_num=N1C8
 max_epochs=3
 num_workers=0
-train_stage=lora_sft
 
 # get data
-bash tests/test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh
+bash ./test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh
 # run
-bash tests/test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} ${train_stage} 2>&1;
+bash ./test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
diff --git a/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-7b-pretrain_bs16_bf16_DP.sh b/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-7b-pretrain_bs16_bf16_DP.sh
@@ -0,0 +1,27 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+model=llava
+model_item=llava-v1.6-vicuna-7b-pretrain
+bs_item=16
+fp_item=bf16
+run_mode=DP
+device_num=N1C8
+max_epochs=3
+num_workers=0
+
+# get data
+bash ./test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
diff --git a/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-7b-sft_bs16_bf16_DP.sh b/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-7b-sft_bs16_bf16_DP.sh
@@ -0,0 +1,27 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+model=llava
+model_item=llava-v1.6-vicuna-7b-sft
+bs_item=16
+fp_item=bf16
+run_mode=DP
+device_num=N1C8
+max_epochs=3
+num_workers=0
+
+# get data
+bash ./test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
diff --git a/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-7b_pretrain_bs16_bf16_DP.sh b/tests/test_tipc/dygraph/dp/llava/N1C8/llava-v1.6-vicuna-7b_pretrain_bs16_bf16_DP.sh
diff --git a/tests/test_tipc/dygraph/dp/llava/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/dp/llava/benchmark_common/prepare.sh
@@ -19,7 +19,7 @@ wget https://paddlenlp.bj.bcebos.com/models/community/paddlemix/benchmark/llava_
 tar -xf llava_bench_data.tar
 mv llava_bench_data /root/.paddlemix/datasets/
 rm -rf llava_bench_data.tar
-ln -s /root/.paddlemix/datasets/llava_bench_data ../
+ln -s /root/.paddlemix/datasets/llava_bench_data ./
 
 export http_proxy=agent.baidu.com:8188
 export https_proxy=agent.baidu.com:8188
@@ -32,7 +32,8 @@ python -m pip install -e ../
 python -m pip install --upgrade paddlenlp pybind11 regex sentencepiece tqdm visualdl attrdict easydict pyyaml -i https://mirror.baidu.com/pypi/simple
 pip install -r ../paddlemix/appflow/requirements.txt
 pip install -U ppdiffusers
-python -m pip install https://paddle-wheel.bj.bcebos.com/develop/linux/linux-gpu-cuda11.8-cudnn8.6-mkl-gcc8.2-avx/paddlepaddle_gpu-0.0.0.post118-cp310-cp310-linux_x86_64.whl
+bash ../build_paddle_env.sh
+# python -m pip install https://paddle-wheel.bj.bcebos.com/develop/linux/linux-gpu-cuda11.8-cudnn8.6-mkl-gcc8.2-avx/paddlepaddle_gpu-0.0.0.post118-cp310-cp310-linux_x86_64.whl
 python -m pip install paddlenlp==3.0.0b2
 python -m pip install huggingface_hub==0.23.0
 python -m pip list