diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index d6e41be9f..6768c3d57 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -27,7 +27,10 @@ jobs:
     steps:
       - name: Checkout Code
         uses: actions/checkout@v2
-
+
+      - name: Check GPU is Free
+        run: tests/scripts/gpu_check.sh
+
       - name: Megatron Unit Test
         run: tests/scripts/unit_test_megatron.sh ${{github.sha}}
 
@@ -51,6 +54,9 @@ jobs:
       - name: Checkout Code
         uses: actions/checkout@v2
 
+      - name: Check GPU is Free
+        run: tests/scripts/gpu_check.sh
+
       - name: Flagscale Unit Test
         run: tests/scripts/unit_test_flagscale.sh ${{github.sha}}
 
@@ -74,5 +80,8 @@ jobs:
       - name: Checkout Code
         uses: actions/checkout@v2
 
+      - name: Check GPU is Free
+        run: tests/scripts/gpu_check.sh
+
       - name: Flagscale Functional Test
         run: tests/scripts/functional_test_flagscale.sh
\ No newline at end of file
diff --git a/tests/scripts/gpu_check.sh b/tests/scripts/gpu_check.sh
new file mode 100755
--- /dev/null
+++ b/tests/scripts/gpu_check.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+#
+# Wait until every GPU on this host has enough free memory for the tests.
+#
+# Free memory per GPU is memory.total - memory.used, in the MiB units that
+# nvidia-smi prints with --format=...,nounits. The script polls until all
+# GPUs reach the threshold, then exits 0.
+#
+# Optional environment:
+#   GPU_CHECK_MIN_FREE_MB       free memory required per GPU (default 30000)
+#   GPU_CHECK_INTERVAL_SECONDS  pause between polls (default 300)
+#   GPU_CHECK_TIMEOUT_SECONDS   give up after this long; 0 = wait forever
+#                               (default 0)
+#
+# Exit status: 0 once all GPUs are free; 1 on a failed or unparsable
+# nvidia-smi query, an invalid setting, or a timeout.
+
+set -u
+
+readonly MIN_FREE_MB="${GPU_CHECK_MIN_FREE_MB:-30000}"
+readonly INTERVAL_SECONDS="${GPU_CHECK_INTERVAL_SECONDS:-300}"
+readonly TIMEOUT_SECONDS="${GPU_CHECK_TIMEOUT_SECONDS:-0}"
+
+# Print a message to stderr and abort the check.
+die() {
+  printf 'gpu_check: %s\n' "$*" >&2
+  exit 1
+}
+
+# Return 0 if every GPU has at least MIN_FREE_MB free, 1 otherwise.
+all_gpus_free() {
+  local report used total
+
+  # One query for both fields, so used and total come from the same sample.
+  report=$(nvidia-smi --query-gpu=memory.used,memory.total \
+    --format=csv,noheader,nounits) || die "nvidia-smi query failed"
+  # An empty report must not be mistaken for "all GPUs are free".
+  [[ -n "$report" ]] || die "nvidia-smi reported no GPUs"
+
+  # Each row looks like "1234, 81920"; split on the comma and the space.
+  while IFS=', ' read -r used total; do
+    if ! [[ "$used" =~ ^[0-9]+$ && "$total" =~ ^[0-9]+$ ]]; then
+      die "unexpected nvidia-smi output: '${used}, ${total}'"
+    fi
+    if (( total - used < MIN_FREE_MB )); then
+      return 1
+    fi
+  done <<< "$report"
+
+  return 0
+}
+
+for setting in "$MIN_FREE_MB" "$INTERVAL_SECONDS" "$TIMEOUT_SECONDS"; do
+  [[ "$setting" =~ ^[0-9]+$ ]] || die "settings must be non-negative integers"
+done
+
+SECONDS=0 # bash increments this every second; used as the elapsed-time clock
+until all_gpus_free; do
+  if (( TIMEOUT_SECONDS > 0 && SECONDS >= TIMEOUT_SECONDS )); then
+    die "gave up after ${SECONDS}s: GPUs are still busy"
+  fi
+  echo "wait for gpu free"
+  sleep "$INTERVAL_SECONDS"
+done