Skip to content

Nightly-OnDemand Tests #297

Nightly-OnDemand Tests

Nightly-OnDemand Tests #297

name: Nightly-OnDemand Tests
on:
schedule:
# GMT+8 21:00 every workday
- cron: '0 13 * * 0-4'
# GMT+8 0:00 Saturday
- cron: '0 16 * * 5'
workflow_dispatch:
inputs:
pytorch:
required: false
type: string
default: 'main'
description: Pytorch branch/commit
keep_torch_xpu_ops:
required: false
type: string
default: 'false'
description: Keep torch-xpu-ops pin. `true` means use pined commit
ut:
required: false
type: string
default: 'torch_xpu'
description: UT scope. `op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu`. Delimiter is comma
triton:
required: false
type: string
default: ''
description: Triton commit. Use pytorch pined commit by default
suite:
required: true
type: string
default: 'huggingface'
description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench`. Delimiter is comma
dt:
required: true
type: string
default: 'float32'
description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma
mode:
required: true
type: string
default: 'inference'
description: Test mode. `inference,training`. Delimiter is comma
scenario:
required: true
type: string
default: 'accuracy'
description: Test scenario. `accuracy,performance`. Delimiter is comma
model:
required: false
type: string
default: ''
description: Model. Will only run this one mode if set
python:
required: false
type: string
default: '3.10'
description: Python version
permissions: read-all
concurrency:
group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.keep_torch_xpu_ops }}-${{ inputs.ut }}-${{ inputs.triton }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }}
cancel-in-progress: true
jobs:
Linux-Nightly-Ondemand-UT-Tests:
if: github.event_name == 'schedule' || ${{ inputs.ut_suite }}
uses: ./.github/workflows/_linux_ut.yml
with:
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }}
pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }}
runner: linux.idc.xpu
Linux-Weekly-UT-Tests-ABI-0:
if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5'
uses: ./.github/workflows/_linux_ut.yml
with:
abi: 0
ut: op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu
runner: linux.idc.xpu
Linux-Nightly-Ondemand-E2E-Tests:
runs-on: pvc_e2e
# Don't run on forked repos
if: github.repository_owner == 'intel'
timeout-minutes: 3600
env:
pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }}
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
outputs:
TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }}
TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }}
DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }}
KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }}
BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }}
OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }}
GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }}
TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }}
TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }}
TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }}
TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }}
TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }}
TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }}
TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }}
steps:
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
- name: Prepare Conda ENV
run: |
which conda && conda clean -ay
conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
source activate e2e_ci
pip install mkl-static mkl-include
pip install pandas scipy tqdm
- name: Prepare Stock Pytorch
run: |
pwd
cd ../ && rm -rf pytorch
source activate e2e_ci
git clone https://github.com/pytorch/pytorch pytorch
cd pytorch && git checkout $(echo ${{ env.pytorch }} |awk '{print $1}')
# apply PRs for stock pytorch
pip install requests
python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
git status && git show -s
git submodule sync && git submodule update --init --recursive
if [[ ${{ env.keep_torch_xpu_ops }} == 'true' ]]; then
echo "Don't replace torch-xpu-ops!"
else
rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
# Workaround for torch-xpu-ops ci test
sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
fi
- name: Identify pinned versions
id: pinned
run: |
source .github/scripts/env.sh
cd ../pytorch
if [ -z ${{ inputs.triton }} ]; then
echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
else
echo "TRITON_COMMIT_ID=${{ inputs.triton }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
fi
echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHBENCH_COMMIT_ID=$(<third_party/torch-xpu-ops/.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "DRIVER_VERSION=$(dkms status 2>&1 |grep 'intel-i915-dkms' |sed 's/.*\///;s/,.*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "BUNDLE_VERSION=$(dpcpp --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
. /etc/os-release
echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo ${GITHUB_ENV}
- name: Triton Installation
run: |
source activate e2e_ci
cd ../pytorch
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
echo ${TRITON_REPO}@${TRITON_COMMIT_ID}
pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
- name: Build Pytorch XPU
run: |
source activate e2e_ci
source .github/scripts/env.sh
cd ../pytorch
pip install -r requirements.txt
export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
python setup.py bdist_wheel
pip install --force-reinstall dist/*.whl
- name: Show GITHUB_ENV
run: |
echo "$GITHUB_ENV"
rm -rf ../pytorch/inductor_log
rm -rf /tmp/torchinductor_*
rm -rf ~/.triton/cache
# Nihglty launch
- name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test
if: github.event_name == 'schedule' && github.event.schedule == '0 13 * * 0-4'
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: huggingface
env_prepare: true
dt: float32,bfloat16,float16,amp_bf16,amp_fp16
mode: inference,training
scenario: accuracy
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Nightly Torchbench BF16 Training Accuracy Test
if: github.event_name == 'schedule' && github.event.schedule == '0 13 * * 0-4'
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: torchbench
dt: bfloat16
mode: training
scenario: accuracy
env_prepare: true
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Nightly Timm_models FP16 Training Accuracy Test
if: github.event_name == 'schedule' && github.event.schedule == '0 13 * * 0-4'
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: timm_models
dt: float16
mode: training
scenario: accuracy
env_prepare: true
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# Weekly launch
- name: Weekly Huggingface Full Test
if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5'
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: huggingface
env_prepare: true
dt: float32,bfloat16,float16,amp_bf16,amp_fp16
mode: inference,training
scenario: accuracy,performance
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Weekly Torchbench Full Test
if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5'
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: torchbench
env_prepare: true
dt: float32,bfloat16,float16,amp_bf16,amp_fp16
mode: inference,training
scenario: accuracy,performance
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Weekly Timm_models Full Test
if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5'
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: timm_models
env_prepare: true
dt: float32,bfloat16,float16,amp_bf16,amp_fp16
mode: inference,training
scenario: accuracy,performance
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# On-demand launch
- name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
if: github.event_name != 'schedule'
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: ${{ inputs.suite }}
env_prepare: true
dt: ${{ inputs.dt }}
mode: ${{ inputs.mode }}
scenario: ${{ inputs.scenario }}
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Summarize archieve files
id: summary
if: ${{ ! cancelled() }}
run: |
rm -rf ${{ github.workspace }}/upload_files
cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/
find . -type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days
tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . # backup logs
failed_models=$(grep "Real failed models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true)
timeout_models=$(grep "timeout models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true)
if [ ${timeout_models} -ne 0 ];then
TIMEOUT_MODELS="$(
grep -B 1 "timeout models: [1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log
)"
echo "TIMEOUT_MODELS=\"${TIMEOUT_MODELS}\"" |awk '{printf("%s\\n", $0)}' |sed 's/\\n$//' |tee -a "${GITHUB_OUTPUT}"
fi
if [ ${failed_models} -ne 0 ];then
grep -E "Real failed models: [1-9]|Summary for" ${{ github.workspace }}/upload_files/summary_accuracy.log |grep "failed" -B 1
exit 1
fi
- name: Upload Inductor XPU E2E Data
if: ${{ ! cancelled() }}
uses: actions/upload-artifact@v4
with:
name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
path: ${{ github.workspace }}/upload_files
Tests-Failure-And-Report:
if: ${{ ! cancelled() }}
runs-on: [ self-hosted, Linux ]
permissions:
issues: write
env:
GH_TOKEN: ${{ github.token }}
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
needs: Linux-Nightly-Ondemand-E2E-Tests
steps:
- name: Report github issue for XPU OPS nightly
if: github.repository_owner == 'intel'
run: |
set -xe
# Test env
build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
repo="${{ github.repository }}"
TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCH_BRANCH_ID }}"
TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCH_COMMIT_ID }}"
DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.DRIVER_VERSION }}"
KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.KERNEL_VERSION }}"
BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.BUNDLE_VERSION }}"
OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.OS_PRETTY_NAME }}"
GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.GCC_VERSION }}"
TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHBENCH_COMMIT_ID }}"
TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHVISION_COMMIT_ID }}"
TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHAUDIO_COMMIT_ID }}"
TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TRANSFORMERS_VERSION }}"
TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TIMM_COMMIT_ID }}"
TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TRITON_COMMIT_ID }}"
TIMEOUT_MODELS="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TIMEOUT_MODELS }}"
# Test status
if [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests.result }}" == "success" ];then
test_status=Success
elif [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests.result }}" == "failure" ];then
test_status=Failure
cc_comment="CC ${{ secrets.NIGHTLY_EMAIL_LIST }}"
else
test_status=None
exit 0
fi
# Test Type
if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then
test_type="On-demand"
test_issue_id=426
cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}"
elif [ "${{ github.event.schedule }}" == "0 16 * * 5" ];then
test_type="Weekly"
test_issue_id=432
else
test_type="Nightly"
test_issue_id=432
fi
# Test report
echo -e "**${test_status}** $test_type Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt
printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${GITHUB_WORKFLOW_SHA:0:7} on ${GITHUB_REF_NAME} | " >> ${{ github.workspace }}/report.txt
printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt
echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt
printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt
printf "[${TRANSFORMERS_VERSION:0:7}](https://github.com/huggingface/transformers/commit/${TRANSFORMERS_VERSION:0:7}) | " >> ${{ github.workspace }}/report.txt
printf "[${TIMM_COMMIT_ID:0:7}](https://github.com/huggingface/pytorch-image-models/commit/${TIMM_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt
printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt
printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt
echo -e "[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt
printf "Device | OS | GCC | Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt
echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | $DRIVER_VERSION | $KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt
if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then
test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}"
if [ "${{ inputs.model }}" != "" ];then
test_scope+="; model=${{ inputs.model }}"
fi
echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt
fi
echo "$TIMEOUT_MODELS" |awk '{printf("%s\\n", $0)}' >> ${{ github.workspace }}/report.txt
echo "$cc_comment" >> ${{ github.workspace }}/report.txt
# Report
report_txt=$(cat ${{ github.workspace }}/report.txt)
gh --repo $repo issue comment $test_issue_id --body "$report_txt"