Performance Regression Test - Score Director #114
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# - Runs entirely on a single machine, a self-hosted runner on Github Actions. | |
# - Both baseline and SUT (Software Under Test) are built from source first, | |
# as it is wasteful to run the baseline only to find out that the SUT does not build. | |
# - The baseline is established first, then the SUT is measured. | |
# - Each benchmark gives a 99.9 % confidence interval. | |
# - The confidence intervals are compared to determine if the branch under test is a regression or an improvement. | |
# - The error threshold is expected to be below +/- 2.0 %. | |
name: Performance Regression Test - Score Director | |
on: | |
workflow_dispatch: | |
inputs: | |
jdk: | |
description: 'JDK version' | |
default: '21' | |
required: true | |
baseline: | |
description: 'Timefold Solver release' | |
default: '1.17.0' | |
required: true | |
branch: | |
description: 'Branch to benchmark (needs to use 999-SNAPSHOT)' | |
default: 'main' | |
required: true | |
branch_owner: | |
description: 'User owning the branch' | |
default: 'TimefoldAI' | |
required: true | |
async_profiler_version: | |
description: 'async-profiler version' | |
default: '3.0' | |
required: true | |
jobs: | |
build: | |
runs-on: self-hosted | |
strategy: | |
fail-fast: true # If one compilation fails, abort everything. | |
matrix: | |
example: [cloud_balancing, conference_scheduling, curriculum_course, examination, machine_reassignment, meeting_scheduling, nurse_rostering, patient_admission_scheduling, task_assigning, traveling_tournament, tsp, vehicle_routing] | |
env: | |
MVN_USERNAME: '${{ secrets.JFROG_ENTERPRISE_READ_ONLY_ACCESS_USERNAME }}' | |
MVN_PASSWORD: '${{ secrets.JFROG_ENTERPRISE_READ_ONLY_ACCESS_TOKEN }}' | |
steps: | |
- name: Phase 0 - Checkout timefold-solver-benchmarks | |
uses: actions/checkout@v4 | |
with: | |
repository: TimefoldAI/timefold-solver-benchmarks | |
path: ./timefold-solver-benchmarks | |
- name: Phase 0 - Setup JDK and Maven | |
uses: actions/setup-java@v4 | |
with: | |
java-version: ${{ github.event.inputs.jdk }} | |
distribution: 'temurin' | |
cache: 'maven' | |
server-id: 'timefold-solver-enterprise' | |
server-username: 'MVN_USERNAME' | |
server-password: 'MVN_PASSWORD' | |
- name: Phase 0 - Setup Async Profiler | |
working-directory: ./timefold-solver-benchmarks | |
run: | | |
export FILENAME=async-profiler-${{ github.event.inputs.async_profiler_version }}-linux-x64.tar.gz | |
wget https://github.com/async-profiler/async-profiler/releases/download/v${{ github.event.inputs.async_profiler_version }}/$FILENAME | |
tar -xzf $FILENAME | |
ls -l | |
- name: Phase 1 - (Baseline) Compile the benchmark | |
working-directory: ./timefold-solver-benchmarks | |
shell: bash | |
run: | | |
mvn clean install -B -Dquickly -Dversion.ai.timefold.solver=${{ github.event.inputs.baseline }} -Dversion.tools.provider="${{ github.event.inputs.async_profiler_version }}" | |
mv target/benchmarks.jar benchmarks-baseline.jar | |
- name: Phase 1 - (Baseline) Upload the binary | |
uses: actions/upload-artifact@v4 | |
with: | |
name: results-${{ matrix.example }}-${{ github.event.inputs.baseline }}_vs_${{ github.event.inputs.branch }} | |
path: benchmarks-baseline.jar | |
- name: Phase 1 - (SUT) Checkout timefold-solver | |
uses: actions/checkout@v4 | |
with: | |
repository: ${{ github.event.inputs.branch_owner }}/timefold-solver | |
ref: ${{ github.event.inputs.branch }} | |
path: ./timefold-solver | |
- name: Phase 1 - (SUT) Quickly build timefold-solver | |
working-directory: ./timefold-solver | |
shell: bash | |
run: mvn -B -Dquickly clean install | |
# Clone timefold-solver-enterprise | |
- name: Phase 1 - (SUT) Checkout timefold-solver-enterprise (Specified) | |
id: checkout-solver-enterprise | |
uses: actions/checkout@v4 | |
continue-on-error: true | |
with: | |
repository: TimefoldAI/timefold-solver-enterprise | |
ref: ${{ github.event.inputs.branch }} | |
token: ${{ secrets.BENCHMARK_PUBLISH_TOKEN }} | |
path: ./timefold-solver-enterprise | |
- name: Phase 1 - (SUT) Checkout timefold-solver-enterprise (Fallback) | |
if: steps.checkout-solver-enterprise.outcome != 'success' | |
uses: actions/checkout@v4 | |
with: | |
repository: TimefoldAI/timefold-solver-enterprise | |
ref: main | |
token: ${{ secrets.BENCHMARK_PUBLISH_TOKEN }} | |
path: ./timefold-solver-enterprise | |
- name: Phase 1 - (SUT) Quickly build timefold-solver-enterprise | |
working-directory: ./timefold-solver-enterprise | |
shell: bash | |
run: mvn -B -Dquickly clean install | |
- name: Phase 1 - (SUT) Compile the benchmarks | |
working-directory: ./timefold-solver-benchmarks | |
shell: bash | |
run: | | |
mvn clean install -B -Dquickly -Dversion.tools.provider="${{ github.event.inputs.async_profiler_version }}" | |
mv target/benchmarks.jar benchmarks-sut.jar | |
- name: Phase 1 - (SUT) Upload the binary | |
uses: actions/upload-artifact@v4 | |
with: | |
name: results-${{ matrix.example }}-${{ github.event.inputs.baseline }}_vs_${{ github.event.inputs.branch }} | |
path: benchmarks-sut.jar | |
benchmark: | |
needs: build | |
runs-on: self-hosted | |
strategy: | |
fail-fast: false # Jobs fail if the benchmark error is over predefined thresholds; other benchmarks continue. | |
matrix: | |
example: [cloud_balancing, conference_scheduling, curriculum_course, examination, machine_reassignment, meeting_scheduling, nurse_rostering, patient_admission_scheduling, task_assigning, traveling_tournament, tsp, vehicle_routing] | |
env: | |
MVN_USERNAME: '${{ secrets.JFROG_ENTERPRISE_READ_ONLY_ACCESS_USERNAME }}' | |
MVN_PASSWORD: '${{ secrets.JFROG_ENTERPRISE_READ_ONLY_ACCESS_TOKEN }}' | |
steps: | |
- name: Phase 2 - Setup JDK and Maven | |
uses: actions/setup-java@v4 | |
with: | |
java-version: ${{ github.event.inputs.jdk }} | |
distribution: 'temurin' | |
cache: 'maven' | |
server-id: 'timefold-solver-enterprise' | |
server-username: 'MVN_USERNAME' | |
server-password: 'MVN_PASSWORD' | |
- name: Phase 2 - Setup Async Profiler | |
working-directory: ./timefold-solver-benchmarks | |
run: | | |
export FILENAME=async-profiler-${{ github.event.inputs.async_profiler_version }}-linux-x64.tar.gz | |
wget https://github.com/async-profiler/async-profiler/releases/download/v${{ github.event.inputs.async_profiler_version }}/$FILENAME | |
tar -xzf $FILENAME | |
ls -l | |
# Fine-tuned for stability on GHA. | |
- name: Phase 3 - Configure the benchmark | |
working-directory: ./timefold-solver-benchmarks | |
shell: bash | |
run: | | |
echo "forks=15" > scoredirector-benchmark.properties | |
echo "warmup_iterations=5" >> scoredirector-benchmark.properties | |
echo "measurement_iterations=5" >> scoredirector-benchmark.properties | |
echo "relative_score_error_threshold=0.02" >> scoredirector-benchmark.properties | |
echo "score_director_type=cs" >> scoredirector-benchmark.properties | |
echo "example=${{ matrix.example }}" >> scoredirector-benchmark.properties | |
cat scoredirector-benchmark.properties | |
chmod +x run-scoredirector.sh | |
- name: Phase 3 - (Baseline) Download the benchmark | |
uses: actions/download-artifact@v2 | |
with: | |
name: results-${{ matrix.example }}-${{ github.event.inputs.baseline }}_vs_${{ github.event.inputs.branch }} | |
path: benchmarks-baseline.jar | |
- name: Phase 3 - (Baseline) Run the benchmark | |
working-directory: ./timefold-solver-benchmarks | |
id: benchmark_baseline | |
env: | |
RUN_ID: ${{ github.event.inputs.baseline }} | |
shell: bash | |
run: | | |
cp benchmarks-baseline.jar target/benchmarks.jar | |
./run-scoredirector.sh | |
echo "RANGE_START=$(jq '.[0].primaryMetric.scoreConfidence[0]|round' results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT" | |
echo "RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]|round' results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT" | |
echo "RANGE_MID=$(jq '.[0].primaryMetric.score|round' results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT" | |
- name: Phase 4 - (Baseline) Download the benchmark | |
uses: actions/download-artifact@v2 | |
with: | |
name: results-${{ matrix.example }}-${{ github.event.inputs.baseline }}_vs_${{ github.event.inputs.branch }} | |
path: benchmarks-sut.jar | |
- name: Phase 4 - (SUT) Run the benchmark | |
id: benchmark_new | |
working-directory: ./timefold-solver-benchmarks | |
env: | |
RUN_ID: ${{ github.event.inputs.branch }} | |
shell: bash | |
run: | | |
rm target/benchmarks.jar | |
cp benchmarks-sut.jar target/benchmarks.jar | |
./run-scoredirector.sh | |
echo "RANGE_START=$(jq '.[0].primaryMetric.scoreConfidence[0]|round' results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT" | |
echo "RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]|round' results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT" | |
echo "RANGE_MID=$(jq '.[0].primaryMetric.score|round' results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT" | |
- name: Phase 5 - Archive benchmark data | |
uses: actions/upload-artifact@v4 | |
with: | |
name: results-${{ matrix.example }}-${{ github.event.inputs.baseline }}_vs_${{ github.event.inputs.branch }} | |
path: | | |
./timefold-solver-benchmarks/benchmarks-baseline.jar | |
./timefold-solver-benchmarks/benchmarks-sut.jar | |
./timefold-solver-benchmarks/scoredirector-benchmark.properties | |
./timefold-solver-benchmarks/results/scoredirector | |
- name: Phase 5 - Report results | |
working-directory: ./timefold-solver-benchmarks | |
env: | |
OLD_RANGE_START: ${{ steps.benchmark_baseline.outputs.RANGE_START }} | |
OLD_RANGE_MID: ${{ steps.benchmark_baseline.outputs.RANGE_MID }} | |
OLD_RANGE_END: ${{ steps.benchmark_baseline.outputs.RANGE_END }} | |
NEW_RANGE_START: ${{ steps.benchmark_new.outputs.RANGE_START }} | |
NEW_RANGE_MID: ${{ steps.benchmark_new.outputs.RANGE_MID }} | |
NEW_RANGE_END: ${{ steps.benchmark_new.outputs.RANGE_END }} | |
shell: bash | |
run: | | |
export OLD_DEV=$(echo "scale=2; ($OLD_RANGE_MID / $OLD_RANGE_START) * 100 - 100" | bc) | |
export NEW_DEV=$(echo "scale=2; ($NEW_RANGE_MID / $NEW_RANGE_START) * 100 - 100" | bc) | |
export DIFF_START=$(echo "scale=2; ($OLD_RANGE_START / $NEW_RANGE_START) * 100" | bc) | |
export DIFF_MID=$(echo "scale=2; ($OLD_RANGE_MID / $NEW_RANGE_MID) * 100" | bc) | |
export DIFF_END=$(echo "scale=2; ($OLD_RANGE_END / $NEW_RANGE_END) * 100" | bc) | |
export FAIL=false | |
if (( $(echo "$DIFF_MID >= 98.00" | bc -l) && $(echo "$DIFF_MID <= 102.00"|bc -l) )); then | |
# Ignore differences of up to 2 %; we can't expect that level of precision anyway. | |
exit 0 | |
else | |
if [ "$NEW_RANGE_START" -le "$OLD_RANGE_END" ] && [ "$NEW_RANGE_END" -ge "$OLD_RANGE_START" ]; then | |
if [ "$NEW_RANGE_START" -ge "$OLD_RANGE_MID" ]; then | |
echo "### 🍀 Possible improvement 🍀" >> $GITHUB_STEP_SUMMARY | |
elif [ "$OLD_RANGE_END" -le "$NEW_RANGE_MID" ]; then | |
echo "### ⚠️ Possible regression ⚠️" >> $GITHUB_STEP_SUMMARY | |
else | |
exit 0 | |
fi | |
elif [ "$NEW_RANGE_START" -gt "$OLD_RANGE_END" ]; then | |
echo "### 🚀🚀🚀 Statistically significant improvement 🚀🚀🚀" >> $GITHUB_STEP_SUMMARY | |
else | |
echo "### ‼️‼️‼️ Statistically significant regression ‼️‼️‼️" >> $GITHUB_STEP_SUMMARY | |
export FAIL=true | |
fi | |
fi | |
echo "| | **Ref** | **Mean** |" >> $GITHUB_STEP_SUMMARY | |
echo "|:------:|:-----------:|:-----------------:|" >> $GITHUB_STEP_SUMMARY | |
echo "| _Old_ | [v${{ github.event.inputs.baseline }}](https://github.com/TimefoldAI/timefold-solver/releases/tag/v${{ github.event.inputs.baseline }}) | ${OLD_RANGE_MID} ± ${OLD_DEV} % |" >> $GITHUB_STEP_SUMMARY | |
echo "| _New_ | [${{ github.event.inputs.branch_owner }}'s ${{ github.event.inputs.branch }}](https://github.com/${{ github.event.inputs.branch_owner }}/timefold-solver/tree/${{ github.event.inputs.branch }}) | ${NEW_RANGE_MID} ± ${NEW_DEV} % |" >> $GITHUB_STEP_SUMMARY | |
echo "| _Diff_ | | ${DIFF_MID} % |" >> $GITHUB_STEP_SUMMARY | |
echo "" >> $GITHUB_STEP_SUMMARY | |
echo "Mean is in operations per second. Higher is better." >> $GITHUB_STEP_SUMMARY | |
echo "Mean ± X % describes a 99.9 % confidence interval." >> $GITHUB_STEP_SUMMARY | |
echo "Diff under 100 % represents an improvement, over 100 % a regression." >> $GITHUB_STEP_SUMMARY | |
if [ "$FAIL" = true ]; then | |
exit 1 | |
fi |