Performance Regression Test - Score Director #73
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# - Runs entirely on a single machine. | |
# - The baseline is established first, then the branch under test is measured. | |
# - Each benchmark gives a 99.9 % confidence interval. | |
# - The confidence intervals are compared to determine if the branch under test is a regression or an improvement. | |
# - The error threshold is expected to be below +/- 2.5 %. | |
# We have yet to see an error of over +/- 4 %. | |
# With the error so high, the impact is that small regressions are not considered statistically significant. | |
name: Performance Regression Test - Score Director | |
on: | |
workflow_dispatch: | |
inputs: | |
jdk: | |
description: 'JDK version' | |
default: '21' | |
required: true | |
baseline: | |
description: 'Timefold Solver release' | |
default: '1.16.0' | |
required: true | |
branch: | |
description: 'Branch to benchmark (needs to use 999-SNAPSHOT)' | |
default: 'main' | |
required: true | |
branch_owner: | |
description: 'User owning the branch' | |
default: 'TimefoldAI' | |
required: true | |
async_profiler_version: | |
description: 'async-profiler version' | |
default: '3.0' | |
required: true | |
jobs: | |
benchmark: | |
runs-on: self-hosted | |
strategy: | |
fail-fast: false # Jobs fail if the benchmark error is over predefined thresholds; other benchmarks continue. | |
matrix: | |
example: [cloud_balancing, conference_scheduling, curriculum_course, examination, machine_reassignment, meeting_scheduling, nurse_rostering, patient_admission_scheduling, task_assigning, traveling_tournament, tsp, vehicle_routing] | |
env: | |
MVN_USERNAME: '${{ secrets.JFROG_ENTERPRISE_READ_ONLY_ACCESS_USERNAME }}' | |
MVN_PASSWORD: '${{ secrets.JFROG_ENTERPRISE_READ_ONLY_ACCESS_TOKEN }}' | |
steps: | |
- name: Phase 0 - Checkout timefold-solver-benchmarks | |
uses: actions/checkout@v4 | |
with: | |
repository: TimefoldAI/timefold-solver-benchmarks | |
path: ./timefold-solver-benchmarks | |
- name: Phase 0 - Setup JDK and Maven | |
uses: actions/setup-java@v4 | |
with: | |
java-version: ${{ github.event.inputs.jdk }} | |
distribution: 'temurin' | |
cache: 'maven' | |
server-id: 'timefold-solver-enterprise' | |
server-username: 'MVN_USERNAME' | |
server-password: 'MVN_PASSWORD' | |
- name: Phase 0 - Setup Async Profiler | |
working-directory: ./timefold-solver-benchmarks | |
run: | | |
export FILENAME=async-profiler-${{ github.event.inputs.async_profiler_version }}-linux-x64.tar.gz | |
wget https://github.com/async-profiler/async-profiler/releases/download/v${{ github.event.inputs.async_profiler_version }}/$FILENAME | |
tar -xzf $FILENAME | |
ls -l | |
# Fine-tuned for stability on GHA. | |
- name: Phase 0 - Configure the benchmark | |
working-directory: ./timefold-solver-benchmarks | |
shell: bash | |
run: | | |
echo "forks=20" > scoredirector-benchmark.properties | |
echo "warmup_iterations=10" >> scoredirector-benchmark.properties | |
echo "measurement_iterations=10" >> scoredirector-benchmark.properties | |
echo "relative_score_error_threshold=0.02" >> scoredirector-benchmark.properties | |
echo "score_director_type=cs" >> scoredirector-benchmark.properties | |
echo "example=${{ matrix.example }}" >> scoredirector-benchmark.properties | |
cat scoredirector-benchmark.properties | |
chmod +x run-scoredirector.sh | |
- name: Phase 1 - Compile the benchmark | |
working-directory: ./timefold-solver-benchmarks | |
shell: bash | |
run: mvn clean install -B -Dquickly -Dversion.ai.timefold.solver=${{ github.event.inputs.baseline }} -Dversion.tools.provider="${{ github.event.inputs.async_profiler_version }}" | |
- name: Phase 1 - Run the baseline configuration | |
working-directory: ./timefold-solver-benchmarks | |
id: benchmark_baseline | |
env: | |
RUN_ID: ${{ github.event.inputs.baseline }} | |
shell: bash | |
run: | | |
./run-scoredirector.sh | |
echo "RANGE_START=$(jq '.[0].primaryMetric.scoreConfidence[0]|round' results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT" | |
echo "RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]|round' results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT" | |
echo "RANGE_MID=$(jq '.[0].primaryMetric.score|round' results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT" | |
- name: Phase 2 - Checkout timefold-solver | |
uses: actions/checkout@v4 | |
with: | |
repository: ${{ github.event.inputs.branch_owner }}/timefold-solver | |
ref: ${{ github.event.inputs.branch }} | |
path: ./timefold-solver | |
- name: Phase 2 - Quickly build timefold-solver | |
working-directory: ./timefold-solver | |
shell: bash | |
run: mvn -B -Dquickly clean install | |
# Clone timefold-solver-enterprise | |
- name: Phase 2 - Checkout timefold-solver-enterprise (Specified) | |
id: checkout-solver-enterprise | |
uses: actions/checkout@v4 | |
continue-on-error: true | |
with: | |
repository: TimefoldAI/timefold-solver-enterprise | |
ref: ${{ github.event.inputs.branch }} | |
token: ${{ secrets.BENCHMARK_PUBLISH_TOKEN }} | |
path: ./timefold-solver-enterprise | |
- name: Phase 2 - Checkout timefold-solver-enterprise (Fallback) | |
if: steps.checkout-solver-enterprise.outcome != 'success' | |
uses: actions/checkout@v4 | |
with: | |
repository: TimefoldAI/timefold-solver-enterprise | |
ref: main | |
token: ${{ secrets.BENCHMARK_PUBLISH_TOKEN }} | |
path: ./timefold-solver-enterprise | |
- name: Phase 2 - Quickly build timefold-solver-enterprise | |
working-directory: ./timefold-solver-enterprise | |
shell: bash | |
run: mvn -B -Dquickly clean install | |
- name: Phase 2 - Compile the benchmarks | |
working-directory: ./timefold-solver-benchmarks | |
shell: bash | |
run: mvn clean install -B -Dquickly -Dversion.tools.provider="${{ github.event.inputs.async_profiler_version }}" | |
- name: Phase 2 - Run the benchmark on the new code | |
id: benchmark_new | |
working-directory: ./timefold-solver-benchmarks | |
env: | |
RUN_ID: ${{ github.event.inputs.branch }} | |
shell: bash | |
run: | | |
./run-scoredirector.sh | |
echo "RANGE_START=$(jq '.[0].primaryMetric.scoreConfidence[0]|round' results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT" | |
echo "RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]|round' results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT" | |
echo "RANGE_MID=$(jq '.[0].primaryMetric.score|round' results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT" | |
- name: Phase 3 - Archive benchmark data | |
uses: actions/upload-artifact@v4 | |
with: | |
name: results-${{ matrix.example }}-${{ github.event.inputs.baseline }}_vs_${{ github.event.inputs.branch }} | |
path: | | |
./timefold-solver-benchmarks/results/scoredirector | |
- name: Phase 3 - Report results | |
working-directory: ./timefold-solver-benchmarks | |
env: | |
OLD_RANGE_START: ${{ steps.benchmark_baseline.outputs.RANGE_START }} | |
OLD_RANGE_MID: ${{ steps.benchmark_baseline.outputs.RANGE_MID }} | |
OLD_RANGE_END: ${{ steps.benchmark_baseline.outputs.RANGE_END }} | |
NEW_RANGE_START: ${{ steps.benchmark_new.outputs.RANGE_START }} | |
NEW_RANGE_MID: ${{ steps.benchmark_new.outputs.RANGE_MID }} | |
NEW_RANGE_END: ${{ steps.benchmark_new.outputs.RANGE_END }} | |
shell: bash | |
run: | | |
export DIFF_START=$(echo "scale=2; ($OLD_RANGE_START / $NEW_RANGE_START) * 100" | bc) | |
export DIFF_MID=$(echo "scale=2; ($OLD_RANGE_MID / $NEW_RANGE_MID) * 100" | bc) | |
export DIFF_END=$(echo "scale=2; ($OLD_RANGE_END / $NEW_RANGE_END) * 100" | bc) | |
export FAIL=false | |
if (( $(echo "$DIFF_MID >= 98.00" | bc -l) && $(echo "$DIFF_MID <= 102.00"|bc -l) )); then | |
# Ignore differences of up to 2 %. | |
echo "### Performance unchanged" >> $GITHUB_STEP_SUMMARY | |
echo "(Decided to ignore a very small difference of under 2 %.)" >> $GITHUB_STEP_SUMMARY | |
else | |
if [ "$NEW_RANGE_START" -le "$OLD_RANGE_END" ] && [ "$NEW_RANGE_END" -ge "$OLD_RANGE_START" ]; then | |
if [ "$NEW_RANGE_START" -ge "$OLD_RANGE_MID" ]; then | |
echo "### 🍀 Possible improvement 🍀" >> $GITHUB_STEP_SUMMARY | |
elif [ "$OLD_RANGE_END" -le "$NEW_RANGE_MID" ]; then | |
echo "### ⚠️ Possible regression ⚠️" >> $GITHUB_STEP_SUMMARY | |
else | |
echo "### Performance unchanged " >> $GITHUB_STEP_SUMMARY | |
fi | |
elif [ "$NEW_RANGE_START" -gt "$OLD_RANGE_END" ]; then | |
echo "### 🚀🚀🚀 Statistically significant improvement 🚀🚀🚀" >> $GITHUB_STEP_SUMMARY | |
else | |
echo "### ‼️‼️‼️ Statistically significant regression ‼️‼️‼️" >> $GITHUB_STEP_SUMMARY | |
export FAIL=true | |
fi | |
fi | |
echo "| | **Ref** | **Min** | **Mean** | **Max** |" >> $GITHUB_STEP_SUMMARY | |
echo "|:------:|:-----------:|:-----------------:|:-----------------:|:-----------------:|" >> $GITHUB_STEP_SUMMARY | |
echo "| _Old_ | [v${{ github.event.inputs.baseline }}](https://github.com/TimefoldAI/timefold-solver/releases/tag/v${{ github.event.inputs.baseline }}) | ${OLD_RANGE_START} | ${OLD_RANGE_MID} | ${OLD_RANGE_END} |" >> $GITHUB_STEP_SUMMARY | |
echo "| _New_ | [${{ github.event.inputs.branch_owner }}'s ${{ github.event.inputs.branch }}](https://github.com/${{ github.event.inputs.branch_owner }}/timefold-solver/tree/${{ github.event.inputs.branch }}) | ${NEW_RANGE_START} | ${NEW_RANGE_MID} | ${NEW_RANGE_END} |" >> $GITHUB_STEP_SUMMARY | |
echo "| _Diff_ | | ${DIFF_START} % | ${DIFF_MID} % | ${DIFF_END} % |" >> $GITHUB_STEP_SUMMARY | |
echo "" >> $GITHUB_STEP_SUMMARY | |
echo "Min and max define a 99.9 % confidence interval." >> $GITHUB_STEP_SUMMARY | |
echo "Min and max are in operations per second. Higher is better." >> $GITHUB_STEP_SUMMARY | |
echo "Diff under 100 % represents an improvement, over 100 % a regression." >> $GITHUB_STEP_SUMMARY | |
if [ "$FAIL" = true ]; then | |
exit 1 | |
fi |