# Workflow file for the run "Add an LLM fine-tuning example" (#553).

# CI pipeline for the Python project: linting, documentation build, unit tests and integration tests.
# Background on Python workflows: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: Python application

# Runs on pushes to `master` and on pull requests.
on:
  pull_request:
  push:
    branches: [master]

# The workflow token is limited to read access on the repository contents.
permissions:
  contents: read

# Runs are grouped per workflow and per pull request (or per ref when there is no pull request);
# starting a new run in a group cancels the one that is still in progress.
# https://stackoverflow.com/a/72408109/6388696
# https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-concurrency-to-cancel-any-in-progress-job-or-run
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}

jobs:
# Runs the pre-commit hooks over the whole repository. The `check_docs` and `unit_tests` jobs list
# this job in their `needs`, so nothing else starts until linting passes.
linting:
  name: Run linting/pre-commit checks
  timeout-minutes: 5
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v4
      with:
        python-version: '3.10'
    # pre-commit is capped below 4.0.0.
    - run: pip install 'pre-commit<4.0.0'
    # Print the installed version in the job log.
    - run: pre-commit --version
    - run: pre-commit install
    # Check every file (not only changed ones) and show the diff when a hook fails.
    - run: pre-commit run --all-files --show-diff-on-failure
# Builds the mkdocs documentation in strict mode, after linting has passed.
check_docs:
  runs-on: ubuntu-latest
  needs:
    - linting
  steps:
    - uses: actions/checkout@v4
    - name: Install the latest version of uv
      uses: astral-sh/setup-uv@v3
      with:
        version: latest
        enable-cache: true
        cache-suffix: '3.10'
        # https://github.com/astral-sh/setup-uv?tab=readme-ov-file#github-authentication-token
        github-token: ${{ secrets.GITHUB_TOKEN }}
    # Only the `docs` extra is requested here.
    - name: Install dependencies
      run: uv sync --frozen --extra docs
    - name: Build the documentation (strict mode)
      run: uv run mkdocs build --strict
# Unit tests: one job per (platform, python-version) combination of the matrix, after linting has passed.
unit_tests:
  needs:
    - linting
  strategy:
    max-parallel: 4
    matrix:
      platform:
        - ubuntu-latest
        - macos-latest
      python-version:
        - '3.10'
  runs-on: ${{ matrix.platform }}
  steps:
    - uses: actions/checkout@v4
    - name: Install the latest version of uv
      uses: astral-sh/setup-uv@v3
      with:
        version: latest
        enable-cache: true
        # The cache is keyed by the Python version of this matrix entry.
        cache-suffix: ${{ matrix.python-version }}
        # https://github.com/astral-sh/setup-uv?tab=readme-ov-file#github-authentication-token
        github-token: ${{ secrets.GITHUB_TOKEN }}
    - name: Pin python-version ${{ matrix.python-version }}
      run: uv python pin ${{ matrix.python-version }}
    - name: Install dependencies
      run: uv sync --frozen
    - name: Test with pytest
      run: uv run pytest -v --cov=project --cov-report=xml --cov-append --skip-if-files-missing
      env:
        # NOTE(review): presumably restricts JAX to the CPU backend on these runners - confirm.
        JAX_PLATFORMS: cpu
    # The artifact name carries the matrix values so that each combination uploads under its own name.
    - name: Store coverage report as an artifact
      uses: actions/upload-artifact@v4
      with:
        path: ./coverage.xml
        name: coverage-reports-unit-tests-${{ matrix.platform }}-${{ matrix.python-version }}
# Integration tests on the self-hosted runner, started once the unit tests and the docs build have passed.
# The regular test suite runs first, then the tests marked `slow`.
local_integration_tests:
  needs:
    - unit_tests
    - check_docs
  timeout-minutes: 20
  strategy:
    max-parallel: 1
    matrix:
      python-version:
        - '3.10'
  runs-on: self-hosted
  steps:
    - uses: actions/checkout@v4
    - name: Install the latest version of uv
      uses: astral-sh/setup-uv@v3
      with:
        version: latest
        enable-cache: false # no need, uses the local uv cache.
        # https://github.com/astral-sh/setup-uv?tab=readme-ov-file#github-authentication-token
        github-token: ${{ secrets.GITHUB_TOKEN }}
    - name: Pin python-version ${{ matrix.python-version }}
      run: uv python pin ${{ matrix.python-version }}
    # Unlike the unit test job, every extra is installed here.
    - name: Install dependencies
      run: uv sync --all-extras --frozen
    - name: Test with pytest
      run: uv run pytest -v --cov=project --cov-report=xml --cov-append
    # Second pass restricted to the `slow` marker; `--cov-append` adds to the coverage data of the first pass.
    - name: Test with pytest (only slow tests)
      run: uv run pytest -v -m slow --slow --cov=project --cov-report=xml --cov-append
    - name: Store coverage report as an artifact
      uses: actions/upload-artifact@v4
      with:
        path: ./coverage.xml
        name: coverage-reports-integration-tests-${{ matrix.python-version }}
# Copies the runner job script to each cluster of the matrix and submits it with `sbatch`.
# The ID of the submitted Slurm job is exposed as the `job_id` output of this job.
# NOTE(review): judging by its name, the submitted script starts a GitHub Actions runner inside the
# Slurm job - confirm against `.github/actions-runner-job.sh`.
launch-slurm-actions-runner:
  needs: [local_integration_tests]
  runs-on: self-hosted
  timeout-minutes: 5
  strategy:
    max-parallel: 5
    matrix:
      cluster: ["mila"] #, 'narval', 'beluga']
  outputs:
    # Must name the key that the `sbatch` step writes to $GITHUB_OUTPUT (`job_id=...`).
    # `run` steps do not expose a `stdout` output, so referencing one always yields an empty string.
    # NOTE(review): if more clusters are added to the matrix, every matrix job sets this same output
    # name - confirm how the downstream job should tell them apart.
    job_id: ${{ steps.sbatch.outputs.job_id }}
  steps:
    - uses: actions/checkout@v4
    - name: Copy job script to the cluster
      run: "scp .github/actions-runner-job.sh ${{ matrix.cluster }}:actions-runner-job.sh"
    - name: Launch Slurm Actions Runner
      id: sbatch
      # TODO: for DRAC clusters, the account needs to be set somehow (and obviously not be hard-coded here).
      # The job ID printed by `sbatch --parsable` is written to $GITHUB_OUTPUT so that it can be
      # exposed through the `outputs` of this job.
      # NOTE: Could also use the --wait flag to wait for the job to finish (and have this run at the same time as the other step).
      run: |
        job_id=$(ssh ${{ matrix.cluster }} 'cd $SCRATCH && sbatch --parsable $HOME/actions-runner-job.sh')
        # Fail early instead of publishing an empty job ID.
        if [ -z "$job_id" ]; then
          echo "::error::sbatch did not return a job ID on the ${{ matrix.cluster }} cluster"
          exit 1
        fi
        echo "Submitted job $job_id on the ${{ matrix.cluster }} cluster!"
        echo "job_id=$job_id" >> "$GITHUB_OUTPUT"
# This job runs in a self-hosted GitHub Actions runner inside a SLURM job on a compute node of the cluster.
# The runner is selected through `runs-on`, using the cluster name from the matrix.
slurm_integration_tests:
  name: Run integration tests on the ${{ matrix.cluster }} cluster in job ${{ needs.launch-slurm-actions-runner.outputs.job_id}}
  needs:
    - launch-slurm-actions-runner
  timeout-minutes: 20
  strategy:
    max-parallel: 5
    matrix:
      # TODO: this should be tied to the same setting in the `launch-slurm-actions-runner` job.
      # cluster: ${{ needs.launch-slurm-actions-runner.strategy.matrix.cluster }}
      cluster:
        - mila
  runs-on: ${{ matrix.cluster }}
  steps:
    - uses: actions/checkout@v4
    - name: Set up the repo using the setup script
      run: scripts/mila_setup.sh
    - name: Install dependencies
      run: uv sync --all-extras --frozen
    # Print the resolved environment in the job log.
    - name: Show installed packages
      run: uv pip list
    - name: Test with pytest
      run: uv run pytest -v --cov=project --cov-report=xml --cov-append
    # TODO: The full regression tests are disabled on the cluster for now: the worker is often
    # interrupted, and we want to avoid using the unkillable partition so as not to interrupt
    # other people's work. They still run in the `local_integration_tests` job.
    # - name: Test with pytest (only slow tests)
    #   run: uv run pytest -v -m slow --slow --cov=project --cov-report=xml --cov-append
    - name: Store coverage report as an artifact
      uses: actions/upload-artifact@v4
      with:
        path: ./coverage.xml
        name: coverage-reports-slurm-integration-tests-${{ matrix.cluster }}
# Coverage is uploaded from a dedicated job, once both integration test jobs have finished:
# https://about.codecov.io/blog/uploading-code-coverage-in-a-separate-job-on-github-actions/
upload-coverage-codecov:
  name: Upload coverage reports to Codecov
  needs:
    - local_integration_tests
    - slurm_integration_tests
  timeout-minutes: 5
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Download artifacts
      uses: actions/download-artifact@v4
      with:
        # Download all the artifacts matching the pattern into this directory
        # (each coverage.xml will be in a subdirectory).
        # If this doesn't work, the next thing to try is giving the coverage files unique names
        # and using merge-multiple: true
        path: coverage_reports
        pattern: coverage-reports-*
        merge-multiple: false
    - name: Upload coverage reports to Codecov
      uses: codecov/codecov-action@v4
      with:
        directory: coverage_reports
        # A failed upload fails the job instead of being ignored.
        fail_ci_if_error: true
        token: ${{ secrets.CODECOV_TOKEN }}