Add an LLM fine-tuning example #552
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CI workflow: installs the Python dependencies, then lints and tests the
# project with a single version of Python.
# Reference: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python application

# Triggered by pushes to `master` and by every pull request.
on:
  push:
    branches:
      - master
  pull_request:

# The workflow token only gets read access to the repository contents.
permissions:
  contents: read

# Runs are grouped per workflow and per pull request (falling back to the git
# ref when there is no pull request); starting a new run in a group cancels the
# one that is still in progress.
# https://stackoverflow.com/a/72408109/6388696
# https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-concurrency-to-cancel-any-in-progress-job-or-run
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
# Job graph (via `needs`):
#   linting -> {check_docs, unit_tests} -> local_integration_tests
#           -> launch-slurm-actions-runner -> slurm_integration_tests
#           -> upload-coverage-codecov
jobs:
  # Runs the pre-commit hooks on all files. Every other job depends on this
  # one, directly or transitively.
  linting:
    name: Run linting/pre-commit checks
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v4
      # v5 runs on the Node 20 runtime (v4 used the deprecated Node 16 one).
      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - run: pip install 'pre-commit<4.0.0'
      - run: pre-commit --version
      - run: pre-commit install
      - run: pre-commit run --all-files --show-diff-on-failure

  # Builds the documentation with mkdocs in strict mode.
  check_docs:
    needs: [linting]
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install the latest version of uv
        uses: astral-sh/setup-uv@v3
        with:
          version: "latest"
          enable-cache: true
          # https://github.com/astral-sh/setup-uv?tab=readme-ov-file#github-authentication-token
          github-token: ${{ secrets.GITHUB_TOKEN }}
          cache-suffix: "3.10"
      - name: Install dependencies
        run: uv sync --frozen --extra docs
      - name: Build the documentation (strict mode)
        run: uv run mkdocs build --strict

  # Runs the test suite on GitHub-hosted runners for each platform / Python
  # version in the matrix, and stores the coverage report as an artifact.
  unit_tests:
    needs: [linting]
    runs-on: ${{ matrix.platform }}
    strategy:
      max-parallel: 4
      matrix:
        platform: [ubuntu-latest, macos-latest]
        python-version: ["3.10"]
    steps:
      - uses: actions/checkout@v4
      - name: Install the latest version of uv
        uses: astral-sh/setup-uv@v3
        with:
          version: "latest"
          enable-cache: true
          # https://github.com/astral-sh/setup-uv?tab=readme-ov-file#github-authentication-token
          github-token: ${{ secrets.GITHUB_TOKEN }}
          cache-suffix: ${{ matrix.python-version }}
      - name: Pin python-version ${{ matrix.python-version }}
        run: uv python pin ${{ matrix.python-version }}
      - name: Install dependencies
        run: uv sync --frozen
      - name: Test with pytest
        env:
          # Restrict JAX to the CPU platform on the hosted runners.
          JAX_PLATFORMS: cpu
        run: uv run pytest -v --cov=project --cov-report=xml --cov-append --skip-if-files-missing
      - name: Store coverage report as an artifact
        uses: actions/upload-artifact@v4
        with:
          name: coverage-reports-unit-tests-${{ matrix.platform }}-${{ matrix.python-version }}
          path: ./coverage.xml

  # Runs the regular and the slow tests on the self-hosted runner, with all
  # extras installed.
  local_integration_tests:
    needs: [unit_tests, check_docs]
    runs-on: self-hosted
    timeout-minutes: 20
    strategy:
      max-parallel: 1
      matrix:
        python-version: ["3.10"]
    steps:
      - uses: actions/checkout@v4
      - name: Install the latest version of uv
        uses: astral-sh/setup-uv@v3
        with:
          version: "latest"
          enable-cache: false # no need, uses the local uv cache.
          # https://github.com/astral-sh/setup-uv?tab=readme-ov-file#github-authentication-token
          github-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Pin python-version ${{ matrix.python-version }}
        run: uv python pin ${{ matrix.python-version }}
      - name: Install dependencies
        run: uv sync --all-extras --frozen
      - name: Test with pytest
        run: uv run pytest -v --cov=project --cov-report=xml --cov-append
      - name: Test with pytest (only slow tests)
        run: uv run pytest -v -m slow --slow --cov=project --cov-report=xml --cov-append
      - name: Store coverage report as an artifact
        uses: actions/upload-artifact@v4
        with:
          name: coverage-reports-integration-tests-${{ matrix.python-version }}
          path: ./coverage.xml

  # Copies the runner job script to the cluster over scp and submits it with
  # sbatch over ssh. The Slurm job ID is exposed as the `job_id` job output.
  launch-slurm-actions-runner:
    needs: [local_integration_tests]
    runs-on: self-hosted
    timeout-minutes: 5
    strategy:
      max-parallel: 5
      matrix:
        cluster: ["mila"] # , 'narval', 'beluga']
    outputs:
      # Must match the key written to $GITHUB_OUTPUT by the `sbatch` step below
      # (`job_id`); that step never sets an output named `stdout`.
      job_id: ${{ steps.sbatch.outputs.job_id }}
    steps:
      - uses: actions/checkout@v4
      - name: Copy job script to the cluster
        run: "scp .github/actions-runner-job.sh ${{ matrix.cluster }}:actions-runner-job.sh"
      - name: Launch Slurm Actions Runner
        id: sbatch
        # TODO: for DRAC clusters, the account needs to be set somehow (and obviously not be hard-coded here).
        # Write the job ID to the step outputs so that the job-level `outputs` can expose it.
        # NOTE: Could also use the --wait flag to wait for the job to finish (and have this run at the same time as the other step).
        run: |
          job_id="$(ssh ${{ matrix.cluster }} 'cd $SCRATCH && sbatch --parsable $HOME/actions-runner-job.sh')"
          echo "Submitted job $job_id on the ${{ matrix.cluster }} cluster!"
          echo "job_id=$job_id" >> "$GITHUB_OUTPUT"

  # This job runs in a self-hosted GitHub Actions runner inside a SLURM job on the compute node of the cluster.
  slurm_integration_tests:
    name: Run integration tests on the ${{ matrix.cluster }} cluster in job ${{ needs.launch-slurm-actions-runner.outputs.job_id}}
    needs: [launch-slurm-actions-runner]
    runs-on: ${{ matrix.cluster }}
    timeout-minutes: 20
    strategy:
      max-parallel: 5
      matrix:
        # TODO: this should be tied to the same setting in the `launch-slurm-actions-runner` job.
        # cluster: ${{ needs.launch-slurm-actions-runner.strategy.matrix.cluster }}
        cluster: ["mila"]
    steps:
      - uses: actions/checkout@v4
      - name: Set up the repo using the setup script
        run: scripts/mila_setup.sh
      - name: Install dependencies
        run: uv sync --all-extras --frozen
      - name: Show installed packages
        run: uv pip list
      - name: Test with pytest
        run: uv run pytest -v --cov=project --cov-report=xml --cov-append
      # TODO: Disabling full regression tests on the cluster for now, because the worker is often
      # interrupted and we want to avoid using the unkillable partition to not interrupt other's
      # work. The full regression tests are still run on the local_integration_tests job.
      # - name: Test with pytest (only slow tests)
      #   run: uv run pytest -v -m slow --slow --cov=project --cov-report=xml --cov-append
      - name: Store coverage report as an artifact
        uses: actions/upload-artifact@v4
        with:
          name: coverage-reports-slurm-integration-tests-${{ matrix.cluster }}
          path: ./coverage.xml

  # Downloads the `coverage-reports-*` artifacts and uploads them to Codecov from a separate job.
  # https://about.codecov.io/blog/uploading-code-coverage-in-a-separate-job-on-github-actions/
  upload-coverage-codecov:
    needs: [local_integration_tests, slurm_integration_tests]
    runs-on: ubuntu-latest
    name: Upload coverage reports to Codecov
    timeout-minutes: 5
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: coverage-reports-*
          merge-multiple: false
          # download all the artifacts in this directory (each coverage.xml will be in a subdirectory)
          # Next step if this doesn't work would be to give the coverage files a unique name and use merge-multiple: true
          path: coverage_reports
      - name: Upload coverage reports to Codecov
        uses: codecov/codecov-action@v4
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          directory: coverage_reports
          fail_ci_if_error: true