diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 83e6846..345514a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -7,8 +7,10 @@ on:
   push:
     branches:
       - main
+
 jobs:
-  test:
+  # Tests ci-storage tool itself.
+  ci-storage-test:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
@@ -32,3 +34,38 @@ jobs:
           set -e
           ls -la ~/ci-storage/dimikot/ci-storage
           [ "$(cat dummy.txt)" = "dummy" ] || { echo "dummy.txt was not restored"; exit 1; }
+
+  # Builds and boots a self-hosted runner inside GitHub's infra. Once it's
+  # settled, there is a container with one self-hosted runner running and
+  # waiting for jobs with "ci-storage-test" tag to pick up (based on
+  # docker/Dockerfile image).
+  self-hosted-runner-boot-docker:
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    permissions:
+      actions: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+      - name: Build Docker image
+        run: cd docker && docker-compose build
+      - name: Start Docker container
+        run: cd docker && docker-compose up
+        env:
+          GH_REPOSITORY: ${{ github.repository }}
+          GH_LABELS: ci-storage-test
+          GH_TOKEN: ${{ secrets.CI_PAT }}
+
+  # The test job with ci-storage-test tag which is initially queued, but then is
+  # picked up by the self-hosted runner booted in the previous job. In the end,
+  # the test job sends SIGINT to the container entrypoint.sh PID, so the
+  # container (based on docker/Dockerfile image) shuts down gracefully.
+  self-hosted-runner-spawn-job-test:
+    runs-on: ["self-hosted", "ci-storage-test"]
+    steps:
+      - name: Run Hello World job and then terminate run.sh
+        run: |
+          set -e -o xtrace
+          echo "Hello, world!"
+          cd /home/ubuntu/actions-runner
+          kill -SIGINT $(cat runner.pid)
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 0000000..4272ec5
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,48 @@
+FROM ubuntu:22.04
+
+ARG RUNNER_VERSION=2.314.1
+
+ENV GH_REPOSITORY=""
+ENV GH_LABELS=""
+ENV GH_TOKEN=""
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN true \
+  && apt-get update -y \
+  && apt-get upgrade -y \
+  && apt-get install -y --no-install-recommends \
+    awscli jq gh \
+    mc gcc git curl wget pv psmisc unzip vim nano telnet net-tools bash-completion \
+    libssl-dev apt-transport-https build-essential ca-certificates locales pkg-config
+
+RUN true \
+  && useradd -m ubuntu
+
+USER ubuntu
+RUN true \
+  && mkdir /home/ubuntu/actions-runner \
+  && cd /home/ubuntu/actions-runner \
+  && arch=$(dpkg --print-architecture) \
+  && case "$arch" in \
+    x86_64|amd64) arch=linux-x64 ;; \
+    aarch64|arm64) arch=linux-arm64 ;; \
+    *) echo >&2 "unsupported architecture: $arch"; exit 1 ;; \
+  esac \
+  && curl --no-progress-meter -L https://github.com/actions/runner/releases/download/v$RUNNER_VERSION/actions-runner-$arch-$RUNNER_VERSION.tar.gz | tar xz
+
+USER root
+# Run the installdependencies.sh script shipped with the runner, then prune apt
+# leftovers. Every step is chained with && so that it runs as its own command
+# rather than being passed to the script as arguments.
+RUN /home/ubuntu/actions-runner/bin/installdependencies.sh \
+  && apt-get autoremove -y \
+  && apt-get clean \
+  && apt-get autoclean \
+  && rm -rf /var/lib/apt/lists/*
+
+USER ubuntu
+COPY --chmod=755 --chown=ubuntu:ubuntu entrypoint.sh /home/ubuntu
+
+WORKDIR /home/ubuntu
+ENTRYPOINT ["./entrypoint.sh"]
+CMD ["./run.sh"]
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
new file mode 100644
index 0000000..51d0616
--- /dev/null
+++ b/docker/docker-compose.yml
@@ -0,0 +1,10 @@
+version: "3.4"
+services:
+  ci-storage:
+    build:
+      context: .
+      dockerfile: ./Dockerfile
+    environment:
+      - GH_REPOSITORY
+      - GH_LABELS
+      - GH_TOKEN
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
new file mode 100644
index 0000000..25d0e15
--- /dev/null
+++ b/docker/entrypoint.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+#
+# Here we make an opinionated decision to NOT use ephemeral or jit action
+# runners. Reasons:
+# - We WANT to reuse the work directory across job runs, that's the whole point
+#   of ci-storage architecture and its speedup benefits. So once the runner
+#   finishes some job, we do NOT want it to terminate (as it does in ephemeral
+#   or jit mode), we want it to CONTINUE listening for more jobs to run.
+# - GitHub doesn't allow to remove busy runners via API, which is very good for
+#   us: in case the container shuts down externally due to downscaling, we just
+#   enter the graceful retry loop to delete the corresponding runner via API.
+# - One downside happens when a runner container dies unexpectedly (rare). In
+#   this case, regular "offline" long-living runners are auto-removed by GitHub
+#   itself once in 2 weeks, whilst ephemeral (or jit) "offline" runners are
+#   auto-removed in 1 day. But we anyways need to implement some manual removal
+#   cycle externally, since even 1 day is way too much for garbage accumulation.
+#
+set -u -e -o xtrace
+
+: $GH_REPOSITORY # {owner}/{repo}
+: $GH_LABELS
+: $GH_TOKEN # used by gh cli
+
+cd ./actions-runner
+
+name="ci-storage-$(hostname)"
+
+token=$(gh api -X POST --jq .token "repos/$GH_REPOSITORY/actions/runners/registration-token")
+./config.sh \
+  --unattended \
+  --url https://github.com/$GH_REPOSITORY \
+  --token "$token" \
+  --name "$name" \
+  --labels "$GH_LABELS"
+
+cleanup() {
+  # Retry deleting the runner until it succeeds.
+  # - Busy runner fails in deletion, so we can retry safely until it becomes
+  #   idle and is successfully deleted.
+  # - The external orchestrator will eventually kill the container after a large
+  #   timeout (say, 15 minutes or so) needed for a running job to finish.
+  # - Both steps are chained with && so that a failed remove-token request is
+  #   retried as well instead of aborting the script through `set -e`.
+  while :; do
+    token=$(gh api -X POST --jq .token "repos/$GH_REPOSITORY/actions/runners/remove-token") \
+      && ./config.sh remove --token "$token" \
+      && break
+    sleep 5
+    : "Retrying deletion till the runner becomes idle and succeeds..."
+  done
+}
+
+trap "cleanup; exit 130" INT
+trap "cleanup; exit 143" TERM
+
+echo $$ > runner.pid
+
+eval "$@" & wait $!