diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d8e52fe..e2d3962 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,20 +48,19 @@ jobs: uses: actions/checkout@v2 - name: Start test Docker containers run: | - ssh-keygen -t ed25519 -qf /tmp/key -N "" - export CI_STORAGE_HOST_SSH_KEY="$(cat /tmp/key)" set -o xtrace cd docker - # Boot ci-storage-host container. It will have a default empty slot. + # Boot ci-storage-host container in background. docker compose up ci-storage-host -d --build # Now boot self-hosted-runner container. It will connect to - # ci-storage-host container and load the empty slot from there, then - # register a GitHub self-hosted runner and remain waiting for jobs. + # ci-storage-host container and load a test (non-existent) ci-storage + # slot from there, then register a GitHub self-hosted runner and + # remain waiting for jobs. docker compose up self-hosted-runner --build env: GH_REPOSITORY: ${{ github.repository }} GH_LABELS: ci-storage-test - GH_TOKEN: ${{ secrets.CI_PAT }} + GH_TOKEN: ${{ secrets.CI_PAT }} # The test job with ci-storage-test tag which is initially queued, but then is # picked up by the self-hosted-runner container booted in the previous job. 
In @@ -80,6 +79,4 @@ jobs: action: "store" storage-host: "ci-storage-host" - name: Kill self-hosted runner container - run: | - cd /home/ubuntu/actions-runner - kill -SIGINT $(cat runner.pid) + run: kill -SIGINT $(cat ~user/entrypoint.pid) diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..cc36e3e --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,10 @@ +{ + "recommendations": [ + "GitHub.vscode-github-actions", + "jeff-hykin.better-dockerfile-syntax", + "mads-hartmann.bash-ide-vscode", + "ms-python.black-formatter", + "ms-python.python", + "timonwong.shellcheck", + ] +} diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..7076e3d --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,20 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "git grok: push local commits as individual PRs", + "detail": "Install git-grok first: https://github.com/dimikot/git-grok", + "type": "shell", + "command": "git grok", + "problemMatcher": [], + "hide": false + }, + { + "label": "git rebase --interactive", + "detail": "Opens a UI for interactive rebase (install \"Git rebase shortcuts\" extension).", + "type": "shell", + "command": "GIT_EDITOR=\"code --wait\" git rebase -i", + "problemMatcher": [] + } + ] +} diff --git a/action.yml b/action.yml index 3353d8f..5ebf468 100644 --- a/action.yml +++ b/action.yml @@ -17,7 +17,7 @@ inputs: description: "Remove slots created earlier than this many seconds ago. If not set, uses the ci-storage tool default 4 hours." required: false slot-id: - description: 'Id of the slot to store to or load from; use "*" to load a random most recent slot. If empty, uses "$GITHUB_RUN_ID-$GITHUB_RUN_ATTEMPT" value.' + description: 'Id of the slot to store to or load from; use "*" to load a random most recent slot; use "?" to load a random most recent slot and skip if it does not exist. If empty, uses "$GITHUB_RUN_ID-$GITHUB_RUN_ATTEMPT" value.'
required: false local-dir: description: 'Local directory path to store from or load to. If not set, uses "." (the current work directory).' @@ -27,7 +27,6 @@ inputs: required: false verbose: description: "If set, prints the list of transferred files." - type: boolean required: false runs: using: "composite" diff --git a/ci-storage b/ci-storage index 94b92d2..d90de52 100755 --- a/ci-storage +++ b/ci-storage @@ -74,7 +74,7 @@ def main(): "--slot-id", type=str, required=True, - help='id of the slot to store to or load from; use "*" to load a random most recent slot', + help='id of the slot to store to or load from; use "*" to load a random most recent slot; use "?" to load a random most recent slot and skip if it does not exist', ) parser.add_argument( "--local-dir", @@ -162,8 +162,8 @@ def action_store( exclude: list[str], verbose: bool, ): - if slot_id == "*": - raise UserException('slot_id="*" is not allowed for "store" action') + if slot_id == "*" or slot_id == "?": + raise UserException(f'slot_id="{slot_id}" is not allowed for "store" action') slot_id = normalize_slot_id(slot_id) slot_ids_and_ages = list_slots(storage_host=storage_host, storage_dir=storage_dir) slot_id_recent = slot_ids_and_ages[0][0] if len(slot_ids_and_ages) else None @@ -208,14 +208,18 @@ def action_load( exclude: list[str], verbose: bool, ): - if slot_id == "*": + if slot_id == "*" or slot_id == "?": slot_ids_and_ages = list_slots( storage_host=storage_host, storage_dir=storage_dir ) if len(slot_ids_and_ages) == 0: - raise UserException( - 'to use slot_id="*", there must be at least one slot in the storage.' - ) + if slot_id == "?": + print(f'No slots found, and slot-id="{slot_id}", so skipping.') + return + else: + raise UserException( + f'to use slot-id="{slot_id}", there must be at least one slot in the storage.' 
+ ) slot_id = slot_ids_and_ages[0][0] else: slot_id = normalize_slot_id(slot_id) diff --git a/docker/.env b/docker/.env new file mode 100644 index 0000000..31a49e7 --- /dev/null +++ b/docker/.env @@ -0,0 +1,11 @@ +# This key (as well as docker-compose.yml) is only used in tests, so it's safe +# to have it here. +CI_STORAGE_HOST_PRIVATE_KEY_TEST_ONLY="-----BEGIN OPENSSH PRIVATE KEY----- +b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW +QyNTUxOQAAACBZgbuAsWHfeYshNJacifV30KxJVFKr4/B4WnvxO8x2jAAAAKgm2KyUJtis +lAAAAAtzc2gtZWQyNTUxOQAAACBZgbuAsWHfeYshNJacifV30KxJVFKr4/B4WnvxO8x2jA +AAAECRcPB4jRqJEgNBvFPA6+k5HPT5/ZbXnD2KUyE+oJFfA1mBu4CxYd95iyE0lpyJ9XfQ +rElUUqvj8Hhae/E7zHaMAAAAHmRtaXRyeUBEbWl0cnktTWFjQm9vay1NMS5sb2NhbAECAw +QFBgc= +-----END OPENSSH PRIVATE KEY-----" +CI_STORAGE_HOST_PUBLIC_KEY_TEST_ONLY="ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFmBu4CxYd95iyE0lpyJ9XfQrElUUqvj8Hhae/E7zHaM" diff --git a/docker/ci-storage-host/Dockerfile b/docker/ci-storage-host/Dockerfile index e57c28f..0008929 100644 --- a/docker/ci-storage-host/Dockerfile +++ b/docker/ci-storage-host/Dockerfile @@ -2,22 +2,19 @@ ARG BASE_IMAGE="ubuntu:22.04" FROM $BASE_IMAGE -ENV GH_REPOSITORY="" -ENV CI_STORAGE_HOST_SSH_KEY="" +ENV CI_STORAGE_HOST_PUBLIC_KEY="" ENV DEBIAN_FRONTEND=noninteractive RUN true \ && apt-get update -y \ - && apt-get upgrade -y \ && apt-get install -y --no-install-recommends \ - awscli rsync openssh-server \ - mc gcc git curl wget pv psmisc unzip vim nano telnet net-tools bash-completion \ - libssl-dev apt-transport-https build-essential ca-certificates locales pkg-config \ + openssh-server \ + jq gh rsync python3 mc git curl wget pv psmisc unzip vim nano telnet net-tools apt-transport-https ca-certificates locales \ && sed -i -e "s|#PermitRootLogin.*|PermitRootLogin no|" /etc/ssh/sshd_config \ - && useradd -m ubuntu \ - && mkdir -p /home/ubuntu/.ssh \ - && chown -R ubuntu:ubuntu /home/ubuntu/.ssh \ - && chmod 700 /home/ubuntu/.ssh + && useradd -m user \ + && 
mkdir -p ~user/.ssh ~user/ci-storage \ && chown -R user:user ~user \ && chmod 700 ~user/.ssh COPY --chmod=755 entrypoint.sh / diff --git a/docker/ci-storage-host/README.md b/docker/ci-storage-host/README.md new file mode 100644 index 0000000..8cfd48b --- /dev/null +++ b/docker/ci-storage-host/README.md @@ -0,0 +1,6 @@ +# A Simple Container with SSH Server + +Build an image from this Dockerfile to launch a simple SSH server with rsync. + +- Pre-creates /home/user/ci-storage directory. +- Copies public key in CI_STORAGE_HOST_PUBLIC_KEY to user's authorized_keys. diff --git a/docker/ci-storage-host/entrypoint.sh b/docker/ci-storage-host/entrypoint.sh index f36fb14..11f7cec 100644 --- a/docker/ci-storage-host/entrypoint.sh +++ b/docker/ci-storage-host/entrypoint.sh @@ -1,28 +1,20 @@ #!/bin/bash # -# A container which holds ci-storage saved slots. Its ~ubuntu/ci-storage should -# be persistent across container restarts. +# A container which holds ci-storage saved slots. Its ~user/ci-storage should be +# persistent across container restarts (e.g. point to an AWS EBS volume). # set -u -e -if [ "${CI_STORAGE_HOST_SSH_KEY:-}" = "" ]; then - echo "CI_STORAGE_HOST_SSH_KEY is not set, exiting..." +if [[ "${CI_STORAGE_HOST_PUBLIC_KEY:=}" == "" ]]; then + echo "CI_STORAGE_HOST_PUBLIC_KEY must be set to a valid SSH public key." exit 1 fi -cd /home/ubuntu +authorized_keys=~user/.ssh/authorized_keys -echo "$CI_STORAGE_HOST_SSH_KEY" > .ssh/id_ed25519 -chmod 600 .ssh/id_ed25519 -ssh-keygen -f .ssh/id_ed25519 -y > .ssh/authorized_keys -chown -R ubuntu:ubuntu .ssh - -# This code is for simplifying the CI tests and allow self-hosted-runner to boot -# in docker-compose. In real world, the 1st slot created should contain the real -# files (e.g. a cloned git repo). -if [ ! -e ci-storage -a "${GH_REPOSITORY:-}" != "" ]; then - mkdir -p ci-storage/$GH_REPOSITORY/initial - chown -R ubuntu:ubuntu ci-storage +if [[ ! -f $authorized_keys ]] || !
grep -qF "$CI_STORAGE_HOST_PUBLIC_KEY" $authorized_keys; then + echo "$CI_STORAGE_HOST_PUBLIC_KEY" >> $authorized_keys + chown user:user $authorized_keys fi mkdir -p /var/run/sshd diff --git a/docker/compose-up.sh b/docker/compose-up.sh new file mode 100755 index 0000000..0df0c56 --- /dev/null +++ b/docker/compose-up.sh @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +echo "Booting containers on the local laptop for debugging purposes..." + +GH_REPOSITORY=$(gh repo view --json owner,name -q '.owner.login + "/" + .name') GH_TOKEN=$(gh auth token) docker compose up --build "$@" diff --git a/docker/compose.yml b/docker/compose.yml new file mode 100644 index 0000000..5cd42b2 --- /dev/null +++ b/docker/compose.yml @@ -0,0 +1,30 @@ +version: "3.4" +services: + ci-storage-host: + build: + context: ci-storage-host + dockerfile: Dockerfile + healthcheck: + test: ["CMD", "bash", "-c", "netstat -ltn | grep -c :22"] + interval: 1s + timeout: 3s + retries: 10 + ports: + - "10022:22" + environment: + - CI_STORAGE_HOST_PUBLIC_KEY=${CI_STORAGE_HOST_PUBLIC_KEY_TEST_ONLY?} + self-hosted-runner: + build: + context: self-hosted-runner + additional_contexts: + root: .. + dockerfile: Dockerfile + depends_on: + ci-storage-host: + condition: service_healthy + environment: + - GH_REPOSITORY=${GH_REPOSITORY:-dimikot/ci-storage} + - GH_LABELS=${GH_LABELS:-ci-storage} + - GH_TOKEN + - CI_STORAGE_HOST=ci-storage-host + - CI_STORAGE_HOST_PRIVATE_KEY=${CI_STORAGE_HOST_PRIVATE_KEY_TEST_ONLY?} diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml deleted file mode 100644 index a402b64..0000000 --- a/docker/docker-compose.yml +++ /dev/null @@ -1,23 +0,0 @@ -version: "3.4" -services: - ci-storage-host: - build: - context: ci-storage-host - dockerfile: Dockerfile - ports: - - 10022:22 - environment: - - GH_REPOSITORY - - CI_STORAGE_HOST_SSH_KEY - self-hosted-runner: - build: - context: self-hosted-runner - additional_contexts: - root: ..
- dockerfile: Dockerfile - environment: - - GH_REPOSITORY - - GH_LABELS - - GH_TOKEN - - CI_STORAGE_HOST=ci-storage-host - - CI_STORAGE_HOST_SSH_KEY diff --git a/docker/self-hosted-runner/Dockerfile b/docker/self-hosted-runner/Dockerfile index 9dc5f07..140c1fe 100644 --- a/docker/self-hosted-runner/Dockerfile +++ b/docker/self-hosted-runner/Dockerfile @@ -1,5 +1,4 @@ ARG BASE_IMAGE="ubuntu:22.04" - FROM $BASE_IMAGE ARG RUNNER_VERSION="2.314.1" @@ -8,24 +7,22 @@ ENV GH_REPOSITORY="" ENV GH_LABELS="" ENV GH_TOKEN="" ENV CI_STORAGE_HOST="" -ENV CI_STORAGE_HOST_SSH_KEY="" +ENV CI_STORAGE_HOST_PRIVATE_KEY="" ENV DEBIAN_FRONTEND=noninteractive RUN true \ && apt-get update -y \ - && apt-get upgrade -y \ && apt-get install -y --no-install-recommends \ - awscli jq gh rsync openssh-client \ - mc gcc git curl wget pv psmisc unzip vim nano telnet net-tools bash-completion \ - libssl-dev apt-transport-https build-essential ca-certificates locales pkg-config \ - && useradd -m ubuntu + openssh-client \ + jq gh rsync python3 mc git curl wget pv psmisc unzip vim nano telnet net-tools apt-transport-https ca-certificates locales \ + && useradd -m user -USER ubuntu +USER user RUN true \ - && mkdir -p /home/ubuntu/.ssh \ - && chmod 700 /home/ubuntu/.ssh \ - && mkdir /home/ubuntu/actions-runner \ - && cd /home/ubuntu/actions-runner \ + && mkdir -p ~user/.ssh \ + && chmod 700 ~user/.ssh \ + && mkdir ~user/actions-runner \ + && cd ~user/actions-runner \ && arch=$(dpkg --print-architecture) \ && case "$arch" in \ x86_64|amd64) arch=linux-x64 ;; \ @@ -35,19 +32,19 @@ RUN true \ && curl --no-progress-meter -L https://github.com/actions/runner/releases/download/v$RUNNER_VERSION/actions-runner-$arch-$RUNNER_VERSION.tar.gz | tar xz USER root -RUN /home/ubuntu/actions-runner/bin/installdependencies.sh \ - apt-get autoremove \ +RUN ~user/actions-runner/bin/installdependencies.sh \ + && apt-get autoremove -y \ && apt-get clean \ && apt-get autoclean \ && rm -rf /var/lib/apt/lists/* -USER ubuntu -COPY
--chmod=755 --chown=ubuntu:ubuntu entrypoint.sh /home/ubuntu +USER user +COPY --chmod=755 --chown=user:user entrypoint.sh /home/user COPY --chmod=755 --from=root ci-storage /usr/bin/ci-storage -WORKDIR /home/ubuntu +WORKDIR /home/user ENTRYPOINT ["./entrypoint.sh"] -# If overridden in the derived image, evals this as "ubuntu" user as a shell -# script after config.sh, but before run.sh. +# If overridden in the derived image, evals this as a shell script after +# config.sh, but before run.sh. CMD [] diff --git a/docker/self-hosted-runner/entrypoint.sh b/docker/self-hosted-runner/entrypoint.sh index ddd5ee5..7554aa1 100644 --- a/docker/self-hosted-runner/entrypoint.sh +++ b/docker/self-hosted-runner/entrypoint.sh @@ -15,30 +15,54 @@ # auto-removed in 1 day. But we anyways need to implement some manual removal # cycle exernally, since even 1 day is way too much for garbage accumulation. # -set -u -e -o xtrace +set -u -e -: $GH_REPOSITORY # {owner}/{repo} -: $GH_LABELS -: $GH_TOKEN # used by gh cli - -cd ./actions-runner +if [[ "${GH_REPOSITORY:=}" != */* ]]; then + echo "GH_REPOSITORY must be set, and the format should be {owner}/{repo}."; + exit 1; +fi +if [[ "${GH_LABELS:=}" == "" ]]; then + echo "GH_LABELS must be set."; + exit 1; +fi +if [[ "${GH_TOKEN:=}" == "" ]]; then + echo "GH_TOKEN must be set."; + exit 1; +fi +if [[ "${CI_STORAGE_HOST:=}" != "" && ! "$CI_STORAGE_HOST" =~ ^([-.[:alnum:]]+@)?[-.[:alnum:]]+$ ]]; then + echo "If CI_STORAGE_HOST is passed, it must be in form of {hostname} or {user}@{hostname}."; + exit 1; +fi +if [[ "${CI_STORAGE_HOST_PRIVATE_KEY:=}" != "" && "$CI_STORAGE_HOST_PRIVATE_KEY" != *OPENSSH\ PRIVATE\ KEY* ]]; then + echo "If CI_STORAGE_HOST_PRIVATE_KEY is passed, it must be an SSH private key."; + exit 1; +fi -name="ci-storage-$(hostname)" -repo_name="${GH_REPOSITORY##*/}" -local_dir=_work/$repo_name/$repo_name +if [[ "$(whoami)" != user || !
-d ./actions-runner ]]; then + echo 'This script must be run as "user" user, and ./actions-runner/ should exist.'; + exit 1; +fi -set +o xtrace -if [ "${CI_STORAGE_HOST_SSH_KEY:-}" != "" ]; then - echo "$CI_STORAGE_HOST_SSH_KEY" > ~/.ssh/id_ed25519 +if [[ "$CI_STORAGE_HOST_PRIVATE_KEY" != "" ]]; then + echo "$CI_STORAGE_HOST_PRIVATE_KEY" > ~/.ssh/id_ed25519 chmod 600 ~/.ssh/id_ed25519 fi -set -o xtrace -if [ "${CI_STORAGE_HOST:-}" != "" ]; then +echo $$ > entrypoint.pid +cd ./actions-runner + +name="ci-storage-$(hostname)" +local_dir=_work/${GH_REPOSITORY##*/}/${GH_REPOSITORY##*/} + +if [[ "$CI_STORAGE_HOST" != "" ]]; then - ssh-keyscan -H "$CI_STORAGE_HOST" >> ~/.ssh/known_hosts + ssh-keyscan -H "${CI_STORAGE_HOST##*@}" >> ~/.ssh/known_hosts chmod 600 ~/.ssh/known_hosts - mkdir -p $local_dir - ci-storage --storage-host="$CI_STORAGE_HOST" --storage-dir="~/ci-storage/$GH_REPOSITORY" --slot-id="*" --local-dir="$local_dir" load + mkdir -p "$local_dir" + ci-storage load \ + --storage-host="$CI_STORAGE_HOST" \ + --storage-dir="~/ci-storage/$GH_REPOSITORY" \ + --slot-id="?" \ + --local-dir="$local_dir" fi token=$(gh api -X POST --jq .token "repos/$GH_REPOSITORY/actions/runners/registration-token") @@ -53,21 +77,20 @@ cleanup() { # Retry deleting the runner until it succeeds. # - Busy runner fails in deletion, so we can retry safely until it becomes # idle and is successfully deleted. - # - The extrnal orchestrator will eventually kill the container after a large - # timeout (say, 15 minutes or so) needed for a running job to finish. + # - In case we can't delete the runner for a long time still, the external + # orchestrator will eventually kill the container after a large timeout + # (say, 15 minutes or so) needed for a running job to finish. while :; do token=$(gh api -X POST --jq .token "repos/$GH_REPOSITORY/actions/runners/remove-token") ./config.sh remove --token "$token" && break sleep 5 - : "Retrying deletion till the runner becomes idle and succeeds..." + echo "Retrying deletion till the runner becomes idle and succeeds..."
done } trap "cleanup; exit 130" INT trap "cleanup; exit 143" TERM -echo $$ > runner.pid - -eval "$@" +"$@" ./run.sh & wait $!