# Captured from the GitHub Actions run page for:
#   "script: error handling + send finish call" #3332
# Workflow file for this run:

# Workflow-level settings: trigger, concurrency policy, and the environment
# variables shared by every job below.
name: tests

# Run on pushes to any branch; tag pushes never trigger this workflow.
on:
  push:
    branches: ['**']
    tags-ignore: ['**']

# One concurrency group per workflow + ref.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  # don't cancel on main/master/default
  cancel-in-progress: ${{ format('refs/heads/{0}', github.event.repository.default_branch) != github.ref }}

env:
  PY_COLORS: '1'
  BOT_NAME: wipacdevbot
  # NOTE(review): the value below looks like e-mail-obfuscation residue from
  # the page capture, not a real address — restore the actual bot e-mail.
  BOT_EMAIL: [email protected]
  #
  CI_DOCKER_IMAGE_TAG: icecube/skymap_scanner:local
  #
  CI_TEST_RUN_STDOUT_STDERR_DIR: /home/runner/work/skymap_scanner/testrun_outputs
  N_WORKERS: 2
  REALTIME_EVENTS_DIR: /home/runner/work/skymap_scanner/skymap_scanner/tests/data/realtime_events
  CI_SKYSCAN_CACHE_DIR: /home/runner/work/skymap_scanner/skymap_scanner/cache
  CI_SKYSCAN_OUTPUT_DIR: /home/runner/work/skymap_scanner/skymap_scanner/output
  CI_SKYSCAN_DEBUG_DIR: /home/runner/work/skymap_scanner/skymap_scanner/debug
  # see source tests/env-vars.sh
jobs:
#############################################################################
# PACKAGING & LINTERS
#############################################################################
py-versions:
  # Exposes the `matrix` output of the `versions` step as a job output so that
  # other jobs can build their strategy matrix from it.
  outputs:
    matrix: ${{ steps.versions.outputs.matrix }}
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
    # NOTE(review): the action reference below looks mangled by e-mail
    # obfuscation in the page capture — restore the real `name@version`.
    - uses: WIPACrepo/[email protected]
      id: versions
# flake8:
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v4
# - uses: actions/setup-python@v5
# - uses: WIPACrepo/[email protected]
mypy:
  # Runs once per entry of the matrix published by the `py-versions` job.
  runs-on: ubuntu-latest
  needs:
    - py-versions
  strategy:
    # let every matrix entry finish even if one of them fails
    fail-fast: false
    matrix:
      py3: ${{ fromJSON(needs.py-versions.outputs.matrix) }}
  steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.py3 }}
    # NOTE(review): the action reference below looks mangled by e-mail
    # obfuscation in the page capture — restore the real `name@version`.
    - uses: WIPACrepo/[email protected]
py-setup:
  # Checks the repository out with the personal access token and runs the
  # WIPAC setup action, committing as the bot identity from the workflow env.
  runs-on: ubuntu-latest
  steps:
    - name: checkout
      uses: actions/checkout@v4
      with:
        token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
    # NOTE(review): the action reference below looks mangled by e-mail
    # obfuscation in the page capture — restore the real `name@version`.
    - uses: WIPACrepo/[email protected]
      # no auto-updates for bots # should match all 'git_committer_name' uses
      #
      # The condition must be ONE expression. Mixing bare text with an embedded
      # `${{ ... }}` is string interpolation: the result is a non-empty string,
      # which is always truthy, so the bot would never be skipped.
      if: github.actor != env.BOT_NAME
      with:
        git_committer_name: ${{ env.BOT_NAME }}
        git_committer_email: ${{ env.BOT_EMAIL }}
        base-keywords: IceCube
py-dependencies:
  # Frees runner disk space, checks the repository out with the personal
  # access token, then runs the WIPAC dependencies action as the bot identity.
  runs-on: ubuntu-latest
  steps:
    - uses: jlumbroso/free-disk-space@main  # need space for images to build
      with:
        docker-images: false
    - name: checkout
      uses: actions/checkout@v4
      with:
        token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
    # NOTE(review): the action reference below looks mangled by e-mail
    # obfuscation in the page capture — restore the real `name@version`.
    - uses: WIPACrepo/[email protected]
      # no auto-updates for bots # should match all 'git_committer_name' uses
      #
      # The condition must be ONE expression. Mixing bare text with an embedded
      # `${{ ... }}` is string interpolation: the result is a non-empty string,
      # which is always truthy, so the bot would never be skipped.
      if: github.actor != env.BOT_NAME
      with:
        git_committer_name: ${{ env.BOT_NAME }}
        git_committer_email: ${{ env.BOT_EMAIL }}
        dockerfile_nametags: Dockerfile:icecube/skymap_scanner:latest
#############################################################################
# TESTS
#############################################################################
test-build-docker:
  # Builds the image from ./Dockerfile and loads it into the local docker
  # daemon under the CI tag.
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
    - uses: docker/setup-buildx-action@v3
    - uses: docker/build-push-action@v6
      with:
        file: ./Dockerfile
        context: .
        load: true
        tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
test-run-singularity-dummy-reco:
  # Builds the docker image, converts it to a .sif image, and runs a
  # dummy-reco scan through local-scan.sh against a RabbitMQ service container.
  runs-on: ubuntu-latest
  env:
    SKYSCAN_MQ_TOCLIENT_BROKER_ADDRESS: user1:password@localhost/test
    SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRESS: user1:password@localhost/test
    # ..._AUTH_TOKEN: user1  # using this would override password in address
    _EWMS_PILOT_CONTAINER_PLATFORM: apptainer
  services:
    rabbitmq:
      # see image.tag -> https://github.com/Observation-Management-Service/path-kubernetes/blob/main/helm-values-rabbitmq-bitnami.yaml (see https://artifacthub.io/packages/helm/bitnami/rabbitmq/11.14.3)
      image: bitnami/rabbitmq:3.11.15-debian-11-r0
      env:
        RABBITMQ_USERNAME: user1
        RABBITMQ_PASSWORD: password
        RABBITMQ_VHOST: test
        BITNAMI_DEBUG: true
      ports:
        - 5672:5672
        - 15672:15672
      # Note: `--network` option is not supported.
      options: >-
        --name rabbitmq
        --health-cmd "rabbitmqctl node_health_check"
        --health-interval 5s
        --health-timeout 5s
        --health-retries 10
  steps:
    # ---- setup ----
    - uses: jlumbroso/free-disk-space@main  # need space for mq broker and image
      with:
        docker-images: false
    - uses: actions/checkout@v4
    - uses: docker/setup-buildx-action@v3
    - uses: docker/build-push-action@v6
      with:
        file: ./Dockerfile
        context: .
        load: true
        tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
    - uses: eWaterCycle/setup-apptainer@v2
      with:
        apptainer-version: 1.3.2

    # ---- build + run ----
    - name: build singularity image
      run: |
        sudo singularity build skymap_scanner.sif docker-daemon://$CI_DOCKER_IMAGE_TAG
        ls -lh skymap_scanner.sif
    - name: run singularity container
      run: |
        set -x
        source tests/env-vars.sh
        export _RUN_THIS_SINGULARITY_IMAGE="$(realpath skymap_scanner.sif)"
        export _RECO_ALGO=dummy
        export _EVENTS_FILE=$(realpath $REALTIME_EVENTS_DIR/hese_event_01.json)
        export _NSIDES="1:0"
        cd ./resources/launch_scripts
        ./local-scan.sh $N_WORKERS $CI_TEST_RUN_STDOUT_STDERR_DIR
    - name: look at results file (.npz)
      run: |
        ls .
        ls $CI_SKYSCAN_OUTPUT_DIR
        outfile=$(ls -d $CI_SKYSCAN_OUTPUT_DIR/*.npz)
        echo $outfile

    # ---- diagnostics: dumped even when an earlier step failed ----
    - name: central server stdout/stderr
      if: always()
      run: |
        cat $CI_TEST_RUN_STDOUT_STDERR_DIR/server.out
    - name: worker pilot \#1 stdout/stderr
      if: always()
      run: |
        more $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-1/pilot.out | cat
    - name: worker clients / reco-icetray instances \#1 stdouts/stderrs
      if: always()
      run: |
        find $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-1/pilot-* -name "stderrfile" -o -name "stdoutfile" | xargs more | cat
        echo "::::::::::::::" && tree $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-1/pilot-*
    - name: worker pilot \#2 stdout/stderr
      if: always() && env.N_WORKERS == '2'
      run: |
        more $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-2/pilot.out | cat
    - name: worker clients / reco-icetray instances \#2 stdouts/stderrs
      if: always() && env.N_WORKERS == '2'
      run: |
        find $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-2/pilot-* -name "stderrfile" -o -name "stdoutfile" | xargs more | cat
        echo "::::::::::::::" && tree $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-2/pilot-*
    - name: rabbitmq logs
      if: always()
      run: |
        docker logs rabbitmq
test-run-nsides-thresholds-dummy:
  # Runs the dummy reco for every combination of nside progression and
  # predictive-scanning threshold, then asserts that the newest run*.json has
  # one entry per requested nside.
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      nsides:
        - "1:0"
        - "1:0 2:12"
        - "1:0 2:12 4:12"
      predictive_scanning_threshold:
        - 1.0
        - 0.65
  env:
    SKYSCAN_MQ_TOCLIENT_BROKER_ADDRESS: user1@localhost/test
    SKYSCAN_MQ_TOCLIENT_AUTH_TOKEN: password  # using this would override password in address
    SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRESS: user1@localhost/test
    SKYSCAN_MQ_FROMCLIENT_AUTH_TOKEN: password  # using this would override password in address
  services:
    rabbitmq:
      # see image.tag -> https://github.com/Observation-Management-Service/path-kubernetes/blob/main/helm-values-rabbitmq-bitnami.yaml (see https://artifacthub.io/packages/helm/bitnami/rabbitmq/11.14.3)
      image: bitnami/rabbitmq:3.11.15-debian-11-r0
      env:
        RABBITMQ_USERNAME: user1
        RABBITMQ_PASSWORD: password
        RABBITMQ_VHOST: test
        BITNAMI_DEBUG: true
      ports:
        - 5672:5672
        - 15672:15672
      # Note: `--network` option is not supported.
      options: >-
        --name rabbitmq
        --health-cmd "rabbitmqctl node_health_check"
        --health-interval 5s
        --health-timeout 5s
        --health-retries 10
  steps:
    # ---- setup ----
    - uses: jlumbroso/free-disk-space@main  # need space for mq broker and image
      with:
        docker-images: false
    - uses: actions/checkout@v4
    - uses: docker/setup-buildx-action@v3
    - uses: docker/build-push-action@v6
      with:
        file: ./Dockerfile
        context: .
        load: true
        tags: ${{ env.CI_DOCKER_IMAGE_TAG }}

    # ---- run + check ----
    - name: run
      timeout-minutes: 12  # on average max~=8.5min
      run: |
        set -x
        source tests/env-vars.sh
        export _RECO_ALGO=dummy
        export _EVENTS_FILE=$(realpath $REALTIME_EVENTS_DIR/hese_event_01.json)
        export _NSIDES="${{ matrix.nsides }}"
        export _PREDICTIVE_SCANNING_THRESHOLD=${{ matrix.predictive_scanning_threshold }}
        cd ./resources/launch_scripts
        ./local-scan.sh $N_WORKERS $CI_TEST_RUN_STDOUT_STDERR_DIR
    - name: check no nsides skipped
      run: |
        ls $CI_SKYSCAN_OUTPUT_DIR
        # get newest run*.json
        export outfile=$(find $CI_SKYSCAN_OUTPUT_DIR -type f -name "run*.json" -exec stat -c '%y %n' {} + | sort | tail -1 | awk '{print $4}')
        echo $outfile
        python3 -c '
        import json
        import os
        with open(os.getenv("outfile")) as f:
            pydict = json.load(f)
        nsides = "${{ matrix.nsides }}"
        assert len(pydict) == nsides.count(":")
        '

    # ---- diagnostics: dumped even when an earlier step failed ----
    - name: central server stdout/stderr
      if: always()
      run: |
        cat $CI_TEST_RUN_STDOUT_STDERR_DIR/server.out
    - name: worker pilot \#1 stdout/stderr
      if: always()
      run: |
        more $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-1/pilot.out | cat
    - name: worker clients / reco-icetray instances \#1 stdouts/stderrs
      if: always()
      run: |
        find $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-1/pilot-* -name "stderrfile" -o -name "stdoutfile" | xargs more | cat
        echo "::::::::::::::" && tree $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-1/pilot-*
    - name: worker pilot \#2 stdout/stderr
      if: always() && env.N_WORKERS == '2'
      run: |
        more $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-2/pilot.out | cat
    - name: worker clients / reco-icetray instances \#2 stdouts/stderrs
      if: always() && env.N_WORKERS == '2'
      run: |
        find $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-2/pilot-* -name "stderrfile" -o -name "stdoutfile" | xargs more | cat
        echo "::::::::::::::" && tree $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-2/pilot-*
    - name: rabbitmq logs
      if: always()
      run: |
        docker logs rabbitmq
test-run-crash-dummy:
  # Runs the crash_dummy reco with a 0.75 crash probability, stops the scan
  # after 240 seconds, and then requires that at least one worker pilot.out
  # contains one of the expected error messages.
  runs-on: ubuntu-latest
  env:
    SKYSCAN_MQ_TOCLIENT_BROKER_ADDRESS: user1@localhost/test
    SKYSCAN_MQ_TOCLIENT_AUTH_TOKEN: password  # using this would override password in address
    SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRESS: user1@localhost/test
    SKYSCAN_MQ_FROMCLIENT_AUTH_TOKEN: password  # using this would override password in address
    EWMS_PILOT_STOP_LISTENING_ON_TASK_ERROR: False  # we want many crashes
  services:
    rabbitmq:
      # see image.tag -> https://github.com/Observation-Management-Service/path-kubernetes/blob/main/helm-values-rabbitmq-bitnami.yaml (see https://artifacthub.io/packages/helm/bitnami/rabbitmq/11.14.3)
      image: bitnami/rabbitmq:3.11.15-debian-11-r0
      env:
        RABBITMQ_USERNAME: user1
        RABBITMQ_PASSWORD: password
        RABBITMQ_VHOST: test
        BITNAMI_DEBUG: true
      # Note: `--network` option is not supported.
      options: >-
        --name rabbitmq
        --health-cmd "rabbitmqctl node_health_check"
        --health-interval 5s
        --health-timeout 5s
        --health-retries 10
      ports:
        - 5672:5672
        - 15672:15672
  steps:
    # ---- setup ----
    - uses: jlumbroso/free-disk-space@main  # need space for mq broker and image
      with:
        docker-images: false
    - uses: actions/checkout@v4
    - uses: docker/setup-buildx-action@v3
    - uses: docker/build-push-action@v6
      with:
        context: .
        file: ./Dockerfile
        tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
        load: true

    # ---- run + check ----
    - name: run
      timeout-minutes: 12  # on average ~9min # yes, `timeout` is used below but this is insurance
      run: |
        set -x
        source tests/env-vars.sh
        export _RECO_ALGO=crash_dummy
        export _EVENTS_FILE=$(realpath $REALTIME_EVENTS_DIR/hese_event_01.json)
        export _NSIDES="1:0 2:12 4:12"
        export _SKYSCAN_CI_CRASH_DUMMY_PROBABILITY=0.75
        export EWMS_PILOT_TASK_TIMEOUT=15
        cd ./resources/launch_scripts
        # since _SKYSCAN_CI_CRASH_DUMMY_PROBABILITY<1, this step will go forever
        # so, stop it after some time and check that it has stderrfiles
        timeout 240 ./local-scan.sh $N_WORKERS $CI_TEST_RUN_STDOUT_STDERR_DIR || true
    - name: look at stderrfiles
      run: |
        set -x
        # check for fails/errors
        error_type_1='intentional crash-dummy error'
        error_type_2='subprocess timed out after'
        pattern="$error_type_1|$error_type_2"
        if find "$CI_TEST_RUN_STDOUT_STDERR_DIR/worker-"*/pilot.out -type f -exec grep -qE "$pattern" {} +; then
          echo "Match(es) found: PilotSubprocessError and/or TimeoutError occurred."
        else
          echo "No matches found."
          exit 1
        fi

    # ---- diagnostics: dumped even when an earlier step failed ----
    - name: central server stdout/stderr
      if: always()
      run: |
        cat $CI_TEST_RUN_STDOUT_STDERR_DIR/server.out
    - name: worker pilot \#1 stdout/stderr
      if: always()
      run: |
        more $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-1/pilot.out | cat
    - name: worker clients / reco-icetray instances \#1 stdouts/stderrs
      if: always()
      run: |
        find $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-1/pilot-* -name "stderrfile" -o -name "stdoutfile" | xargs more | cat
        echo "::::::::::::::" && tree $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-1/pilot-*
    - name: worker pilot \#2 stdout/stderr
      if: always() && env.N_WORKERS == '2'
      run: |
        more $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-2/pilot.out | cat
    - name: worker clients / reco-icetray instances \#2 stdouts/stderrs
      if: always() && env.N_WORKERS == '2'
      run: |
        find $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-2/pilot-* -name "stderrfile" -o -name "stdoutfile" | xargs more | cat
        echo "::::::::::::::" && tree $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-2/pilot-*
    # broker-side logs, same diagnostic step the other rabbitmq-backed jobs run
    - name: rabbitmq logs
      if: always()
      run: |
        docker logs rabbitmq
test-run-realistic:
  # Runs each reco algorithm against each event file (minus the excluded
  # pairs) and compares the newest run*.json with the stored expected result.
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      reco_algo:
        - millipede_original
        - millipede_wilks
        - splinempe
        - splinempe_pointed
      eventfile:
        - hese_event_01.json
        - run00136766-evt000007637140-GOLD.pkl
        - run00136662-evt000035405932-BRONZE.pkl
        - 138632_31747601.json
      exclude:
        # splinempe should not run on HESE
        - reco_algo: splinempe
          eventfile: hese_event_01.json
        - reco_algo: splinempe_pointed
          eventfile: hese_event_01.json
  env:
    SKYSCAN_MQ_TOCLIENT_BROKER_ADDRESS: user1@localhost/test
    SKYSCAN_MQ_TOCLIENT_AUTH_TOKEN: password  # using this would override password in address
    SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRESS: user1@localhost/test
    SKYSCAN_MQ_FROMCLIENT_AUTH_TOKEN: password  # using this would override password in address
  services:
    rabbitmq:
      # see image.tag -> https://github.com/Observation-Management-Service/path-kubernetes/blob/main/helm-values-rabbitmq-bitnami.yaml (see https://artifacthub.io/packages/helm/bitnami/rabbitmq/11.14.3)
      image: bitnami/rabbitmq:3.11.15-debian-11-r0
      env:
        RABBITMQ_USERNAME: user1
        RABBITMQ_PASSWORD: password
        RABBITMQ_VHOST: test
        BITNAMI_DEBUG: true
      ports:
        - 5672:5672
        - 15672:15672
      # Note: `--network` option is not supported.
      options: >-
        --name rabbitmq
        --health-cmd "rabbitmqctl node_health_check"
        --health-interval 5s
        --health-timeout 5s
        --health-retries 10
  steps:
    # ---- setup ----
    - uses: jlumbroso/free-disk-space@main  # need space for mq broker and image
      with:
        docker-images: false
    - uses: actionhippie/swap-space@v1
      with:
        size: 10G
    - uses: actions/checkout@v4
    - uses: docker/setup-buildx-action@v3
    - uses: docker/build-push-action@v6
      with:
        file: ./Dockerfile
        context: .
        load: true
        tags: ${{ env.CI_DOCKER_IMAGE_TAG }}

    # ---- run + check ----
    - name: run
      timeout-minutes: 45  # on average max~=35min
      run: |
        set -x
        lscpu
        python3 --version
        source tests/env-vars.sh
        export _RECO_ALGO=${{ matrix.reco_algo }}
        export _EVENTS_FILE=$(realpath $REALTIME_EVENTS_DIR/${{ matrix.eventfile }})
        export _NSIDES="1:0"
        cd ./resources/launch_scripts
        ./local-scan.sh $N_WORKERS $CI_TEST_RUN_STDOUT_STDERR_DIR
    - name: test output against known result (.json)
      run: |
        ls $CI_SKYSCAN_OUTPUT_DIR
        # get newest run*.json
        outfile=$(find $CI_SKYSCAN_OUTPUT_DIR -type f -name "run*.json" -exec stat -c '%y %n' {} + | sort | tail -1 | awk '{print $4}')
        echo $outfile
        cat $outfile
        pip install .  # don't need icecube, so no docker container needed
        python tests/compare_scan_results.py \
          --actual $outfile \
          --expected tests/data/results_json/${{ matrix.reco_algo }}/$(basename $outfile) \
          --assert \
          || (cat $(ls *.diff.json) && false)

    # ---- diagnostics: dumped even when an earlier step failed ----
    - name: central server stdout/stderr
      if: always()
      run: |
        cat $CI_TEST_RUN_STDOUT_STDERR_DIR/server.out
    - name: worker pilot \#1 stdout/stderr
      if: always()
      run: |
        more $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-1/pilot.out | cat
    - name: worker clients / reco-icetray instances \#1 stdouts/stderrs
      if: always()
      run: |
        find $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-1/pilot-* -name "stderrfile" -o -name "stdoutfile" | xargs more | cat
        echo "::::::::::::::" && tree $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-1/pilot-*
    - name: worker pilot \#2 stdout/stderr
      if: always() && env.N_WORKERS == '2'
      run: |
        more $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-2/pilot.out | cat
    - name: worker clients / reco-icetray instances \#2 stdouts/stderrs
      if: always() && env.N_WORKERS == '2'
      run: |
        find $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-2/pilot-* -name "stderrfile" -o -name "stdoutfile" | xargs more | cat
        echo "::::::::::::::" && tree $CI_TEST_RUN_STDOUT_STDERR_DIR/worker-2/pilot-*
    - name: rabbitmq logs
      if: always()
      run: |
        docker logs rabbitmq
test-file-staging:
  # Builds the image and runs tests/file_staging.py inside it, forwarding every
  # SKYSCAN_* / _SKYSCAN_* variable that is set after sourcing env-vars.sh.
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
    - uses: docker/setup-buildx-action@v3
    - uses: docker/build-push-action@v6
      with:
        file: ./Dockerfile
        context: .
        load: true
        tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
    - name: run
      run: |
        source tests/env-vars.sh
        docker run --rm -i \
          $(env | grep -E '^(SKYSCAN_|_SKYSCAN_)' | cut -d'=' -f1 | sed 's/^/--env /') \
          $CI_DOCKER_IMAGE_TAG \
          python tests/file_staging.py
test-run-single-pixel:
  # Runs skymap_scanner.client directly on one stored input per
  # (reco_algo, dir) pair, then compares out-actual.json with out.json.
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      dir:
        - "BRONZE"
        - "GOLD"
        - "JSON"
      reco_algo:
        - millipede_original
        - millipede_wilks
        - splinempe
        - splinempe_pointed
      exclude:
        # splinempe should not run on HESE
        - reco_algo: splinempe
          dir: "JSON"
        - reco_algo: splinempe_pointed
          dir: "JSON"
  steps:
    - uses: actions/checkout@v4
    - uses: docker/setup-buildx-action@v3
    - uses: docker/build-push-action@v6
      with:
        file: ./Dockerfile
        context: .
        load: true
        tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
    - name: run
      timeout-minutes: 10  # on average max~=5min
      run: |
        set -e
        source tests/env-vars.sh
        # run reco directly
        docker run --network="host" --rm -i \
          --shm-size=6gb \
          --mount type=bind,source=$(readlink -f tests/data/reco_pixel_single/${{ matrix.reco_algo }}/${{ matrix.dir }}),target=/local/test-data \
          --env PY_COLORS=1 \
          $(env | grep -E '^(SKYSCAN_|_SKYSCAN_)' | cut -d'=' -f1 | sed 's/^/--env /') \
          $CI_DOCKER_IMAGE_TAG \
          python -m skymap_scanner.client \
          --infile /local/test-data/in.json \
          --client-startup-json /local/test-data/startup.json \
          --outfile /local/test-data/out-actual.json
    - name: test output against known result
      run: |
        source tests/env-vars.sh
        ls tests/data/reco_pixel_pkls/${{ matrix.reco_algo }}/${{ matrix.dir }}
        ls tests/data/reco_pixel_single/${{ matrix.reco_algo }}/${{ matrix.dir }}
        # need icecube for depickling, so docker container needed
        docker run --network="host" --rm -i \
          --shm-size=6gb \
          --mount type=bind,source=$(readlink -f tests/data/reco_pixel_single/${{ matrix.reco_algo }}/${{ matrix.dir }}),target=/local/test-data \
          --env PY_COLORS=1 \
          $(env | grep -E '^(SKYSCAN_|_SKYSCAN_)' | cut -d'=' -f1 | sed 's/^/--env /') \
          $CI_DOCKER_IMAGE_TAG \
          python tests/compare_reco_pixel_single.py \
          --actual /local/test-data/out-actual.json \
          --expected /local/test-data/out.json \
          --diff-out-dir /local/test-data/ \
          --assert \
          || (cat $(ls tests/data/reco_pixel_single/${{ matrix.reco_algo }}/${{ matrix.dir }}/*.diff.json) && false)
#############################################################################
# GITHUB RELEASE
#############################################################################
release:
  # Runs python-semantic-release once every listed job has succeeded.
  # only run on main/master/default
  if: format('refs/heads/{0}', github.event.repository.default_branch) == github.ref
  runs-on: ubuntu-latest
  concurrency: release
  # have to wait for tests so python-semantic-release can push (branch protection on main don't work w/ the bot)
  needs:
    - mypy
    - py-setup
    - py-dependencies
    - test-build-docker
    - test-run-singularity-dummy-reco
    - test-file-staging
    - test-run-nsides-thresholds-dummy
    - test-run-crash-dummy
    - test-run-realistic
    - test-run-single-pixel
  steps:
    - uses: actions/checkout@v4
      with:
        token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
        fetch-depth: 0
    - name: Python Semantic Release
      # NOTE(review): the action reference below looks mangled by e-mail
      # obfuscation in the page capture — restore the real `name@version`.
      uses: python-semantic-release/[email protected]
      with:
        github_token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
        git_committer_name: ${{ env.BOT_NAME }}
        git_committer_email: ${{ env.BOT_EMAIL }}
        # repository_username: __token__
        # repository_password: ${{ secrets.PYPI_TOKEN }}