diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index fd6c59a9a..84359621d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -476,6 +476,53 @@ jobs:
           file: ./Dockerfile
           tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
           load: true
+      - name: set 'EWMS_PILOT_TASK_TIMEOUT' for reco algo + event file
+        run: |
+          # Export the matrix selection so the inline python lookup can read it from os.environ.
+          # Quoted so a value containing spaces or glob characters is not word-split by the shell.
+          export RECO_ALGO="${{ matrix.reco_algo }}"
+          export EVENT_FILE="${{ matrix.eventfile }}"
+
+          # NOTE: these values were determined by 1 stddev above the median (the max val of 2 workers)
+          value=$(python -c '
+          import os
+          hese_event = "hese_event_01.json"
+          gold_event = "run00136766-evt000007637140-GOLD.pkl"
+          bronze_event = "run00136662-evt000035405932-BRONZE.pkl"
+          generic_event = "138632_31747601.json"
+          lookup = {
+              "millipede_original": {
+                  hese_event: 185,
+                  gold_event: 195,
+                  bronze_event: 165,
+                  generic_event: 195,
+              },
+              "millipede_wilks": {
+                  hese_event: 130,
+                  gold_event: 280,
+                  bronze_event: 190,
+                  generic_event: 245,
+              },
+              "splinempe": {
+                  gold_event: 55,
+                  bronze_event: 25,
+                  generic_event: 35,
+              },
+              "splinempe_pointed": {
+                  gold_event: 35,
+                  bronze_event: 35,
+                  generic_event: 35,
+              }
+          }
+          print(lookup[os.environ["RECO_ALGO"]][os.environ["EVENT_FILE"]])
+          ')
+
+          # Add 10% headroom and truncate to a whole number (awk "%d" drops the fraction).
+          adjusted_int_value=$(awk -v base="$value" 'BEGIN { printf "%d\n", base * 1.1 }')
+          echo "$adjusted_int_value"
+          # Append to the GITHUB_ENV file so later steps of this job see the variable.
+          echo "EWMS_PILOT_TASK_TIMEOUT=$adjusted_int_value" >> "$GITHUB_ENV"
+
       - name: run
         timeout-minutes: 45 # on average max~=35min
         run: |
diff --git a/tests/env-vars.sh b/tests/env-vars.sh
index 1b9105276..ea846d399 100755
--- a/tests/env-vars.sh
+++ b/tests/env-vars.sh
@@ -33,9 +33,10 @@ export SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRESS=${SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRE
 # -> worker/client/pilot
 export EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE=${EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE:-60}
 export EWMS_PILOT_TIMEOUT_QUEUE_INCOMING=${EWMS_PILOT_TIMEOUT_QUEUE_INCOMING:-5}
-export EWMS_PILOT_TASK_TIMEOUT=${EWMS_PILOT_TASK_TIMEOUT:-$((1 * 10))} # $((60 * 10))} # TODO - adjust / add option to pilot to not exit on task timeouts (just nack)
+# export EWMS_PILOT_TASK_TIMEOUT -> this is very specific to the task, set it in a ci step
 export EWMS_PILOT_STOP_LISTENING_ON_TASK_ERROR=${EWMS_PILOT_STOP_LISTENING_ON_TASK_ERROR:-"True"}
 export EWMS_PILOT_OKAY_ERRORS=${EWMS_PILOT_OKAY_ERRORS:-"TimeoutError"} # this is a space-delimited list
+# ^^^ in production, we run O(1k) cpus so a slow reco will be delivered to a new cpu, here we have to be more conservative. So, let the local workers retry the reco
 # ^^^ if EWMS_PILOT_STOP_LISTENING_ON_TASK_ERROR=false (or similar: no, 0, etc.), then this var is ignored
 # -> server
 export SKYSCAN_MQ_TIMEOUT_FROM_CLIENTS=${SKYSCAN_MQ_TIMEOUT_FROM_CLIENTS:-$((60 * 10))} # just need a big value -- only used to detect MIA workers (it isn't important in a successful scan)