Skip to content

Commit

Permalink
set 'EWMS_PILOT_TASK_TIMEOUT' for reco algo + event file
Browse files Browse the repository at this point in the history
  • Loading branch information
ric-evans committed Oct 15, 2024
1 parent e00ee53 commit 1265b6a
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 1 deletion.
43 changes: 43 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,49 @@ jobs:
file: ./Dockerfile
tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
load: true
- name: set 'EWMS_PILOT_TASK_TIMEOUT' for reco algo + event file
run: |
# Compute EWMS_PILOT_TASK_TIMEOUT for the current matrix entry and publish it
# to later steps through the GITHUB_ENV file.
#
# The matrix selection is handed to the embedded Python lookup via the environment.
export RECO_ALGO="${{ matrix.reco_algo }}"
export EVENT_FILE="${{ matrix.eventfile }}"
# NOTE: these values were determined by 1 stddev above the median (the max val of 2 workers)
# The lookup is keyed by reco algo, then by event file. A pair that is not in
# the table (the splinempe tables have no entry for hese_event_01.json) raises
# KeyError, so python prints nothing to stdout and exits non-zero.
value=$(python -c '
import os
hese_event = "hese_event_01.json"
gold_event = "run00136766-evt000007637140-GOLD.pkl"
bronze_event = "run00136662-evt000035405932-BRONZE.pkl"
generic_event = "138632_31747601.json"
lookup = {
    "millipede_original": {
        hese_event: 185,
        gold_event: 195,
        bronze_event: 165,
        generic_event: 195,
    },
    "millipede_wilks": {
        hese_event: 130,
        gold_event: 280,
        bronze_event: 190,
        generic_event: 245,
    },
    "splinempe": {
        gold_event: 55,
        bronze_event: 25,
        generic_event: 35,
    },
    "splinempe_pointed": {
        gold_event: 35,
        bronze_event: 35,
        generic_event: 35,
    }
}
print(lookup[os.environ["RECO_ALGO"]][os.environ["EVENT_FILE"]])
')
# Refuse to continue without a plain non-negative integer; otherwise an empty
# lookup result would be turned into a timeout of 0 further down.
if [[ ! "$value" =~ ^[0-9]+$ ]]; then
  echo "no task timeout known for RECO_ALGO='$RECO_ALGO' EVENT_FILE='$EVENT_FILE'" >&2
  exit 1
fi
# Add a 10% margin and truncate to an integer. For non-negative integers,
# value * 11 / 10 in integer arithmetic equals the truncated value * 1.1.
adjusted_int_value=$(( value * 11 / 10 ))
echo "$adjusted_int_value"
echo "EWMS_PILOT_TASK_TIMEOUT=$adjusted_int_value" >> "$GITHUB_ENV"
- name: run
timeout-minutes: 45 # on average max~=35min
run: |
Expand Down
3 changes: 2 additions & 1 deletion tests/env-vars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,10 @@ export SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRESS=${SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRE
# -> worker/client/pilot
# Each export below keeps a non-empty value already present in the environment
# and otherwise falls back to the default given after ':-'.
export EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE=${EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE:-60}
export EWMS_PILOT_TIMEOUT_QUEUE_INCOMING=${EWMS_PILOT_TIMEOUT_QUEUE_INCOMING:-5}
export EWMS_PILOT_TASK_TIMEOUT=${EWMS_PILOT_TASK_TIMEOUT:-$((1 * 10))} # $((60 * 10))} # TODO - adjust / add option to pilot to not exit on task timeouts (just nack)
# NOTE(review): the export above still falls back to 10 when the variable is
# unset or empty, while the note below says the value is task-specific and
# should be set by a CI step -- confirm whether this fallback is meant to stay.
# export EWMS_PILOT_TASK_TIMEOUT -> this is very specific to the task, set it in a ci step
export EWMS_PILOT_STOP_LISTENING_ON_TASK_ERROR=${EWMS_PILOT_STOP_LISTENING_ON_TASK_ERROR:-"True"}
export EWMS_PILOT_OKAY_ERRORS=${EWMS_PILOT_OKAY_ERRORS:-"TimeoutError"} # this is a space-delimited list
# ^^^ In production we run O(1k) CPUs, so a slow reco will be delivered to a new CPU; here we have to be more conservative, so let the local workers retry the reco.
# ^^^ If EWMS_PILOT_STOP_LISTENING_ON_TASK_ERROR=false (or similar: no, 0, etc.), then this var is ignored.
# -> server
export SKYSCAN_MQ_TIMEOUT_FROM_CLIENTS=${SKYSCAN_MQ_TIMEOUT_FROM_CLIENTS:-$((60 * 10))} # just need a big value -- only used to detect MIA workers (it isn't important in a successful scan)
Expand Down

0 comments on commit 1265b6a

Please sign in to comment.