Skip to content

Commit

Permalink
abandon task timeout idea
Browse files Browse the repository at this point in the history
  • Loading branch information
ric-evans committed Oct 15, 2024
1 parent 2c1f85f commit 451c832
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 43 deletions.
42 changes: 0 additions & 42 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -476,48 +476,6 @@ jobs:
file: ./Dockerfile
tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
load: true
# Step: look up a base value for the current matrix combination (reco algo + event file),
# scale it by 1.1, and publish the result as EWMS_PILOT_TASK_TIMEOUT through $GITHUB_ENV.
- name: set 'EWMS_PILOT_TASK_TIMEOUT' for reco algo + event file
run: |
# exported so the inline Python below can read both values from os.environ
export RECO_ALGO=${{ matrix.reco_algo }}
export EVENT_FILE=${{ matrix.eventfile }}
# NOTE: these values were determined by 1 stddev above the median (the max val of 2 workers)
value=$(python -c '
import os
# event file names, used as the inner keys of the lookup table
hese_event = "hese_event_01.json"
gold_event = "run00136766-evt000007637140-GOLD.pkl"
bronze_event = "run00136662-evt000035405932-BRONZE.pkl"
generic_event = "138632_31747601.json"
# lookup[reco algo][event file] -> base value (units are not stated here; presumably seconds -- TODO confirm)
lookup = {
"millipede_original": {
hese_event: 185,
gold_event: 195,
bronze_event: 165,
generic_event: 195,
},
"millipede_wilks": {
hese_event: 130,
gold_event: 280,
bronze_event: 190,
generic_event: 245,
},
# NOTE(review): "splinempe" and "splinempe_pointed" have no hese_event entry, so that pairing
# would raise KeyError below -- presumably the matrix never produces it; confirm
"splinempe": {
gold_event: 55,
bronze_event: 25,
generic_event: 35,
},
"splinempe_pointed": {
gold_event: 35,
bronze_event: 35,
generic_event: 35,
}
}
# the printed number is captured into the shell variable "value"
print(lookup[os.environ["RECO_ALGO"]][os.environ["EVENT_FILE"]])
')
# bc multiplies the base value by 1.1; awk %d then reduces the result to an integer
adjusted_int_value=$(awk '{printf "%d\n", $1}' <<< "$(echo "$value * 1.1" | bc)")
# echoed so the computation is visible in the job log
echo "$value * 1.1 = $adjusted_int_value"
# appended to the $GITHUB_ENV file, which GitHub Actions reads to set env vars for later steps
echo "EWMS_PILOT_TASK_TIMEOUT=$adjusted_int_value" >> $GITHUB_ENV

- name: run
timeout-minutes: 45 # on average max~=35min
Expand Down
2 changes: 1 addition & 1 deletion tests/env-vars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ export SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRESS=${SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRE
# -> worker/client/pilot
# Pilot queue timeouts: keep whatever the caller already set (non-empty),
# otherwise fall back to the defaults below, then export both.
: "${EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE:=60}"
: "${EWMS_PILOT_TIMEOUT_QUEUE_INCOMING:=5}"
export EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE EWMS_PILOT_TIMEOUT_QUEUE_INCOMING
# export EWMS_PILOT_TASK_TIMEOUT -> this is very specific to the task, set it in a ci step
# export EWMS_PILOT_TASK_TIMEOUT -> this is very specific to the task; if it's wanted, set it in a place where we know the inputs
# Pilot error handling: keep any non-empty value already in the environment,
# otherwise apply the defaults below, then export both.
: "${EWMS_PILOT_STOP_LISTENING_ON_TASK_ERROR:=True}"
# EWMS_PILOT_OKAY_ERRORS is a space-delimited list.
# In production, we run O(1k) cpus, so a slow reco will be delivered to a new cpu;
# here we have to be more conservative, so let the local workers retry the reco.
: "${EWMS_PILOT_OKAY_ERRORS:=TimeoutError}"
export EWMS_PILOT_STOP_LISTENING_ON_TASK_ERROR EWMS_PILOT_OKAY_ERRORS
Expand Down

0 comments on commit 451c832

Please sign in to comment.