From 451c8322bd41bdb23239a95be54603923725815a Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 15 Oct 2024 18:09:08 -0500 Subject: [PATCH] abandon task timeout idea --- .github/workflows/tests.yml | 42 ------------------------------------- tests/env-vars.sh | 2 +- 2 files changed, 1 insertion(+), 43 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 889352403..5db415f6c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -476,48 +476,6 @@ jobs: file: ./Dockerfile tags: ${{ env.CI_DOCKER_IMAGE_TAG }} load: true - - name: set 'EWMS_PILOT_TASK_TIMEOUT' for reco algo + event file - run: | - export RECO_ALGO=${{ matrix.reco_algo }} - export EVENT_FILE=${{ matrix.eventfile }} - - # NOTE: these values were determined by 1 stddev above the median (the max val of 2 workers) - value=$(python -c ' - import os - hese_event = "hese_event_01.json" - gold_event = "run00136766-evt000007637140-GOLD.pkl" - bronze_event = "run00136662-evt000035405932-BRONZE.pkl" - generic_event = "138632_31747601.json" - lookup = { - "millipede_original": { - hese_event: 185, - gold_event: 195, - bronze_event: 165, - generic_event: 195, - }, - "millipede_wilks": { - hese_event: 130, - gold_event: 280, - bronze_event: 190, - generic_event: 245, - }, - "splinempe": { - gold_event: 55, - bronze_event: 25, - generic_event: 35, - }, - "splinempe_pointed": { - gold_event: 35, - bronze_event: 35, - generic_event: 35, - } - } - print(lookup[os.environ["RECO_ALGO"]][os.environ["EVENT_FILE"]]) - ') - - adjusted_int_value=$(awk '{printf "%d\n", $1}' <<< "$(echo "$value * 1.1" | bc)") - echo "$value * 1.1 = $adjusted_int_value" - echo "EWMS_PILOT_TASK_TIMEOUT=$adjusted_int_value" >> $GITHUB_ENV - name: run timeout-minutes: 45 # on average max~=35min diff --git a/tests/env-vars.sh b/tests/env-vars.sh index ea846d399..db235f775 100755 --- a/tests/env-vars.sh +++ b/tests/env-vars.sh @@ -33,7 +33,7 @@ export 
SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRESS=${SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRE # -> worker/client/pilot export EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE=${EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE:-60} export EWMS_PILOT_TIMEOUT_QUEUE_INCOMING=${EWMS_PILOT_TIMEOUT_QUEUE_INCOMING:-5} -# export EWMS_PILOT_TASK_TIMEOUT -> this is very specific to the task, set it in a ci step +# export EWMS_PILOT_TASK_TIMEOUT -> this is very specific to the task, if it's wanted set it in a place where we know the inputs export EWMS_PILOT_STOP_LISTENING_ON_TASK_ERROR=${EWMS_PILOT_STOP_LISTENING_ON_TASK_ERROR:-"True"} export EWMS_PILOT_OKAY_ERRORS=${EWMS_PILOT_OKAY_ERRORS:-"TimeoutError"} # this is a space-delimited list # ^^^ in production, we run O(1k) cpus so a slow reco will be delivered to a new cpu, here we have to be more conservative. So, let the local workers retry the reco