Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scripts to check for worker readiness and liveness #14625

Merged
merged 2 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions bin/check-worker-liveness
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/bash

: "${DATABASE_URL:?Please set DATABASE_URL}"
: "${HOSTNAME:?Please set HOSTNAME}"

MIN_RUN_AT_FILE=/tmp/.min-run-at.op
FAIL_COUNT_FILE=/tmp/.liveness-check-fail-count.op

# How many jobs are overdue at least 15 minutes?
#
# Filter by not locked jobs because a crashing worker may leave behind
# a locked, unprocessed job. But this doesn't mean the new worker isn't working.
#
# This checks across all workers and can't tell if it's only this worker in particular
# that doesn't seem to be doing anything.
UNPROCESSED_COUNT=`psql -d ${DATABASE_URL%%\?*} -c "select count(*) from delayed_jobs where locked_by is null and run_at < (now() - interval '15 minutes');" | tail -n +3 | head -n1 | tr -d ' '`

echo "unprocessed: $UNPROCESSED_COUNT"

if [ "$UNPROCESSED_COUNT" = "0" ]; then
echo 0 > $MIN_RUN_AT_FILE # `date -d '0'` will be the beginning of the day which is fine for our purposes
echo 0 > $FAIL_COUNT_FILE

echo "Workers ok"

exit 0
else
MIN_RUN_AT=`psql -d ${DATABASE_URL%%\?*} -c "select run_at from delayed_jobs where locked_by is null and cron is null and run_at is not null and run_at < (now() - interval '1 minutes') order by run_at asc limit 1;" | tail -n +3 | head -n1`
LAST_MIN_RUN_AT=`cat $MIN_RUN_AT_FILE 2>/dev/null || echo 0`
NUM_FAILS=`cat $FAIL_COUNT_FILE 2>/dev/null || echo 0`

if [ "$MIN_RUN_AT" = "(0 rows)" ]; then
echo "There are no unprocessed jobs after all."

exit 0
# check if last dangling job has been processed by now
elif [ "`date -d "$MIN_RUN_AT" '+%s'`" -gt "`date -d "$LAST_MIN_RUN_AT" '+%s'`" ]; then
echo 0 > $FAIL_COUNT_FILE

echo "Jobs recovering. Workers seem to be ok."
exit 0
elif [ "$NUM_FAILS" -lt 3 ]; then
echo "$((NUM_FAILS + 1))" > $FAIL_COUNT_FILE

echo "Job still hasn't been locked. Workers may not be ok. We're keeping an eye on it."
exit 0
else
echo "It's dead, Jim."

exit 1
fi
fi
9 changes: 9 additions & 0 deletions bin/check-worker-readiness
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash

if ps aux | grep 'rake jobs:work' | grep -v grep; then
echo "background worker running"
exit 0
fi
echo "background worker not running"
exit 1
fi
Loading