From e69c85a459a3eda341cc8325e5287e1d5111234d Mon Sep 17 00:00:00 2001 From: Samuel Arogbonlo <47984109+samuelarogbonlo@users.noreply.github.com> Date: Mon, 9 Oct 2023 14:37:32 +0100 Subject: [PATCH] fix: segregate daily_snapshot logs (#285) Signed-off-by: samuelarogbonlo Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../daily_snapshot/service/calibnet_cron_job | 4 ++-- .../daily_snapshot/service/daily_snapshot.rb | 18 ++++++++++++----- .../modules/daily_snapshot/service/init.sh | 4 ++-- .../daily_snapshot/service/upload_snapshot.sh | 20 +++++++++++++++---- 4 files changed, 33 insertions(+), 13 deletions(-) diff --git a/terraform/modules/daily_snapshot/service/calibnet_cron_job b/terraform/modules/daily_snapshot/service/calibnet_cron_job index 2a0fb7b37..a492ad45d 100755 --- a/terraform/modules/daily_snapshot/service/calibnet_cron_job +++ b/terraform/modules/daily_snapshot/service/calibnet_cron_job @@ -3,5 +3,5 @@ # shellcheck source=/dev/null source ~/.forest_env cd "$BASE_FOLDER" || exit -flock -n /tmp/calibnet.lock -c "ruby daily_snapshot.rb calibnet > calibnet_log.txt 2>&1" -flock -n /tmp/calibnet_filops.lock -c "./upload_filops_snapshot.sh calibnet > filops_calibnet_log.txt 2>&1" +flock -n /tmp/calibnet.lock -c "ruby daily_snapshot.rb calibnet > logs/calibnet_log.txt 2>&1" +flock -n /tmp/calibnet_filops.lock -c "./upload_filops_snapshot.sh calibnet > logs/filops_calibnet_log.txt 2>&1" diff --git a/terraform/modules/daily_snapshot/service/daily_snapshot.rb b/terraform/modules/daily_snapshot/service/daily_snapshot.rb index c310bf4e4..4c4aed1ce 100644 --- a/terraform/modules/daily_snapshot/service/daily_snapshot.rb +++ b/terraform/modules/daily_snapshot/service/daily_snapshot.rb @@ -27,13 +27,14 @@ def latest_snapshot_date(chain_name = 'calibnet') end end - CHAIN_NAME = ARGV[0] raise 'No chain name supplied. Please provide chain identifier, e.g. calibnet or mainnet' if ARGV.empty? # Current datetime, to append to the log files DATE = Time.new.strftime '%FT%H:%M:%S' -LOG_EXPORT = "#{CHAIN_NAME}_#{DATE}_export.txt" +LOG_EXPORT_SCRIPT_RUN = "logs/#{CHAIN_NAME}_#{DATE}_script_run.txt" +LOG_EXPORT_DAEMON = "logs/#{CHAIN_NAME}_#{DATE}_daemon.txt" +LOG_EXPORT_METRICS = "logs/#{CHAIN_NAME}_#{DATE}_metrics.txt" client = SlackClient.new CHANNEL, SLACK_TOKEN @@ -41,8 +42,11 @@ def latest_snapshot_date(chain_name = 'calibnet') # of victory messages to 1/day even if we upload multiple snapshots per day. date_before_export = latest_snapshot_date(CHAIN_NAME) +# conditionally add timestamps to logs without timestamps +add_timestamps_cmd = "awk '{ if ($0 !~ /^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\\.[0-9]{6}Z/) print strftime(\"[%Y-%m-%d %H:%M:%S]\"), $0; else print $0; fflush(); }'" + # Sync and export snapshot -snapshot_uploaded = system("bash -c 'timeout --signal=KILL 24h ./upload_snapshot.sh #{CHAIN_NAME}' > #{LOG_EXPORT} 2>&1") +snapshot_uploaded = system("bash -c 'timeout --signal=KILL 24h ./upload_snapshot.sh #{CHAIN_NAME} #{LOG_EXPORT_DAEMON} #{LOG_EXPORT_METRICS}' | #{add_timestamps_cmd} > #{LOG_EXPORT_SCRIPT_RUN} 2>&1") if snapshot_uploaded date_after_export = latest_snapshot_date(CHAIN_NAME) @@ -54,7 +58,11 @@ def latest_snapshot_date(chain_name = 'calibnet') else client.post_message "⛔ Snapshot failed for #{CHAIN_NAME}. 🔥🌲🔥 " # attach the log file and print the contents to STDOUT - client.attach_files(LOG_EXPORT) + [LOG_EXPORT_SCRIPT_RUN, LOG_EXPORT_DAEMON, LOG_EXPORT_METRICS].each do |log_file| + client.attach_files(log_file) if File.exist?(log_file) + end end -puts "Snapshot export log:\n#{File.read(LOG_EXPORT)}" +[LOG_EXPORT_SCRIPT_RUN, LOG_EXPORT_DAEMON, LOG_EXPORT_METRICS].each do |log_file| + puts "Snapshot export log:\n#{File.read(log_file)}\n\n" if File.exist?(log_file) +end diff --git a/terraform/modules/daily_snapshot/service/init.sh b/terraform/modules/daily_snapshot/service/init.sh index 05d0d0565..761a4e844 100755 --- a/terraform/modules/daily_snapshot/service/init.sh +++ b/terraform/modules/daily_snapshot/service/init.sh @@ -26,8 +26,8 @@ aws configure set aws_access_key_id "$R2_ACCESS_KEY" aws configure set aws_secret_access_key "$R2_SECRET_KEY" ## Create forest data directory -mkdir forest_db -chmod 777 forest_db +mkdir forest_db logs +chmod 777 forest_db logs mkdir --parents -- "$BASE_FOLDER/forest_db/filops" # Make the scripts executable diff --git a/terraform/modules/daily_snapshot/service/upload_snapshot.sh b/terraform/modules/daily_snapshot/service/upload_snapshot.sh index 04be8c41a..a8bf8822c 100755 --- a/terraform/modules/daily_snapshot/service/upload_snapshot.sh +++ b/terraform/modules/daily_snapshot/service/upload_snapshot.sh @@ -3,12 +3,14 @@ # If Forest hasn't synced to the network after 8 hours, something has gone wrong. SYNC_TIMEOUT=8h -if [[ $# != 1 ]]; then - echo "Usage: bash $0 CHAIN_NAME" +if [[ $# != 3 ]]; then + echo "Usage: bash $0 CHAIN_NAME LOG_EXPORT_DAEMON LOG_EXPORT_METRICS" exit 1 fi CHAIN_NAME=$1 +LOG_EXPORT_DAEMON=$2 +LOG_EXPORT_METRICS=$3 # Make sure we have the most recent Forest image docker pull ghcr.io/chainsafe/forest:"${FOREST_TAG}" @@ -25,18 +27,26 @@ apt-get update && apt-get install -y curl # Switch back to the service user for other service commands. su - forest +function add_timestamps { + while IFS= read -r line; do + echo "\$(date +'%Y-%m-%d %H:%M:%S') \$line" + done +} + # periodically write metrics to a file # this is done in a separate process to avoid blocking the sync process # and to ensure that the metrics are written even if it crashes function write_metrics { while true; do - curl --silent --fail --output metrics.txt --max-time 5 --retry 5 --retry-delay 2 --retry-max-time 10 http://localhost:6116/metrics || true + { + curl --silent --fail --max-time 5 --retry 5 --retry-delay 2 --retry-max-time 10 http://localhost:6116/metrics || true + } | add_timestamps >> "$LOG_EXPORT_METRICS" sleep 5 done } function print_forest_logs { - cat forest.err forest.out metrics.txt + cat forest.err forest.out > $LOG_EXPORT_DAEMON } trap print_forest_logs EXIT @@ -83,6 +93,7 @@ docker stop "$CONTAINER_NAME" || true docker rm --force "$CONTAINER_NAME" CHAIN_DB_DIR="$BASE_FOLDER/forest_db/$CHAIN_NAME" +CHAIN_LOGS_DIR="$BASE_FOLDER/logs" # Delete any existing snapshot files. It may be that the previous run failed # before deleting those. @@ -94,6 +105,7 @@ docker run \ --rm \ --user root \ -v "$CHAIN_DB_DIR:/home/forest/forest_db":z \ + -v "$CHAIN_LOGS_DIR:/home/forest/logs":z \ --entrypoint /bin/bash \ ghcr.io/chainsafe/forest:"${FOREST_TAG}" \ -c "$COMMANDS" || exit 1