fix: segregate daily_snapshot logs (#285)
Signed-off-by: samuelarogbonlo <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
samuelarogbonlo and pre-commit-ci[bot] authored Oct 9, 2023
1 parent a691722 commit e69c85a
Showing 4 changed files with 33 additions and 13 deletions.
4 changes: 2 additions & 2 deletions terraform/modules/daily_snapshot/service/calibnet_cron_job
@@ -3,5 +3,5 @@
# shellcheck source=/dev/null
source ~/.forest_env
cd "$BASE_FOLDER" || exit
-flock -n /tmp/calibnet.lock -c "ruby daily_snapshot.rb calibnet > calibnet_log.txt 2>&1"
-flock -n /tmp/calibnet_filops.lock -c "./upload_filops_snapshot.sh calibnet > filops_calibnet_log.txt 2>&1"
+flock -n /tmp/calibnet.lock -c "ruby daily_snapshot.rb calibnet > logs/calibnet_log.txt 2>&1"
+flock -n /tmp/calibnet_filops.lock -c "./upload_filops_snapshot.sh calibnet > logs/filops_calibnet_log.txt 2>&1"
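
Both cron entries use `flock -n`, so if a previous snapshot run still holds the lock, the new invocation exits immediately instead of stacking up behind it. A minimal sketch of that behavior, with a hypothetical lock file and commands:

# First command takes the lock and holds it for a minute.
flock -n /tmp/example.lock -c "sleep 60" &
sleep 1  # give the background job time to acquire the lock
# Second command fails fast because -n (non-blocking) refuses to wait.
flock -n /tmp/example.lock -c "echo never runs" || echo "lock busy, run skipped"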
18 changes: 13 additions & 5 deletions terraform/modules/daily_snapshot/service/daily_snapshot.rb
@@ -27,22 +27,26 @@ def latest_snapshot_date(chain_name = 'calibnet')
end
end


CHAIN_NAME = ARGV[0]
raise 'No chain name supplied. Please provide chain identifier, e.g. calibnet or mainnet' if ARGV.empty?

# Current datetime, to append to the log files
DATE = Time.new.strftime '%FT%H:%M:%S'
-LOG_EXPORT = "#{CHAIN_NAME}_#{DATE}_export.txt"
+LOG_EXPORT_SCRIPT_RUN = "logs/#{CHAIN_NAME}_#{DATE}_script_run.txt"
+LOG_EXPORT_DAEMON = "logs/#{CHAIN_NAME}_#{DATE}_daemon.txt"
+LOG_EXPORT_METRICS = "logs/#{CHAIN_NAME}_#{DATE}_metrics.txt"

client = SlackClient.new CHANNEL, SLACK_TOKEN

# Query the date of the most recent snapshot. This is used to limit the number
# of victory messages to 1/day even if we upload multiple snapshots per day.
date_before_export = latest_snapshot_date(CHAIN_NAME)

+# conditionally add timestamps to logs without timestamps
+add_timestamps_cmd = "awk '{ if ($0 !~ /^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\\.[0-9]{6}Z/) print strftime(\"[%Y-%m-%d %H:%M:%S]\"), $0; else print $0; fflush(); }'"

# Sync and export snapshot
-snapshot_uploaded = system("bash -c 'timeout --signal=KILL 24h ./upload_snapshot.sh #{CHAIN_NAME}' > #{LOG_EXPORT} 2>&1")
+snapshot_uploaded = system("bash -c 'timeout --signal=KILL 24h ./upload_snapshot.sh #{CHAIN_NAME} #{LOG_EXPORT_DAEMON} #{LOG_EXPORT_METRICS}' | #{add_timestamps_cmd} > #{LOG_EXPORT_SCRIPT_RUN} 2>&1")

if snapshot_uploaded
date_after_export = latest_snapshot_date(CHAIN_NAME)
@@ -54,7 +58,11 @@ def latest_snapshot_date(chain_name = 'calibnet')
else
client.post_message "⛔ Snapshot failed for #{CHAIN_NAME}. 🔥🌲🔥 "
# attach the log file and print the contents to STDOUT
-client.attach_files(LOG_EXPORT)
+[LOG_EXPORT_SCRIPT_RUN, LOG_EXPORT_DAEMON, LOG_EXPORT_METRICS].each do |log_file|
+  client.attach_files(log_file) if File.exist?(log_file)
+end
end

puts "Snapshot export log:\n#{File.read(LOG_EXPORT)}"
[LOG_EXPORT_SCRIPT_RUN, LOG_EXPORT_DAEMON, LOG_EXPORT_METRICS].each do |log_file|
puts "Snapshot export log:\n#{File.read(log_file)}\n\n" if File.exist?(log_file)
end
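
The `add_timestamps_cmd` filter above prefixes a timestamp only onto lines that do not already begin with an ISO-8601 stamp; `strftime` is a GNU awk extension, and `fflush()` keeps the piped output unbuffered so the log file stays current during the long-running export. A quick sketch of its behavior on made-up input:

printf '%s\n' \
  '2023-10-09T12:00:00.000000Z daemon line, already stamped' \
  'plain line, no timestamp' |
awk '{ if ($0 !~ /^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{6}Z/) print strftime("[%Y-%m-%d %H:%M:%S]"), $0; else print $0; fflush(); }'
# Output (second timestamp depends on the clock):
# 2023-10-09T12:00:00.000000Z daemon line, already stamped
# [2023-10-09 12:34:56] plain line, no timestamp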
4 changes: 2 additions & 2 deletions terraform/modules/daily_snapshot/service/init.sh
@@ -26,8 +26,8 @@ aws configure set aws_access_key_id "$R2_ACCESS_KEY"
aws configure set aws_secret_access_key "$R2_SECRET_KEY"

## Create forest data directory
-mkdir forest_db
-chmod 777 forest_db
+mkdir forest_db logs
+chmod 777 forest_db logs
mkdir --parents -- "$BASE_FOLDER/forest_db/filops"

# Make the scripts executable
20 changes: 16 additions & 4 deletions terraform/modules/daily_snapshot/service/upload_snapshot.sh
@@ -3,12 +3,14 @@
# If Forest hasn't synced to the network after 8 hours, something has gone wrong.
SYNC_TIMEOUT=8h

-if [[ $# != 1 ]]; then
-  echo "Usage: bash $0 CHAIN_NAME"
+if [[ $# != 3 ]]; then
+  echo "Usage: bash $0 CHAIN_NAME LOG_EXPORT_DAEMON LOG_EXPORT_METRICS"
exit 1
fi

CHAIN_NAME=$1
+LOG_EXPORT_DAEMON=$2
+LOG_EXPORT_METRICS=$3

# Make sure we have the most recent Forest image
docker pull ghcr.io/chainsafe/forest:"${FOREST_TAG}"
@@ -25,18 +27,26 @@ apt-get update && apt-get install -y curl
# Switch back to the service user for other service commands.
su - forest
+function add_timestamps {
+  while IFS= read -r line; do
+    echo "\$(date +'%Y-%m-%d %H:%M:%S') \$line"
+  done
+}
# periodically write metrics to a file
# this is done in a separate process to avoid blocking the sync process
# and to ensure that the metrics are written even if it crashes
function write_metrics {
while true; do
-curl --silent --fail --output metrics.txt --max-time 5 --retry 5 --retry-delay 2 --retry-max-time 10 http://localhost:6116/metrics || true
+{
+  curl --silent --fail --max-time 5 --retry 5 --retry-delay 2 --retry-max-time 10 http://localhost:6116/metrics || true
+} | add_timestamps >> "$LOG_EXPORT_METRICS"
sleep 5
done
}
function print_forest_logs {
-cat forest.err forest.out metrics.txt
+cat forest.err forest.out > $LOG_EXPORT_DAEMON
}
trap print_forest_logs EXIT
@@ -83,6 +93,7 @@ docker stop "$CONTAINER_NAME" || true
docker rm --force "$CONTAINER_NAME"

CHAIN_DB_DIR="$BASE_FOLDER/forest_db/$CHAIN_NAME"
+CHAIN_LOGS_DIR="$BASE_FOLDER/logs"

# Delete any existing snapshot files. It may be that the previous run failed
# before deleting those.
@@ -94,6 +105,7 @@ docker run \
--rm \
--user root \
-v "$CHAIN_DB_DIR:/home/forest/forest_db":z \
-v "$CHAIN_LOGS_DIR:/home/forest/logs":z \
--entrypoint /bin/bash \
ghcr.io/chainsafe/forest:"${FOREST_TAG}" \
-c "$COMMANDS" || exit 1
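
The `\$`-escaped variables in `add_timestamps` suggest these functions are defined inside a quoted command string handed to the container, deferring expansion until the script actually runs. Stripped of that context, the background-poller-plus-EXIT-trap pattern reduces to the following runnable sketch (the endpoint, file name, and `sleep` stand-in are illustrative):

LOG_EXPORT_METRICS=metrics_example.txt  # stand-in log path

function write_metrics {
    while true; do
        # Scrape the metrics endpoint; tolerate transient failures.
        curl --silent --fail --max-time 5 http://localhost:6116/metrics || true
        sleep 5
    done >> "$LOG_EXPORT_METRICS"
}

function cleanup {
    kill "$POLL_PID" 2>/dev/null || true  # stop the poller on any exit path
}
trap cleanup EXIT

write_metrics &  # separate process, so polling never blocks the sync
POLL_PID=$!
sleep 15         # stand-in for the long-running sync and export work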
