Adds reporting function to script

ozdemirozcelik committed Oct 11, 2024
1 parent 8e03bb5 commit c71810d
Showing 1 changed file with 121 additions and 32 deletions: maintenance/getRsTableSize.sh
#!/usr/bin/env bash
##################################################
#
# Queries Redshift for the list of tables ordered by their size and writes the results to a Redshift table:
# - The raw query results are stored locally in a log file with a predefined prefix
# - Log files (with table size information) are uploaded to S3 for processing
# - After successful processing, logs are moved to S3 destination folder
# - Local log files older than 7 days are automatically deleted
#
# The Redshift table column order is:
# - date: Timestamp in the America/Vancouver timezone
# - schema: Schema the table belongs to
# - table: Table name
# - tbl_rows: Row count of the table
# - size: Size of the table in 1 MB data blocks
# - estimated_visible_rows: Estimated rows visible to queries (excludes rows marked for deletion)
# - tombstoned_rows: tbl_rows minus estimated_visible_rows
#
# Optional Arguments:
# -h, --help: Displays help information and usage.
# --force-report: Forces a log report to be generated immediately, bypassing the set interval.
# --report-interval=H: Sets the interval (in H hours) for automatic report generation (default: 1 hour).
# --limit-rows=N: Limits the results written to Redshift to N rows.
#
# Logging for Reporting:
# - The script logs the output of each Redshift table size load (COPY) for reporting.
# - Reports are generated at regular intervals defined by --report-interval (default is 1 hour).
# - The report log file is maintained daily, and logs older than 7 days are automatically deleted.
#
# Reporting:
# - The script checks whether the current hour is divisible by the report interval using the modulo operator (%).
# - If the remainder is 0, the current hour matches the reporting interval.
#   (HOUR: the current hour in 24-hour format, used to check report intervals.)
# - Reports are only generated at the start of the hour (when the minute is 00).
#   (MINUTE: the minute at the start of the job, which ensures reports are only generated at the top of the hour.)
# - The script interprets $HOUR and $MINUTE as decimal (base-10) values by using the 10# prefix.
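#   For example (illustrative values): $((10#09)) evaluates to 9, whereas $((09))
#   would fail because bash treats a leading 0 as octal and 9 is not a valid octal digit.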
# - If --force-report is used, the report will be generated immediately, regardless of the current time.
#
##################################################

# Uncomment the following shell options to expand aliases and source the current ~/.bashrc file if not running as cron
shopt -s expand_aliases
source ~/.bashrc

# Set default command-line values
REPORT_INTERVAL_HOURS=1
FORCE_REPORT=false

# Define variables for logging and output file management
DATE=$(date -u +"%Y-%m-%dT%H:%M:%S%:z")
LOG_PATH="RsTableLogs/"
mkdir -p "$LOG_PATH" # Ensure the log directory exists
LOG_PREFIX="RedShift_Table_Size_"
OUT_FILE=${LOG_PATH}${LOG_PREFIX}$DATE # Combine to create the output file name

# S3 paths and destination table for storing and processing data
TABLE_DEST="maintenance.table_sizes"
S3_PATH="s3://sp-ca-bc-gov-131565110619-12-microservices/client/redshift_table_size/"
S3_DEST="s3://sp-ca-bc-gov-131565110619-12-microservices/processed/good/client/redshift_table_size/"

# Variables used for reporting
REPORT_LOG_PATH="ReportLogs/"
mkdir -p "$REPORT_LOG_PATH" # Ensure the report log directory exists
REPORT_LOG_PREFIX="Report_"
REPORT_DATE=$(date -u +"%Y-%m-%d")
REPORT_LOG_FILE=${REPORT_LOG_PATH}${REPORT_LOG_PREFIX}$REPORT_DATE
CURRENT_TIME=$(date +"%Y-%m-%d %H:%M:%S")
MINUTE=$(date +"%M")
HOUR=$(date +"%H")

# Function to display help
show_help() {
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " --limit-rows=N Limit results written to Redshift to N rows."
echo " --report-interval=H Set the interval to report in every H hours."
echo " --force-report Force to create a report without considering the report interval."
echo " -h, --help Show this help message."
}
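
# Example invocations (illustrative; invocation path assumed):
#   ./getRsTableSize.sh --limit-rows=50 --report-interval=6
#   ./getRsTableSize.sh --force-report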

# Process command-line arguments
for arg in "$@"
do
case $arg in
--limit-rows=*)
LIMIT_ROWS="${arg#*=}"
;;
--report-interval=*)
REPORT_INTERVAL_HOURS="${arg#*=}"
;;
--force-report)
FORCE_REPORT=true
;;
-h|--help)
show_help
exit 0
;;
*)
echo "Warning: Unrecognized argument: $arg" >&2
;;
esac
done

# Validate REPORT_INTERVAL_HOURS
if ! [[ "$REPORT_INTERVAL_HOURS" =~ ^[1-9][0-9]*$ ]]; then
echo "error: --report-interval must be a positive integer." >&2
exit 1
fi

# Validate LIMIT_ROWS (if set)
if [[ -n "$LIMIT_ROWS" ]] && ! [[ "$LIMIT_ROWS" =~ ^[1-9][0-9]*$ ]]; then
echo "error: --limit-rows must be a positive integer." >&2
exit 1
fi
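
# The ^[1-9][0-9]*$ pattern accepts values such as 1, 6, or 24, and rejects 0,
# zero-padded values like 07, negative numbers, and non-numeric input.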

# Construct the SQL query based on LIMIT_ROWS
if [[ -z "$LIMIT_ROWS" ]]; then
# No limit set, return full list of tables
read -r -d '' sql <<EOF
SELECT convert_timezone('America/Vancouver', getdate()) as date, schema, "table", tbl_rows, size, estimated_visible_rows, tbl_rows-estimated_visible_rows AS tombstoned_rows
FROM SVV_TABLE_INFO
ORDER BY size DESC
EOF
else
# Limit the rows returned to the number specified in LIMIT_ROWS
read -r -d '' sql <<EOF
SELECT convert_timezone('America/Vancouver', getdate()) as date, schema, "table", tbl_rows, size, estimated_visible_rows, tbl_rows-estimated_visible_rows AS tombstoned_rows
FROM SVV_TABLE_INFO
ORDER BY size DESC
LIMIT $LIMIT_ROWS
EOF
fi

# Execute the query using the adminuser_rs alias and redirect output to file
adminuser_rs -tqc "$sql" >> $OUT_FILE
# Clean the file before s3 upload
sed -r -i 's/[\t ]//g;/^$/d' $OUT_FILE
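
# Illustrative effect of the cleanup above (row values are made up): a raw row like
#   " public | events | 1024 | 512 | 1000 | 24 "
# becomes the pipe-delimited
#   "public|events|1024|512|1000|24"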

# Copy output to s3
aws s3 --quiet cp "$OUT_FILE" $S3_PATH

# Build table_size table
read -r -d '' rs_copy <<EOF
COPY $TABLE_DEST
FROM '$S3_PATH'
CREDENTIALS
'aws_access_key_id=$AWS_ACCESS_KEY_ID;aws_secret_access_key=$AWS_SECRET_ACCESS_KEY'
delimiter '|';
EOF


# Initiate copy to RedShift
printf "($CURRENT_TIME) " >> "$REPORT_LOG_FILE" 2>&1
adminuser_rs -tqc "$rs_copy" >> "$REPORT_LOG_FILE" 2>&1

# Move log file to processed
aws s3 mv $S3_PATH $S3_DEST --quiet --recursive

# Remove log files +7 days old
find $LOG_PATH -mindepth 1 -mtime +7 -delete

# Check the report interval and echo the log report
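# For example, with --report-interval=6 a report is generated at hours 00, 06, 12, and 18 (when the minute is 00).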
if [ "$FORCE_REPORT" = true ] || ([ $((10#$HOUR % REPORT_INTERVAL_HOURS)) -eq 0 ] && [ $((10#$MINUTE)) -eq 0 ]); then
if [ -s "$REPORT_LOG_FILE" ]; then
# echo the log content to send the report to MAILTO recipients
echo -e "Log report for Redshift table sizes:\n"
tac "$REPORT_LOG_FILE"
else
echo "Log file is empty at $CURRENT_TIME, nothing to report."
fi
fi
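
# Illustrative crontab entry (path and MAILTO address are assumptions): run the
# script hourly at minute 00 so cron mails the echoed report to the recipients, e.g.:
#   MAILTO=ops@example.com
#   0 * * * * /path/to/maintenance/getRsTableSize.sh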

# Delete report log files older than 1 week
find $REPORT_LOG_PATH -mindepth 1 -mtime +7 -delete
