From 7e31daf5c906faf1b90f31eef0b833037fa9fb32 Mon Sep 17 00:00:00 2001 From: Jimmy <5608027+orcutt989@users.noreply.github.com> Date: Tue, 15 Aug 2023 12:41:01 -0400 Subject: [PATCH] snapshotEngine: DigitalOcean complete migration (#586) * add value to skip snap web generation * add configurable value for s3 bucket * web build condition on domain name instead * add secret and configurable s3 bucket override * switch name and mountpath to match format * update secret name and use in zip and upload job * use export instead of temp var * secret name change * expect correct names on secret volume mount * correct path to secret mount * rework credential override to provide logs and error messages * use double quotes for early expansion * remove variable checking since we are feeding in files * bug: container is gone so we cant delete a volume * show commands for debug * wrong default s3 bucket var * turn of tar output for debug * undo command verbosity * Verbose variables * Enable interactive for alias to work * More useful alias message and rm debug messages * Need space after ! * expand aliases instead of interactive * add public-read and move index.html * Website redirects stay in AWS * Set alias only for filesystem artifact upload * rolling redirects working * fix volume indexing * helpful messages * Useful comments for new indexing format * Omit alias functionality in lieu of variable parameters * Fix rolling tarball filename * configmap needs fqdn * cdn isnt working so we're using bucket url * unsilence lz4 logs * wrong aws bucket name * get all snapshot metadata from do spaces * upload metadatas to alt s3 bucket * fix metadata related to website build * initial commit demo functionality * put redirects back * remove merged files * update zip and upload commands for dual creds * sleep for debug * allow override of storage class for scratch volumes * use storage class as set * Container-running OS will not resolve localhost * Remove infinite sleep from debugging * Empty-Commit to trigger CI test * bucket name change to do space * rm fqdn from cm * increase warmer timeout * increase timeout after artifact job create * DO rate limits snapshots per 10m * sleep between creation for rate limiting * need different command for site upload * block snapshot until node ready * pause scheduler if node not ready * add sleep for cpu usage reduction * fix busy waits and document why * fix busy wait on job and more better comments --- .gitignore | 1 + .../snapshotEngine/scripts/snapshot-warmer.sh | 71 +++++++++++-------- .../snapshotEngine/templates/configmap.yaml | 2 +- snapshotEngine/mainJob.yaml | 33 +++++---- snapshotEngine/scratchVolume.yaml | 2 +- snapshotEngine/snapshot-maker.sh | 65 ++++++++++------- snapshotEngine/snapshot-scheduler.sh | 12 ++++ snapshotEngine/volumeFromSnap.yaml | 2 +- snapshotEngine/zip-and-upload.sh | 47 ++++++------ 9 files changed, 140 insertions(+), 95 deletions(-) diff --git a/.gitignore b/.gitignore index c7eda0683..7dbf4eb06 100644 --- a/.gitignore +++ b/.gitignore @@ -15,5 +15,6 @@ build # Ignore mkchain generated files *_values.yaml +*-values.yaml charts/tezos/charts diff --git a/charts/snapshotEngine/scripts/snapshot-warmer.sh b/charts/snapshotEngine/scripts/snapshot-warmer.sh index 269a80bab..817e61c1e 100755 --- a/charts/snapshotEngine/scripts/snapshot-warmer.sh +++ b/charts/snapshotEngine/scripts/snapshot-warmer.sh @@ -27,6 +27,7 @@ delete_old_volumesnapshots() { local max_snapshots="${2##max_snapshots=}" while [ "$(getNumberOfSnapshots readyToUse=true 
--selector="$selector")" -gt "$max_snapshots" ]; do + sleep 5 NUMBER_OF_SNAPSHOTS=$(getNumberOfSnapshots readyToUse=true --selector="$selector") printf "%s Number of snapshots with selector '$selector' is too high at $NUMBER_OF_SNAPSHOTS. Deleting 1.\n" "$(timestamp)" SNAPSHOTS=$(getSnapshotNames readyToUse=true --selector="$selector") @@ -37,31 +38,31 @@ delete_old_volumesnapshots() { done } -delete_stuck_volumesnapshots() { - snapshot_list=$(kubectl get volumesnapshots -o jsonpath="{.items[*].metadata.name}") - arr=(`echo ${snapshot_list}`); - for snapshot_name in "${arr[@]}"; do - snapshot_creation_time_iso8601=$(kubectl get volumesnapshots $snapshot_name -o jsonpath='{.metadata.creationTimestamp}') - snapshot_creation_time_without_offset=${snapshot_creation_time_iso8601::-1} - snapshot_creation_time_unix=$(date -ud "$(echo $snapshot_creation_time_without_offset | sed 's/T/ /')" +%s) - current_date_unix=$(date -u +%s) - snapshot_age_minutes=$(( (current_date_unix - snapshot_creation_time_unix) / 60 )) - # Snapshots should never be older than 6 minutes - # If they are then there's a problem on AWS' end and the snapshot needs to be deleted. - if [ $snapshot_age_minutes -ge 6 ]; then - printf "%s Snasphot %s is %s minutes old. It must be stuck. Attempting to delete...\n" "$(timestamp)" "$snapshot_name" "$snapshot_age_minutes" - err=$(kubectl delete volumesnapshots $snapshot_name 2>&1 > /dev/null) - if [ $? -ne 0 ]; then - printf "%s ERROR##### Unable to delete stuck snapshot %s .\n" "$(timestamp)" "$snapshot_name" - printf "%s Error was: \"%s\"\n" "$(timestamp)" "$err" - sleep 10 - exit 1 - else - printf "%s Successfully deleted stuck snapshot %s! \n" "$(timestamp)" "$snapshot_name" - fi - fi - done -} +# delete_stuck_volumesnapshots() { +# snapshot_list=$(kubectl get volumesnapshots -o jsonpath="{.items[*].metadata.name}") +# arr=(`echo ${snapshot_list}`); +# for snapshot_name in "${arr[@]}"; do +# snapshot_creation_time_iso8601=$(kubectl get volumesnapshots $snapshot_name -o jsonpath='{.metadata.creationTimestamp}') +# snapshot_creation_time_without_offset=${snapshot_creation_time_iso8601::-1} +# snapshot_creation_time_unix=$(date -ud "$(echo $snapshot_creation_time_without_offset | sed 's/T/ /')" +%s) +# current_date_unix=$(date -u +%s) +# snapshot_age_minutes=$(( (current_date_unix - snapshot_creation_time_unix) / 60 )) +# # Snapshots should never be older than 6 minutes +# # If they are then there's a problem on AWS' end and the snapshot needs to be deleted. +# if [ $snapshot_age_minutes -ge 6 ]; then +# printf "%s Snasphot %s is %s minutes old. It must be stuck. Attempting to delete...\n" "$(timestamp)" "$snapshot_name" "$snapshot_age_minutes" +# err=$(kubectl delete volumesnapshots $snapshot_name 2>&1 > /dev/null) +# if [ $? -ne 0 ]; then +# printf "%s ERROR##### Unable to delete stuck snapshot %s .\n" "$(timestamp)" "$snapshot_name" +# printf "%s Error was: \"%s\"\n" "$(timestamp)" "$err" +# sleep 10 +# exit 1 +# else +# printf "%s Successfully deleted stuck snapshot %s! 
\n" "$(timestamp)" "$snapshot_name" +# fi +# fi +# done +# } HISTORY_MODE="$(echo "$NODE_CONFIG" | jq -r ".history_mode")" TARGET_VOLUME="$(echo "$NODE_CONFIG" | jq ".target_volume")" @@ -83,12 +84,23 @@ yq e -i '.spec.volumeSnapshotClassName=strenv(VOLUME_SNAPSHOT_CLASS)' createVolu while true; do + # Pause if nodes are not ready + until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do + printf "%s Tezos node is not ready for snapshot. Check node pod logs. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" + until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do + sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for node + if [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; then + break + fi + done + done + # Remove unlabeled snapshots delete_old_volumesnapshots selector='!history_mode' max_snapshots=0 # Maintain 4 snapshots of a certain history mode delete_old_volumesnapshots selector="history_mode=$HISTORY_MODE" max_snapshots=4 # Check for and delete old stuck snapshots - delete_stuck_volumesnapshots + # delete_stuck_volumesnapshots if ! [ "$(getSnapshotNames readyToUse=false -l history_mode="${HISTORY_MODE}")" ]; then # EBS Snapshot name based on current time and date @@ -113,7 +125,7 @@ while true; do while [ "$(getSnapshotNames readyToUse=false -l history_mode="${HISTORY_MODE}")" ]; do printf "%s Snapshot is still creating...\n" "$(timestamp)" sleep 10 - delete_stuck_volumesnapshots + # delete_stuck_volumesnapshots done end_time=$(date +%s) elapsed=$((end_time - start_time)) @@ -122,6 +134,9 @@ while true; do else printf "%s Snapshot already in progress...\n" "$(timestamp)" sleep 10 - delete_stuck_volumesnapshots + # delete_stuck_volumesnapshots fi + + printf "%s Sleeping for 10m due to Digital Ocean rate limit.\n" "$(timestamp)" + sleep 10m done \ No newline at end of file diff --git a/charts/snapshotEngine/templates/configmap.yaml b/charts/snapshotEngine/templates/configmap.yaml index 503dbe174..b8b2f9ec5 100644 --- a/charts/snapshotEngine/templates/configmap.yaml +++ b/charts/snapshotEngine/templates/configmap.yaml @@ -15,7 +15,7 @@ data: SCHEMA_URL: {{ $.Values.schemaUrl }} S3_BUCKET: {{ $.Values.s3BucketOverride }} CLOUD_PROVIDER: {{ $.Values.cloudProvider }} - FQDN: {{ $.Values.fqdn }} + STORAGE_CLASS: {{$.Values.volumeSnapClass }} kind: ConfigMap metadata: name: snapshot-configmap diff --git a/snapshotEngine/mainJob.yaml b/snapshotEngine/mainJob.yaml index 157b75bb7..9306269ff 100644 --- a/snapshotEngine/mainJob.yaml +++ b/snapshotEngine/mainJob.yaml @@ -53,17 +53,18 @@ spec: # These loops wait on the RPC to come online and prevent log from printing same line # over and over and over again. This prints one line and waits for the RPC to come online for a clean log. 
- until wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; do + until wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; do printf "%s Waiting for node RPC to come online.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" - until wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; do - if wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; then + until wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; do + sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for rpc + if wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; then break fi done done # If somehow we skip the above waiting loop, this kills the job if the RPC is not online. - if ! wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; then + if ! wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; then printf "%s RPC is not online! Exiting...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" exit 1 @@ -76,15 +77,15 @@ spec: # Tezos devs have advised us that it is safer to target HEAD~2 for rolling artifacts. else - HEAD_BLOCK=$(wget -qO- http://localhost:8732/chains/main/blocks/head/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/') + HEAD_BLOCK=$(wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/') TARGET="${HEAD_BLOCK}~2" fi # Get BLOCK_HASH from RPC - wget -qO- http://localhost:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH + wget -qO- http://127.0.0.1:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH # Get BLOCK_HEIGHT from RPC - wget -qO- http://localhost:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"level":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HEIGHT + wget -qO- http://127.0.0.1:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"level":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HEIGHT # We need to check if the block is finalized for archive nodes since we aren't getting # validation by a Tezos snapshot like our rolling tarball. We are just zipping up the data dir from an archive node. 
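Note: the hunks above and below pull single fields (hash, level, timestamp) out of the header JSON returned by the node RPC using sed. Purely as a point of reference, and not part of the patch, the same lookups could be written with jq; this is only a sketch and assumes jq is available in the snapshotting container (the job itself sticks to sed, so it may not be). TARGET, HISTORY_MODE and the 127.0.0.1:8732 address follow the job above.

    # Sketch: fetch the target block header once and extract the fields the job caches.
    HEADER="$(wget -qO- http://127.0.0.1:8732/chains/main/blocks/"${TARGET}"/header)"
    echo "${HEADER}" | jq -r '.hash'  > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH
    echo "${HEADER}" | jq -r '.level' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HEIGHT
    # The job reads the timestamp from head rather than from TARGET.
    wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header | jq -r '.timestamp' \
      > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_TIMESTAMP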
@@ -117,13 +118,13 @@ spec: fi # Get BLOCK_TIMESTAMP from RPC - wget -qO- http://localhost:8732/chains/main/blocks/head/header | sed -E 's/.*"timestamp":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_TIMESTAMP + wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header | sed -E 's/.*"timestamp":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_TIMESTAMP # Old version string /usr/local/bin/octez-node --version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_VERSION # Get new version object from RPC - wget -qO- http://localhost:8732/version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_RPC_VERSION_INFO + wget -qO- http://127.0.0.1:8732/version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_RPC_VERSION_INFO # Print variables for debug printf "%s BLOCK_HASH is...$(cat /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH))\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" @@ -225,8 +226,10 @@ spec: name: snapshot-cache-volume - mountPath: /rolling-tarball-restore name: rolling-tarball-restore - - mountPath: /cloud-provider - name: cloud-provider + - mountPath: /aws-secrets + name: aws-secrets + - mountPath: /do-secrets + name: do-secrets env: - name: HISTORY_MODE value: "" @@ -244,8 +247,12 @@ spec: - name: rolling-tarball-restore persistentVolumeClaim: claimName: rolling-tarball-restore - - name: cloud-provider + - name: aws-secrets secret: - secretName: cloud-provider + secretName: aws-secrets + optional: true + - name: do-secrets + secret: + secretName: do-secrets optional: true backoffLimit: 0 diff --git a/snapshotEngine/scratchVolume.yaml b/snapshotEngine/scratchVolume.yaml index 2def8db1a..4b1affb46 100644 --- a/snapshotEngine/scratchVolume.yaml +++ b/snapshotEngine/scratchVolume.yaml @@ -4,7 +4,7 @@ metadata: name: snapshot-cache-volume namespace: "" spec: - storageClassName: ebs-sc + storageClassName: do-block-storage accessModes: - ReadWriteOnce resources: diff --git a/snapshotEngine/snapshot-maker.sh b/snapshotEngine/snapshot-maker.sh index d4dac4419..5cfcddd10 100755 --- a/snapshotEngine/snapshot-maker.sh +++ b/snapshotEngine/snapshot-maker.sh @@ -4,12 +4,6 @@ cd / ZIP_AND_UPLOAD_JOB_NAME=zip-and-upload-"${HISTORY_MODE}" -# Pause if nodes are not ready -while [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "False" ]; do - printf "%s Tezos node is not ready for snapshot. Check node pod logs. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" - sleep 30 -done - # Delete zip-and-upload job if kubectl get job "${ZIP_AND_UPLOAD_JOB_NAME}"; then printf "%s Old zip-and-upload job exits. 
Attempting to delete.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" @@ -26,6 +20,7 @@ fi if [ "${HISTORY_MODE}" = rolling ]; then if [ "$(kubectl get pvc rolling-tarball-restore)" ]; then printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" + sleep 5 kubectl delete pvc rolling-tarball-restore sleep 5 fi @@ -33,20 +28,22 @@ fi if [ "$(kubectl get pvc "${HISTORY_MODE}"-snapshot-cache-volume)" ]; then printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" + sleep 5 kubectl delete pvc "${HISTORY_MODE}"-snapshot-cache-volume sleep 5 fi if [ "$(kubectl get pvc "${HISTORY_MODE}"-snap-volume)" ]; then printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" + sleep 5 kubectl delete pvc "${HISTORY_MODE}"-snap-volume sleep 5 fi -while [ "$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==false)].metadata.name}' --namespace "${NAMESPACE}" -l history_mode="${HISTORY_MODE}")" ]; do - printf "%s Snapshot already in progress...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" - sleep 10 -done +# while [ "$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==false)].metadata.name}' --namespace "${NAMESPACE}" -l history_mode="${HISTORY_MODE}")" ]; do +# printf "%s Snapshot already in progress...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" +# sleep 10 +# done printf "%s EBS Snapshot finished!\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" @@ -60,6 +57,11 @@ printf "%s Creating scratch volume for artifact processing...\n" "$(date "+%Y-%m # Set namespace for both "${HISTORY_MODE}"-snapshot-cache-volume NAMESPACE="${NAMESPACE}" yq e -i '.metadata.namespace=strenv(NAMESPACE)' scratchVolume.yaml +# Set storage class for sratch volume yaml +STORAGE_CLASS="${STORAGE_CLASS}" yq e -i '.spec.storageClassName=strenv(STORAGE_CLASS)' scratchVolume.yaml + +sleep 5 + # Create "${HISTORY_MODE}"-snapshot-cache-volume printf "%s Creating PVC ${HISTORY_MODE}-snapshot-cache-volume.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" NAME="${HISTORY_MODE}-snapshot-cache-volume" yq e -i '.metadata.name=strenv(NAME)' scratchVolume.yaml @@ -73,6 +75,7 @@ printf "%s PVC %s created.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" "${HISTORY_MOD if [ "${HISTORY_MODE}" = rolling ]; then + sleep 5 # Create rolling-tarball-restore printf "%s Creating PVC rolling-tarball-restore..\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" NAME="rolling-tarball-restore" yq e -i '.metadata.name=strenv(NAME)' scratchVolume.yaml @@ -87,6 +90,9 @@ fi ## Snapshot volume namespace NAMESPACE="${NAMESPACE}" yq e -i '.metadata.namespace=strenv(NAMESPACE)' volumeFromSnap.yaml +# Set storageclass for restored volume +STORAGE_CLASS="${STORAGE_CLASS}" yq e -i '.spec.storageClassName=strenv(STORAGE_CLASS)' volumeFromSnap.yaml + ## Snapshot volume name VOLUME_NAME="${HISTORY_MODE}-snap-volume" VOLUME_NAME="${VOLUME_NAME}" yq e -i '.metadata.name=strenv(VOLUME_NAME)' volumeFromSnap.yaml @@ -111,6 +117,8 @@ printf "%s We're rounding up and adding 20%% , volume size will be %sGB.\n" "$(d RESTORE_VOLUME_SIZE="${RESTORE_VOLUME_SIZE}Gi" yq e -i '.spec.resources.requests.storage=strenv(RESTORE_VOLUME_SIZE)' volumeFromSnap.yaml +sleep 5 + printf "%s Creating volume from snapshot ${NEWEST_SNAPSHOT}.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" if ! 
kubectl apply -f volumeFromSnap.yaml then @@ -175,22 +183,22 @@ if [ "${HISTORY_MODE}" = archive ]; then yq eval -i "del(.spec.template.spec.containers[0].volumeMounts[2])" mainJob.yaml fi -# Switch alternate cloud provider secret name based on actual cloud provider -if [[ -n "${CLOUD_PROVIDER}" ]]; then - # Need to account for dynamic volumes removed above. For example if not rolling node then rolling volume is deleted. - SECRET_NAME="${NAMESPACE}-secret" - # Index of zip-and-upload container changes depending on if rolling job or archive job - NUM_CONTAINERS=$(yq e '.spec.template.spec.containers | length' mainJob.yaml) - # Index of mounts also changes depending on history mode - NUM_CONTAINER_MOUNTS=$(yq e ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts | length" mainJob.yaml ) - # Secret volume mount is last item in list of volumeMounts for the zip and upload container - SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts[$(( NUM_CONTAINER_MOUNTS - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml - # Index of job volumes change depending on history mode - NUM_JOB_VOLUMES=$(yq e '.spec.template.spec.volumes | length' mainJob.yaml ) - # Setting job secret volume to value set by workflow - SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml - SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].secret.secretName=strenv(SECRET_NAME)" mainJob.yaml -fi +# # Switch alternate cloud provider secret name based on actual cloud provider +# if [[ -n "${CLOUD_PROVIDER}" ]]; then +# # Need to account for dynamic volumes removed above. For example if not rolling node then rolling volume is deleted. +# SECRET_NAME="${NAMESPACE}-secret" +# # Index of zip-and-upload container changes depending on if rolling job or archive job +# NUM_CONTAINERS=$(yq e '.spec.template.spec.containers | length' mainJob.yaml) +# # Index of mounts also changes depending on history mode +# NUM_CONTAINER_MOUNTS=$(yq e ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts | length" mainJob.yaml ) +# # Secret volume mount is last item in list of volumeMounts for the zip and upload container +# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts[$(( NUM_CONTAINER_MOUNTS - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml +# # Index of job volumes change depending on history mode +# NUM_JOB_VOLUMES=$(yq e '.spec.template.spec.volumes | length' mainJob.yaml ) +# # Setting job secret volume to value set by workflow +# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml +# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].secret.secretName=strenv(SECRET_NAME)" mainJob.yaml +# fi # Service account to be used by entire zip-and-upload job. 
SERVICE_ACCOUNT="${SERVICE_ACCOUNT}" yq e -i '.spec.template.spec.serviceAccountName=strenv(SERVICE_ACCOUNT)' mainJob.yaml @@ -204,12 +212,13 @@ then exit 1 fi -sleep 5 +sleep 20 # Wait for snapshotting job to complete while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do printf "%s Waiting for zip-and-upload job to complete.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do + sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for job if [ "$(kubectl get pod -l job-name=zip-and-upload-"${HISTORY_MODE}" --namespace="${NAMESPACE}"| grep -i -e error -e evicted -e pending)" ] || \ [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace="${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].type}')" ] ; then printf "%s Zip-and-upload job failed. This job will end and a new snapshot will be taken.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" @@ -226,5 +235,7 @@ if ! [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMES fi printf "%s Deleting temporary snapshot volume.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" +sleep 5 kubectl delete -f volumeFromSnap.yaml | while IFS= read -r line; do printf '%s %s\n' "$(date "+%Y-%m-%d %H:%M:%S" "$@")" "$line"; done +sleep 5 kubectl delete job snapshot-maker --namespace "${NAMESPACE}" diff --git a/snapshotEngine/snapshot-scheduler.sh b/snapshotEngine/snapshot-scheduler.sh index 6924b2c95..d8710df16 100755 --- a/snapshotEngine/snapshot-scheduler.sh +++ b/snapshotEngine/snapshot-scheduler.sh @@ -18,6 +18,18 @@ JOB_NAME=snapshot-maker-"${HISTORY_MODE}"-node JOB_NAME="${JOB_NAME}" yq e -i '.metadata.name=strenv(JOB_NAME)' snapshotMakerJob.yaml while true; do + + # Pause if nodes are not ready + until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do + printf "%s Tezos node is not ready for snapshot. Check node pod logs. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" + until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do + sleep 1m # without sleep, this loop is a "busy wait". 
this sleep vastly reduces CPU usage while we wait for node + if [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; then + break + fi + done + done + # Job exists if [ "$(kubectl get jobs "${JOB_NAME}" --namespace "${NAMESPACE}")" ]; then printf "%s Snapshot-maker job exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" diff --git a/snapshotEngine/volumeFromSnap.yaml b/snapshotEngine/volumeFromSnap.yaml index 8b4952f40..1f1017c94 100644 --- a/snapshotEngine/volumeFromSnap.yaml +++ b/snapshotEngine/volumeFromSnap.yaml @@ -4,7 +4,7 @@ metadata: name: "" namespace: "" spec: - storageClassName: ebs-sc + storageClassName: do-block-storage dataSource: name: "" kind: VolumeSnapshot diff --git a/snapshotEngine/zip-and-upload.sh b/snapshotEngine/zip-and-upload.sh index 1ba6dbf93..665a084de 100755 --- a/snapshotEngine/zip-and-upload.sh +++ b/snapshotEngine/zip-and-upload.sh @@ -6,7 +6,7 @@ BLOCK_TIMESTAMP=$(cat /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_TIMESTAMP) #TEZOS_VERSION=$(cat /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_VERSION) NETWORK="${NAMESPACE%%-*}" # Export for python -export S3_BUCKET="${S3_BUCKET:-${NAMESPACE%-*}.${SNAPSHOT_WEBSITE_DOMAIN_NAME}}" +export S3_BUCKET="${NAMESPACE}" TEZOS_RPC_VERSION_INFO="$(cat /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_RPC_VERSION_INFO)" TEZOS_VERSION="$(echo "${TEZOS_RPC_VERSION_INFO}" | jq -r .version)" @@ -22,6 +22,7 @@ REDIRECT_ROOT="/" # CND Endpoint used for CDN URLs (different from command endpoint) if [[ "${CLOUD_PROVIDER}" == "digitalocean" ]]; then + FQDN=${S3_BUCKET}.nyc3.digitaloceanspaces.com URL="${FQDN}" REDIRECT_ROOT="https://${FQDN}/" else @@ -35,12 +36,14 @@ cd / # so we opted for this. # $1 is file name set_aws_command_creds(){ - if [[ -n ${CLOUD_PROVIDER} ]]; then - echo "AWS_ACCESS_KEY_ID=$(cat /cloud-provider/access-id) \ - AWS_SECRET_ACCESS_KEY=$(cat /cloud-provider/secret-key) \ - aws --endpoint-url https://nyc3.digitaloceanspaces.com " + if [[ $1 == "aws" ]]; then + echo "AWS_ACCESS_KEY_ID=$(cat /aws-secrets/aws-access-id) \ + AWS_SECRET_ACCESS_KEY=$(cat /aws-secrets/aws-secret-key) \ + aws " else - echo "aws " + echo "AWS_ACCESS_KEY_ID=$(cat /do-secrets/do-spaces-access-id) \ + AWS_SECRET_ACCESS_KEY=$(cat /do-secrets/do-spaces-secret-key) \ + aws --endpoint-url https://nyc3.digitaloceanspaces.com " fi } @@ -65,8 +68,8 @@ if [[ "${HISTORY_MODE}" = archive ]]; then # Instead of guessing size, you can use expected-size which tells S3 how big the file is and it calculates the size for you. # However if the file gets bigger than your expected size, the multipart upload fails because it uses a part size outside of the bounds (1-10000) # This gets the old archive tarball size and then adds 10%. Archive tarballs dont seem to grow more than that. 
- if aws s3 ls s3://"${AWS_S3_BUCKET}" | grep archive-tarball-metadata; then #Use last file for expected size if it exists - EXPECTED_SIZE=$(curl -L http://"${AWS_S3_BUCKET}"/archive-tarball-metadata 2>/dev/null | jq -r '.filesize_bytes' | awk '{print $1*1.1}' | awk '{print ($0-int($0)>0)?int($0)+1:int($0)}') + if eval "$(set_aws_command_creds)" s3 ls s3://"${S3_BUCKET}" | grep archive-tarball-metadata; then #Use last file for expected size if it exists + EXPECTED_SIZE=$(curl -L http://"${S3_BUCKET}"/archive-tarball-metadata 2>/dev/null | jq -r '.filesize_bytes' | awk '{print $1*1.1}' | awk '{print ($0-int($0)>0)?int($0)+1:int($0)}') else EXPECTED_SIZE=1000000000000 #1000GB Arbitrary filesize for initial value. Only used if no archive-tarball-metadata exists. IE starting up test network fi @@ -162,7 +165,7 @@ if [[ "${HISTORY_MODE}" = archive ]]; then fi # Upload redirect file and set header for previously uploaded LZ4 File - if ! aws s3 cp archive-tarball s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ARCHIVE_TARBALL_FILENAME}" --cache-control 'no-cache'; then + if ! eval "$(set_aws_command_creds "aws")" s3 cp archive-tarball s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ARCHIVE_TARBALL_FILENAME}" --cache-control 'no-cache'; then printf "%s Archive Tarball : Error uploading ${NETWORK}-archive-tarball. to S3\n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s Archive Tarball : Upload of ${NETWORK}-archive-tarball successful to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -176,7 +179,7 @@ if [[ "${HISTORY_MODE}" = archive ]]; then fi # Upload archive tarball json redirect file and set header for previously uploaded archive tarball json File - if ! aws s3 cp archive-tarball-metadata s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ARCHIVE_TARBALL_FILENAME}".json --cache-control 'no-cache'; then + if ! eval "$(set_aws_command_creds "aws")" s3 cp archive-tarball-metadata s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ARCHIVE_TARBALL_FILENAME}".json --cache-control 'no-cache'; then printf "%s archive Tarball : Error uploading ${NETWORK}-archive-tarball-metadata file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s archive Tarball : Uploaded ${NETWORK}-archive-tarball-metadata file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -244,8 +247,8 @@ if [ "${HISTORY_MODE}" = rolling ]; then # However if the file gets bigger than your expected size, the multipart upload fails because it uses a part size outside of the bounds (1-10000) # This gets the old rolling tarball size and then adds 10%. rolling tarballs dont seem to grow more than that. printf "%s Rolling Tarball: Getting last rolling tarball filesize for multipart upload...\n" "$(date "+%Y-%m-%d %H:%M:%S")" - if aws s3 ls s3://"${AWS_S3_BUCKET}" | grep rolling-tarball-metadata; then #Use last file for expected size if it exists - EXPECTED_SIZE=$(curl -L http://"${AWS_S3_BUCKET}"/rolling-tarball-metadata 2>/dev/null | jq -r '.filesize_bytes' | awk '{print $1*1.1}' | awk '{print ($0-int($0)>0)?int($0)+1:int($0)}') + if eval "$(set_aws_command_creds "aws")" s3 ls s3://"${S3_BUCKET}" | grep rolling-tarball-metadata; then #Use last file for expected size if it exists + EXPECTED_SIZE=$(curl -L http://"${S3_BUCKET}"/rolling-tarball-metadata 2>/dev/null | jq -r '.filesize_bytes' | awk '{print $1*1.1}' | awk '{print ($0-int($0)>0)?int($0)+1:int($0)}') printf "%s Rolling Tarball: Bucket has existing artifact metadata. 
\n" "$(date "+%Y-%m-%d %H:%M:%S")" printf "%s Rolling Tarball: Expected size is - %s \n" "$(date "+%Y-%m-%d %H:%M:%S")" "${EXPECTED_SIZE}" else @@ -344,7 +347,7 @@ if [ "${HISTORY_MODE}" = rolling ]; then fi # Upload redirect file and set header for previously uploaded LZ4 File - if ! aws s3 cp rolling-tarball s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_TARBALL_FILENAME}" --cache-control 'no-cache'; then + if ! eval "$(set_aws_command_creds "aws")" s3 cp rolling-tarball s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_TARBALL_FILENAME}" --cache-control 'no-cache'; then printf "%s Rolling Tarball : Error uploading ${NETWORK}-rolling-tarball file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s Rolling Tarball : Uploaded ${NETWORK}-rolling-tarball file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -358,7 +361,7 @@ if [ "${HISTORY_MODE}" = rolling ]; then fi # Upload rolling tarball json redirect file and set header for previously uploaded rolling tarball json File - if ! aws s3 cp rolling-tarball-metadata s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_TARBALL_FILENAME}".json --cache-control 'no-cache'; then + if ! eval "$(set_aws_command_creds "aws")" s3 cp rolling-tarball-metadata s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_TARBALL_FILENAME}".json --cache-control 'no-cache'; then printf "%s Rolling Tarball : Error uploading ${NETWORK}-rolling-tarball-metadata file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s Rolling Tarball : Uploaded ${NETWORK}-rolling-tarball-metadata file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -459,7 +462,7 @@ if [ "${HISTORY_MODE}" = rolling ]; then touch rolling # Upload rolling tezos snapshot redirect object - if ! aws s3 cp rolling s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_SNAPSHOT_FILENAME}" --cache-control 'no-cache'; then + if ! eval "$(set_aws_command_creds "aws")" s3 cp rolling s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_SNAPSHOT_FILENAME}" --cache-control 'no-cache'; then printf "%s Rolling Tezos : Error uploading redirect object for ${ROLLING_SNAPSHOT} to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s Rolling Tezos : Successfully uploaded redirect object for ${ROLLING_SNAPSHOT} to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -473,7 +476,7 @@ if [ "${HISTORY_MODE}" = rolling ]; then fi # Upload rolling snapshot json redirect file and set header for previously uploaded rolling snapshot json File - if ! aws s3 cp rolling-snapshot-metadata s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_SNAPSHOT_FILENAME}".json --cache-control 'no-cache'; then + if ! eval "$(set_aws_command_creds "aws")" s3 cp rolling-snapshot-metadata s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_SNAPSHOT_FILENAME}".json --cache-control 'no-cache'; then printf "%s Rolling snapshot : Error uploading ${NETWORK}-rolling-snapshot-metadata file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s Rolling snapshot : Uploaded ${NETWORK}-rolling-snapshot-metadata file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -491,7 +494,7 @@ if [[ -n "${SNAPSHOT_WEBSITE_DOMAIN_NAME}" ]]; then # Network bucket redirect # Redirects from network.website.com to website.com/network touch index.html - if ! aws s3 cp index.html s3://"${AWS_S3_BUCKET}" --website-redirect https://"${SNAPSHOT_WEBSITE_DOMAIN_NAME}"/"${NETWORK}" --cache-control 'no-cache'; then + if ! 
eval "$(set_aws_command_creds "aws")" s3 cp index.html s3://"${AWS_S3_BUCKET}" --website-redirect https://"${SNAPSHOT_WEBSITE_DOMAIN_NAME}"/"${NETWORK}" --cache-control 'no-cache'; then printf "%s ERROR ##### Could not upload network site redirect.\n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s Successfully uploaded network site redirect.\n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -551,7 +554,7 @@ if [[ -n "${SNAPSHOT_WEBSITE_DOMAIN_NAME}" ]]; then fi # Upload tezos-snapshots.json - if ! aws s3 cp tezos-snapshots.json s3://"${SNAPSHOT_WEBSITE_DOMAIN_NAME}"/tezos-snapshots.json; then + if ! eval "$(set_aws_command_creds "aws")" s3 cp tezos-snapshots.json s3://"${SNAPSHOT_WEBSITE_DOMAIN_NAME}"/tezos-snapshots.json; then printf "%s Upload tezos-snapshots.json : Error uploading file tezos-snapshots.json to S3. \n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s Upload tezos-snapshots.json : File tezos-snapshots.json successfully uploaded to S3. \n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -563,14 +566,10 @@ if [[ -n "${SNAPSHOT_WEBSITE_DOMAIN_NAME}" ]]; then # Generate HTML from markdown and metadata chown -R jekyll:jekyll ./* - bundle exec jekyll build + bundle exec jekyll build --quiet # Upload chain page (index.html and assets) to root of website bucket - if ! aws s3 cp _site/ s3://"${SNAPSHOT_WEBSITE_DOMAIN_NAME}" --recursive --include "*"; then - printf "%s Website Build & Deploy : Error uploading site to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" - else - printf "%s Website Build & Deploy : Successful uploaded website to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" - fi + eval "$(set_aws_command_creds "aws")" s3 cp _site/ s3://"${SNAPSHOT_WEBSITE_DOMAIN_NAME}" --recursive | grep "*" fi SLEEP_TIME=0m