From 7e31daf5c906faf1b90f31eef0b833037fa9fb32 Mon Sep 17 00:00:00 2001 From: Jimmy <5608027+orcutt989@users.noreply.github.com> Date: Tue, 15 Aug 2023 12:41:01 -0400 Subject: [PATCH] snapshotEngine: DigitalOcean complete migration (#586) * add value to skip snap web generation * add configurable value for s3 bucket * web build condition on domain name instead * add secret and configurable s3 bucket override * switch name and mountpath to match format * update secret name and use in zip and upload job * use export instead of temp var * secret name change * expect correct names on secret volume mount * correct path to secret mount * rework credential override to provide logs and error messages * use double quotes for early expansion * remove variable checking since we are feeding in files * bug: container is gone so we cant delete a volume * show commands for debug * wrong default s3 bucket var * turn of tar output for debug * undo command verbosity * Verbose variables * Enable interactive for alias to work * More useful alias message and rm debug messages * Need space after ! * expand aliases instead of interactive * add public-read and move index.html * Website redirects stay in AWS * Set alias only for filesystem artifact upload * rolling redirects working * fix volume indexing * helpful messages * Useful comments for new indexing format * Omit alias functionality in lieu of variable parameters * Fix rolling tarball filename * configmap needs fqdn * cdn isnt working so we're using bucket url * unsilence lz4 logs * wrong aws bucket name * get all snapshot metadata from do spaces * upload metadatas to alt s3 bucket * fix metadata related to website build * initial commit demo functionality * put redirects back * remove merged files * update zip and upload commands for dual creds * sleep for debug * allow override of storage class for scratch volumes * use storage class as set * Container-running OS will not resolve localhost * Remove infinite sleep from debugging * Empty-Commit to trigger CI test * bucket name change to do space * rm fqdn from cm * increase warmer timeout * increase timeout after artifact job create * DO rate limits snapshots per 10m * sleep between creation for rate limiting * need different command for site upload * block snapshot until node ready * pause scheduler if node not ready * add sleep for cpu usage reduction * fix busy waits and document why * fix busy wait on job and more better comments --- .gitignore | 1 + .../snapshotEngine/scripts/snapshot-warmer.sh | 71 +++++++++++-------- .../snapshotEngine/templates/configmap.yaml | 2 +- snapshotEngine/mainJob.yaml | 33 +++++---- snapshotEngine/scratchVolume.yaml | 2 +- snapshotEngine/snapshot-maker.sh | 65 ++++++++++------- snapshotEngine/snapshot-scheduler.sh | 12 ++++ snapshotEngine/volumeFromSnap.yaml | 2 +- snapshotEngine/zip-and-upload.sh | 47 ++++++------ 9 files changed, 140 insertions(+), 95 deletions(-) diff --git a/.gitignore b/.gitignore index c7eda0683..7dbf4eb06 100644 --- a/.gitignore +++ b/.gitignore @@ -15,5 +15,6 @@ build # Ignore mkchain generated files *_values.yaml +*-values.yaml charts/tezos/charts diff --git a/charts/snapshotEngine/scripts/snapshot-warmer.sh b/charts/snapshotEngine/scripts/snapshot-warmer.sh index 269a80bab..817e61c1e 100755 --- a/charts/snapshotEngine/scripts/snapshot-warmer.sh +++ b/charts/snapshotEngine/scripts/snapshot-warmer.sh @@ -27,6 +27,7 @@ delete_old_volumesnapshots() { local max_snapshots="${2##max_snapshots=}" while [ "$(getNumberOfSnapshots readyToUse=true 
--selector="$selector")" -gt "$max_snapshots" ]; do + sleep 5 NUMBER_OF_SNAPSHOTS=$(getNumberOfSnapshots readyToUse=true --selector="$selector") printf "%s Number of snapshots with selector '$selector' is too high at $NUMBER_OF_SNAPSHOTS. Deleting 1.\n" "$(timestamp)" SNAPSHOTS=$(getSnapshotNames readyToUse=true --selector="$selector") @@ -37,31 +38,31 @@ delete_old_volumesnapshots() { done } -delete_stuck_volumesnapshots() { - snapshot_list=$(kubectl get volumesnapshots -o jsonpath="{.items[*].metadata.name}") - arr=(`echo ${snapshot_list}`); - for snapshot_name in "${arr[@]}"; do - snapshot_creation_time_iso8601=$(kubectl get volumesnapshots $snapshot_name -o jsonpath='{.metadata.creationTimestamp}') - snapshot_creation_time_without_offset=${snapshot_creation_time_iso8601::-1} - snapshot_creation_time_unix=$(date -ud "$(echo $snapshot_creation_time_without_offset | sed 's/T/ /')" +%s) - current_date_unix=$(date -u +%s) - snapshot_age_minutes=$(( (current_date_unix - snapshot_creation_time_unix) / 60 )) - # Snapshots should never be older than 6 minutes - # If they are then there's a problem on AWS' end and the snapshot needs to be deleted. - if [ $snapshot_age_minutes -ge 6 ]; then - printf "%s Snasphot %s is %s minutes old. It must be stuck. Attempting to delete...\n" "$(timestamp)" "$snapshot_name" "$snapshot_age_minutes" - err=$(kubectl delete volumesnapshots $snapshot_name 2>&1 > /dev/null) - if [ $? -ne 0 ]; then - printf "%s ERROR##### Unable to delete stuck snapshot %s .\n" "$(timestamp)" "$snapshot_name" - printf "%s Error was: \"%s\"\n" "$(timestamp)" "$err" - sleep 10 - exit 1 - else - printf "%s Successfully deleted stuck snapshot %s! \n" "$(timestamp)" "$snapshot_name" - fi - fi - done -} +# delete_stuck_volumesnapshots() { +# snapshot_list=$(kubectl get volumesnapshots -o jsonpath="{.items[*].metadata.name}") +# arr=(`echo ${snapshot_list}`); +# for snapshot_name in "${arr[@]}"; do +# snapshot_creation_time_iso8601=$(kubectl get volumesnapshots $snapshot_name -o jsonpath='{.metadata.creationTimestamp}') +# snapshot_creation_time_without_offset=${snapshot_creation_time_iso8601::-1} +# snapshot_creation_time_unix=$(date -ud "$(echo $snapshot_creation_time_without_offset | sed 's/T/ /')" +%s) +# current_date_unix=$(date -u +%s) +# snapshot_age_minutes=$(( (current_date_unix - snapshot_creation_time_unix) / 60 )) +# # Snapshots should never be older than 6 minutes +# # If they are then there's a problem on AWS' end and the snapshot needs to be deleted. +# if [ $snapshot_age_minutes -ge 6 ]; then +# printf "%s Snasphot %s is %s minutes old. It must be stuck. Attempting to delete...\n" "$(timestamp)" "$snapshot_name" "$snapshot_age_minutes" +# err=$(kubectl delete volumesnapshots $snapshot_name 2>&1 > /dev/null) +# if [ $? -ne 0 ]; then +# printf "%s ERROR##### Unable to delete stuck snapshot %s .\n" "$(timestamp)" "$snapshot_name" +# printf "%s Error was: \"%s\"\n" "$(timestamp)" "$err" +# sleep 10 +# exit 1 +# else +# printf "%s Successfully deleted stuck snapshot %s! 
\n" "$(timestamp)" "$snapshot_name" +# fi +# fi +# done +# } HISTORY_MODE="$(echo "$NODE_CONFIG" | jq -r ".history_mode")" TARGET_VOLUME="$(echo "$NODE_CONFIG" | jq ".target_volume")" @@ -83,12 +84,23 @@ yq e -i '.spec.volumeSnapshotClassName=strenv(VOLUME_SNAPSHOT_CLASS)' createVolu while true; do + # Pause if nodes are not ready + until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do + printf "%s Tezos node is not ready for snapshot. Check node pod logs. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" + until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do + sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for node + if [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; then + break + fi + done + done + # Remove unlabeled snapshots delete_old_volumesnapshots selector='!history_mode' max_snapshots=0 # Maintain 4 snapshots of a certain history mode delete_old_volumesnapshots selector="history_mode=$HISTORY_MODE" max_snapshots=4 # Check for and delete old stuck snapshots - delete_stuck_volumesnapshots + # delete_stuck_volumesnapshots if ! [ "$(getSnapshotNames readyToUse=false -l history_mode="${HISTORY_MODE}")" ]; then # EBS Snapshot name based on current time and date @@ -113,7 +125,7 @@ while true; do while [ "$(getSnapshotNames readyToUse=false -l history_mode="${HISTORY_MODE}")" ]; do printf "%s Snapshot is still creating...\n" "$(timestamp)" sleep 10 - delete_stuck_volumesnapshots + # delete_stuck_volumesnapshots done end_time=$(date +%s) elapsed=$((end_time - start_time)) @@ -122,6 +134,9 @@ while true; do else printf "%s Snapshot already in progress...\n" "$(timestamp)" sleep 10 - delete_stuck_volumesnapshots + # delete_stuck_volumesnapshots fi + + printf "%s Sleeping for 10m due to Digital Ocean rate limit.\n" "$(timestamp)" + sleep 10m done \ No newline at end of file diff --git a/charts/snapshotEngine/templates/configmap.yaml b/charts/snapshotEngine/templates/configmap.yaml index 503dbe174..b8b2f9ec5 100644 --- a/charts/snapshotEngine/templates/configmap.yaml +++ b/charts/snapshotEngine/templates/configmap.yaml @@ -15,7 +15,7 @@ data: SCHEMA_URL: {{ $.Values.schemaUrl }} S3_BUCKET: {{ $.Values.s3BucketOverride }} CLOUD_PROVIDER: {{ $.Values.cloudProvider }} - FQDN: {{ $.Values.fqdn }} + STORAGE_CLASS: {{$.Values.volumeSnapClass }} kind: ConfigMap metadata: name: snapshot-configmap diff --git a/snapshotEngine/mainJob.yaml b/snapshotEngine/mainJob.yaml index 157b75bb7..9306269ff 100644 --- a/snapshotEngine/mainJob.yaml +++ b/snapshotEngine/mainJob.yaml @@ -53,17 +53,18 @@ spec: # These loops wait on the RPC to come online and prevent log from printing same line # over and over and over again. This prints one line and waits for the RPC to come online for a clean log. 
- until wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; do + until wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; do printf "%s Waiting for node RPC to come online.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" - until wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; do - if wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; then + until wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; do + sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for rpc + if wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; then break fi done done # If somehow we skip the above waiting loop, this kills the job if the RPC is not online. - if ! wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; then + if ! wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; then printf "%s RPC is not online! Exiting...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" exit 1 @@ -76,15 +77,15 @@ spec: # Tezos devs have advised us that it is safer to target HEAD~2 for rolling artifacts. else - HEAD_BLOCK=$(wget -qO- http://localhost:8732/chains/main/blocks/head/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/') + HEAD_BLOCK=$(wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/') TARGET="${HEAD_BLOCK}~2" fi # Get BLOCK_HASH from RPC - wget -qO- http://localhost:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH + wget -qO- http://127.0.0.1:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH # Get BLOCK_HEIGHT from RPC - wget -qO- http://localhost:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"level":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HEIGHT + wget -qO- http://127.0.0.1:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"level":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HEIGHT # We need to check if the block is finalized for archive nodes since we aren't getting # validation by a Tezos snapshot like our rolling tarball. We are just zipping up the data dir from an archive node. 
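Note: the hunks above and below pull single fields (hash, level, timestamp) out of the header JSON returned by the node RPC using sed. Purely as a point of reference, and not part of the patch, the same lookups could be written with jq; this is only a sketch and assumes jq is available in the snapshotting container (the job itself sticks to sed, so it may not be). TARGET, HISTORY_MODE and the 127.0.0.1:8732 address follow the job above.

    # Sketch: fetch the target block header once and extract the fields the job caches.
    HEADER="$(wget -qO- http://127.0.0.1:8732/chains/main/blocks/"${TARGET}"/header)"
    echo "${HEADER}" | jq -r '.hash'  > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH
    echo "${HEADER}" | jq -r '.level' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HEIGHT
    # The job reads the timestamp from head rather than from TARGET.
    wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header | jq -r '.timestamp' \
      > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_TIMESTAMP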
@@ -117,13 +118,13 @@ spec: fi # Get BLOCK_TIMESTAMP from RPC - wget -qO- http://localhost:8732/chains/main/blocks/head/header | sed -E 's/.*"timestamp":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_TIMESTAMP + wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header | sed -E 's/.*"timestamp":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_TIMESTAMP # Old version string /usr/local/bin/octez-node --version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_VERSION # Get new version object from RPC - wget -qO- http://localhost:8732/version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_RPC_VERSION_INFO + wget -qO- http://127.0.0.1:8732/version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_RPC_VERSION_INFO # Print variables for debug printf "%s BLOCK_HASH is...$(cat /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH))\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" @@ -225,8 +226,10 @@ spec: name: snapshot-cache-volume - mountPath: /rolling-tarball-restore name: rolling-tarball-restore - - mountPath: /cloud-provider - name: cloud-provider + - mountPath: /aws-secrets + name: aws-secrets + - mountPath: /do-secrets + name: do-secrets env: - name: HISTORY_MODE value: "" @@ -244,8 +247,12 @@ spec: - name: rolling-tarball-restore persistentVolumeClaim: claimName: rolling-tarball-restore - - name: cloud-provider + - name: aws-secrets secret: - secretName: cloud-provider + secretName: aws-secrets + optional: true + - name: do-secrets + secret: + secretName: do-secrets optional: true backoffLimit: 0 diff --git a/snapshotEngine/scratchVolume.yaml b/snapshotEngine/scratchVolume.yaml index 2def8db1a..4b1affb46 100644 --- a/snapshotEngine/scratchVolume.yaml +++ b/snapshotEngine/scratchVolume.yaml @@ -4,7 +4,7 @@ metadata: name: snapshot-cache-volume namespace: "" spec: - storageClassName: ebs-sc + storageClassName: do-block-storage accessModes: - ReadWriteOnce resources: diff --git a/snapshotEngine/snapshot-maker.sh b/snapshotEngine/snapshot-maker.sh index d4dac4419..5cfcddd10 100755 --- a/snapshotEngine/snapshot-maker.sh +++ b/snapshotEngine/snapshot-maker.sh @@ -4,12 +4,6 @@ cd / ZIP_AND_UPLOAD_JOB_NAME=zip-and-upload-"${HISTORY_MODE}" -# Pause if nodes are not ready -while [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "False" ]; do - printf "%s Tezos node is not ready for snapshot. Check node pod logs. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" - sleep 30 -done - # Delete zip-and-upload job if kubectl get job "${ZIP_AND_UPLOAD_JOB_NAME}"; then printf "%s Old zip-and-upload job exits. 
Attempting to delete.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" @@ -26,6 +20,7 @@ fi if [ "${HISTORY_MODE}" = rolling ]; then if [ "$(kubectl get pvc rolling-tarball-restore)" ]; then printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" + sleep 5 kubectl delete pvc rolling-tarball-restore sleep 5 fi @@ -33,20 +28,22 @@ fi if [ "$(kubectl get pvc "${HISTORY_MODE}"-snapshot-cache-volume)" ]; then printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" + sleep 5 kubectl delete pvc "${HISTORY_MODE}"-snapshot-cache-volume sleep 5 fi if [ "$(kubectl get pvc "${HISTORY_MODE}"-snap-volume)" ]; then printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" + sleep 5 kubectl delete pvc "${HISTORY_MODE}"-snap-volume sleep 5 fi -while [ "$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==false)].metadata.name}' --namespace "${NAMESPACE}" -l history_mode="${HISTORY_MODE}")" ]; do - printf "%s Snapshot already in progress...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" - sleep 10 -done +# while [ "$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==false)].metadata.name}' --namespace "${NAMESPACE}" -l history_mode="${HISTORY_MODE}")" ]; do +# printf "%s Snapshot already in progress...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" +# sleep 10 +# done printf "%s EBS Snapshot finished!\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" @@ -60,6 +57,11 @@ printf "%s Creating scratch volume for artifact processing...\n" "$(date "+%Y-%m # Set namespace for both "${HISTORY_MODE}"-snapshot-cache-volume NAMESPACE="${NAMESPACE}" yq e -i '.metadata.namespace=strenv(NAMESPACE)' scratchVolume.yaml +# Set storage class for sratch volume yaml +STORAGE_CLASS="${STORAGE_CLASS}" yq e -i '.spec.storageClassName=strenv(STORAGE_CLASS)' scratchVolume.yaml + +sleep 5 + # Create "${HISTORY_MODE}"-snapshot-cache-volume printf "%s Creating PVC ${HISTORY_MODE}-snapshot-cache-volume.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" NAME="${HISTORY_MODE}-snapshot-cache-volume" yq e -i '.metadata.name=strenv(NAME)' scratchVolume.yaml @@ -73,6 +75,7 @@ printf "%s PVC %s created.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" "${HISTORY_MOD if [ "${HISTORY_MODE}" = rolling ]; then + sleep 5 # Create rolling-tarball-restore printf "%s Creating PVC rolling-tarball-restore..\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" NAME="rolling-tarball-restore" yq e -i '.metadata.name=strenv(NAME)' scratchVolume.yaml @@ -87,6 +90,9 @@ fi ## Snapshot volume namespace NAMESPACE="${NAMESPACE}" yq e -i '.metadata.namespace=strenv(NAMESPACE)' volumeFromSnap.yaml +# Set storageclass for restored volume +STORAGE_CLASS="${STORAGE_CLASS}" yq e -i '.spec.storageClassName=strenv(STORAGE_CLASS)' volumeFromSnap.yaml + ## Snapshot volume name VOLUME_NAME="${HISTORY_MODE}-snap-volume" VOLUME_NAME="${VOLUME_NAME}" yq e -i '.metadata.name=strenv(VOLUME_NAME)' volumeFromSnap.yaml @@ -111,6 +117,8 @@ printf "%s We're rounding up and adding 20%% , volume size will be %sGB.\n" "$(d RESTORE_VOLUME_SIZE="${RESTORE_VOLUME_SIZE}Gi" yq e -i '.spec.resources.requests.storage=strenv(RESTORE_VOLUME_SIZE)' volumeFromSnap.yaml +sleep 5 + printf "%s Creating volume from snapshot ${NEWEST_SNAPSHOT}.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" if ! 
kubectl apply -f volumeFromSnap.yaml then @@ -175,22 +183,22 @@ if [ "${HISTORY_MODE}" = archive ]; then yq eval -i "del(.spec.template.spec.containers[0].volumeMounts[2])" mainJob.yaml fi -# Switch alternate cloud provider secret name based on actual cloud provider -if [[ -n "${CLOUD_PROVIDER}" ]]; then - # Need to account for dynamic volumes removed above. For example if not rolling node then rolling volume is deleted. - SECRET_NAME="${NAMESPACE}-secret" - # Index of zip-and-upload container changes depending on if rolling job or archive job - NUM_CONTAINERS=$(yq e '.spec.template.spec.containers | length' mainJob.yaml) - # Index of mounts also changes depending on history mode - NUM_CONTAINER_MOUNTS=$(yq e ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts | length" mainJob.yaml ) - # Secret volume mount is last item in list of volumeMounts for the zip and upload container - SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts[$(( NUM_CONTAINER_MOUNTS - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml - # Index of job volumes change depending on history mode - NUM_JOB_VOLUMES=$(yq e '.spec.template.spec.volumes | length' mainJob.yaml ) - # Setting job secret volume to value set by workflow - SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml - SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].secret.secretName=strenv(SECRET_NAME)" mainJob.yaml -fi +# # Switch alternate cloud provider secret name based on actual cloud provider +# if [[ -n "${CLOUD_PROVIDER}" ]]; then +# # Need to account for dynamic volumes removed above. For example if not rolling node then rolling volume is deleted. +# SECRET_NAME="${NAMESPACE}-secret" +# # Index of zip-and-upload container changes depending on if rolling job or archive job +# NUM_CONTAINERS=$(yq e '.spec.template.spec.containers | length' mainJob.yaml) +# # Index of mounts also changes depending on history mode +# NUM_CONTAINER_MOUNTS=$(yq e ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts | length" mainJob.yaml ) +# # Secret volume mount is last item in list of volumeMounts for the zip and upload container +# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts[$(( NUM_CONTAINER_MOUNTS - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml +# # Index of job volumes change depending on history mode +# NUM_JOB_VOLUMES=$(yq e '.spec.template.spec.volumes | length' mainJob.yaml ) +# # Setting job secret volume to value set by workflow +# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml +# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].secret.secretName=strenv(SECRET_NAME)" mainJob.yaml +# fi # Service account to be used by entire zip-and-upload job. 
SERVICE_ACCOUNT="${SERVICE_ACCOUNT}" yq e -i '.spec.template.spec.serviceAccountName=strenv(SERVICE_ACCOUNT)' mainJob.yaml @@ -204,12 +212,13 @@ then exit 1 fi -sleep 5 +sleep 20 # Wait for snapshotting job to complete while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do printf "%s Waiting for zip-and-upload job to complete.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do + sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for job if [ "$(kubectl get pod -l job-name=zip-and-upload-"${HISTORY_MODE}" --namespace="${NAMESPACE}"| grep -i -e error -e evicted -e pending)" ] || \ [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace="${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].type}')" ] ; then printf "%s Zip-and-upload job failed. This job will end and a new snapshot will be taken.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" @@ -226,5 +235,7 @@ if ! [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMES fi printf "%s Deleting temporary snapshot volume.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" +sleep 5 kubectl delete -f volumeFromSnap.yaml | while IFS= read -r line; do printf '%s %s\n' "$(date "+%Y-%m-%d %H:%M:%S" "$@")" "$line"; done +sleep 5 kubectl delete job snapshot-maker --namespace "${NAMESPACE}" diff --git a/snapshotEngine/snapshot-scheduler.sh b/snapshotEngine/snapshot-scheduler.sh index 6924b2c95..d8710df16 100755 --- a/snapshotEngine/snapshot-scheduler.sh +++ b/snapshotEngine/snapshot-scheduler.sh @@ -18,6 +18,18 @@ JOB_NAME=snapshot-maker-"${HISTORY_MODE}"-node JOB_NAME="${JOB_NAME}" yq e -i '.metadata.name=strenv(JOB_NAME)' snapshotMakerJob.yaml while true; do + + # Pause if nodes are not ready + until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do + printf "%s Tezos node is not ready for snapshot. Check node pod logs. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" + until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do + sleep 1m # without sleep, this loop is a "busy wait". 
this sleep vastly reduces CPU usage while we wait for node + if [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; then + break + fi + done + done + # Job exists if [ "$(kubectl get jobs "${JOB_NAME}" --namespace "${NAMESPACE}")" ]; then printf "%s Snapshot-maker job exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" diff --git a/snapshotEngine/volumeFromSnap.yaml b/snapshotEngine/volumeFromSnap.yaml index 8b4952f40..1f1017c94 100644 --- a/snapshotEngine/volumeFromSnap.yaml +++ b/snapshotEngine/volumeFromSnap.yaml @@ -4,7 +4,7 @@ metadata: name: "" namespace: "" spec: - storageClassName: ebs-sc + storageClassName: do-block-storage dataSource: name: "" kind: VolumeSnapshot diff --git a/snapshotEngine/zip-and-upload.sh b/snapshotEngine/zip-and-upload.sh index 1ba6dbf93..665a084de 100755 --- a/snapshotEngine/zip-and-upload.sh +++ b/snapshotEngine/zip-and-upload.sh @@ -6,7 +6,7 @@ BLOCK_TIMESTAMP=$(cat /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_TIMESTAMP) #TEZOS_VERSION=$(cat /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_VERSION) NETWORK="${NAMESPACE%%-*}" # Export for python -export S3_BUCKET="${S3_BUCKET:-${NAMESPACE%-*}.${SNAPSHOT_WEBSITE_DOMAIN_NAME}}" +export S3_BUCKET="${NAMESPACE}" TEZOS_RPC_VERSION_INFO="$(cat /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_RPC_VERSION_INFO)" TEZOS_VERSION="$(echo "${TEZOS_RPC_VERSION_INFO}" | jq -r .version)" @@ -22,6 +22,7 @@ REDIRECT_ROOT="/" # CND Endpoint used for CDN URLs (different from command endpoint) if [[ "${CLOUD_PROVIDER}" == "digitalocean" ]]; then + FQDN=${S3_BUCKET}.nyc3.digitaloceanspaces.com URL="${FQDN}" REDIRECT_ROOT="https://${FQDN}/" else @@ -35,12 +36,14 @@ cd / # so we opted for this. # $1 is file name set_aws_command_creds(){ - if [[ -n ${CLOUD_PROVIDER} ]]; then - echo "AWS_ACCESS_KEY_ID=$(cat /cloud-provider/access-id) \ - AWS_SECRET_ACCESS_KEY=$(cat /cloud-provider/secret-key) \ - aws --endpoint-url https://nyc3.digitaloceanspaces.com " + if [[ $1 == "aws" ]]; then + echo "AWS_ACCESS_KEY_ID=$(cat /aws-secrets/aws-access-id) \ + AWS_SECRET_ACCESS_KEY=$(cat /aws-secrets/aws-secret-key) \ + aws " else - echo "aws " + echo "AWS_ACCESS_KEY_ID=$(cat /do-secrets/do-spaces-access-id) \ + AWS_SECRET_ACCESS_KEY=$(cat /do-secrets/do-spaces-secret-key) \ + aws --endpoint-url https://nyc3.digitaloceanspaces.com " fi } @@ -65,8 +68,8 @@ if [[ "${HISTORY_MODE}" = archive ]]; then # Instead of guessing size, you can use expected-size which tells S3 how big the file is and it calculates the size for you. # However if the file gets bigger than your expected size, the multipart upload fails because it uses a part size outside of the bounds (1-10000) # This gets the old archive tarball size and then adds 10%. Archive tarballs dont seem to grow more than that. 
- if aws s3 ls s3://"${AWS_S3_BUCKET}" | grep archive-tarball-metadata; then #Use last file for expected size if it exists - EXPECTED_SIZE=$(curl -L http://"${AWS_S3_BUCKET}"/archive-tarball-metadata 2>/dev/null | jq -r '.filesize_bytes' | awk '{print $1*1.1}' | awk '{print ($0-int($0)>0)?int($0)+1:int($0)}') + if eval "$(set_aws_command_creds)" s3 ls s3://"${S3_BUCKET}" | grep archive-tarball-metadata; then #Use last file for expected size if it exists + EXPECTED_SIZE=$(curl -L http://"${S3_BUCKET}"/archive-tarball-metadata 2>/dev/null | jq -r '.filesize_bytes' | awk '{print $1*1.1}' | awk '{print ($0-int($0)>0)?int($0)+1:int($0)}') else EXPECTED_SIZE=1000000000000 #1000GB Arbitrary filesize for initial value. Only used if no archive-tarball-metadata exists. IE starting up test network fi @@ -162,7 +165,7 @@ if [[ "${HISTORY_MODE}" = archive ]]; then fi # Upload redirect file and set header for previously uploaded LZ4 File - if ! aws s3 cp archive-tarball s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ARCHIVE_TARBALL_FILENAME}" --cache-control 'no-cache'; then + if ! eval "$(set_aws_command_creds "aws")" s3 cp archive-tarball s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ARCHIVE_TARBALL_FILENAME}" --cache-control 'no-cache'; then printf "%s Archive Tarball : Error uploading ${NETWORK}-archive-tarball. to S3\n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s Archive Tarball : Upload of ${NETWORK}-archive-tarball successful to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -176,7 +179,7 @@ if [[ "${HISTORY_MODE}" = archive ]]; then fi # Upload archive tarball json redirect file and set header for previously uploaded archive tarball json File - if ! aws s3 cp archive-tarball-metadata s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ARCHIVE_TARBALL_FILENAME}".json --cache-control 'no-cache'; then + if ! eval "$(set_aws_command_creds "aws")" s3 cp archive-tarball-metadata s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ARCHIVE_TARBALL_FILENAME}".json --cache-control 'no-cache'; then printf "%s archive Tarball : Error uploading ${NETWORK}-archive-tarball-metadata file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s archive Tarball : Uploaded ${NETWORK}-archive-tarball-metadata file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -244,8 +247,8 @@ if [ "${HISTORY_MODE}" = rolling ]; then # However if the file gets bigger than your expected size, the multipart upload fails because it uses a part size outside of the bounds (1-10000) # This gets the old rolling tarball size and then adds 10%. rolling tarballs dont seem to grow more than that. printf "%s Rolling Tarball: Getting last rolling tarball filesize for multipart upload...\n" "$(date "+%Y-%m-%d %H:%M:%S")" - if aws s3 ls s3://"${AWS_S3_BUCKET}" | grep rolling-tarball-metadata; then #Use last file for expected size if it exists - EXPECTED_SIZE=$(curl -L http://"${AWS_S3_BUCKET}"/rolling-tarball-metadata 2>/dev/null | jq -r '.filesize_bytes' | awk '{print $1*1.1}' | awk '{print ($0-int($0)>0)?int($0)+1:int($0)}') + if eval "$(set_aws_command_creds "aws")" s3 ls s3://"${S3_BUCKET}" | grep rolling-tarball-metadata; then #Use last file for expected size if it exists + EXPECTED_SIZE=$(curl -L http://"${S3_BUCKET}"/rolling-tarball-metadata 2>/dev/null | jq -r '.filesize_bytes' | awk '{print $1*1.1}' | awk '{print ($0-int($0)>0)?int($0)+1:int($0)}') printf "%s Rolling Tarball: Bucket has existing artifact metadata. 
\n" "$(date "+%Y-%m-%d %H:%M:%S")" printf "%s Rolling Tarball: Expected size is - %s \n" "$(date "+%Y-%m-%d %H:%M:%S")" "${EXPECTED_SIZE}" else @@ -344,7 +347,7 @@ if [ "${HISTORY_MODE}" = rolling ]; then fi # Upload redirect file and set header for previously uploaded LZ4 File - if ! aws s3 cp rolling-tarball s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_TARBALL_FILENAME}" --cache-control 'no-cache'; then + if ! eval "$(set_aws_command_creds "aws")" s3 cp rolling-tarball s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_TARBALL_FILENAME}" --cache-control 'no-cache'; then printf "%s Rolling Tarball : Error uploading ${NETWORK}-rolling-tarball file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s Rolling Tarball : Uploaded ${NETWORK}-rolling-tarball file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -358,7 +361,7 @@ if [ "${HISTORY_MODE}" = rolling ]; then fi # Upload rolling tarball json redirect file and set header for previously uploaded rolling tarball json File - if ! aws s3 cp rolling-tarball-metadata s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_TARBALL_FILENAME}".json --cache-control 'no-cache'; then + if ! eval "$(set_aws_command_creds "aws")" s3 cp rolling-tarball-metadata s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_TARBALL_FILENAME}".json --cache-control 'no-cache'; then printf "%s Rolling Tarball : Error uploading ${NETWORK}-rolling-tarball-metadata file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s Rolling Tarball : Uploaded ${NETWORK}-rolling-tarball-metadata file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -459,7 +462,7 @@ if [ "${HISTORY_MODE}" = rolling ]; then touch rolling # Upload rolling tezos snapshot redirect object - if ! aws s3 cp rolling s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_SNAPSHOT_FILENAME}" --cache-control 'no-cache'; then + if ! eval "$(set_aws_command_creds "aws")" s3 cp rolling s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_SNAPSHOT_FILENAME}" --cache-control 'no-cache'; then printf "%s Rolling Tezos : Error uploading redirect object for ${ROLLING_SNAPSHOT} to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s Rolling Tezos : Successfully uploaded redirect object for ${ROLLING_SNAPSHOT} to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -473,7 +476,7 @@ if [ "${HISTORY_MODE}" = rolling ]; then fi # Upload rolling snapshot json redirect file and set header for previously uploaded rolling snapshot json File - if ! aws s3 cp rolling-snapshot-metadata s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_SNAPSHOT_FILENAME}".json --cache-control 'no-cache'; then + if ! eval "$(set_aws_command_creds "aws")" s3 cp rolling-snapshot-metadata s3://"${AWS_S3_BUCKET}" --website-redirect "${REDIRECT_ROOT}${ROLLING_SNAPSHOT_FILENAME}".json --cache-control 'no-cache'; then printf "%s Rolling snapshot : Error uploading ${NETWORK}-rolling-snapshot-metadata file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s Rolling snapshot : Uploaded ${NETWORK}-rolling-snapshot-metadata file to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -491,7 +494,7 @@ if [[ -n "${SNAPSHOT_WEBSITE_DOMAIN_NAME}" ]]; then # Network bucket redirect # Redirects from network.website.com to website.com/network touch index.html - if ! aws s3 cp index.html s3://"${AWS_S3_BUCKET}" --website-redirect https://"${SNAPSHOT_WEBSITE_DOMAIN_NAME}"/"${NETWORK}" --cache-control 'no-cache'; then + if ! 
eval "$(set_aws_command_creds "aws")" s3 cp index.html s3://"${AWS_S3_BUCKET}" --website-redirect https://"${SNAPSHOT_WEBSITE_DOMAIN_NAME}"/"${NETWORK}" --cache-control 'no-cache'; then printf "%s ERROR ##### Could not upload network site redirect.\n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s Successfully uploaded network site redirect.\n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -551,7 +554,7 @@ if [[ -n "${SNAPSHOT_WEBSITE_DOMAIN_NAME}" ]]; then fi # Upload tezos-snapshots.json - if ! aws s3 cp tezos-snapshots.json s3://"${SNAPSHOT_WEBSITE_DOMAIN_NAME}"/tezos-snapshots.json; then + if ! eval "$(set_aws_command_creds "aws")" s3 cp tezos-snapshots.json s3://"${SNAPSHOT_WEBSITE_DOMAIN_NAME}"/tezos-snapshots.json; then printf "%s Upload tezos-snapshots.json : Error uploading file tezos-snapshots.json to S3. \n" "$(date "+%Y-%m-%d %H:%M:%S")" else printf "%s Upload tezos-snapshots.json : File tezos-snapshots.json successfully uploaded to S3. \n" "$(date "+%Y-%m-%d %H:%M:%S")" @@ -563,14 +566,10 @@ if [[ -n "${SNAPSHOT_WEBSITE_DOMAIN_NAME}" ]]; then # Generate HTML from markdown and metadata chown -R jekyll:jekyll ./* - bundle exec jekyll build + bundle exec jekyll build --quiet # Upload chain page (index.html and assets) to root of website bucket - if ! aws s3 cp _site/ s3://"${SNAPSHOT_WEBSITE_DOMAIN_NAME}" --recursive --include "*"; then - printf "%s Website Build & Deploy : Error uploading site to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" - else - printf "%s Website Build & Deploy : Successful uploaded website to S3.\n" "$(date "+%Y-%m-%d %H:%M:%S")" - fi + eval "$(set_aws_command_creds "aws")" s3 cp _site/ s3://"${SNAPSHOT_WEBSITE_DOMAIN_NAME}" --recursive | grep "*" fi SLEEP_TIME=0m