From 41a8666cd7509153b11171a4069b3576ecd61b54 Mon Sep 17 00:00:00 2001
From: Ben Jackson <b@benjackson.email>
Date: Wed, 24 Mar 2021 11:20:47 +1100
Subject: [PATCH 1/5] add a check for any idled deployments and unidle them
 when an ssh connection is made to an environment

---
 services/ssh/home/rsh.sh | 52 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/services/ssh/home/rsh.sh b/services/ssh/home/rsh.sh
index 09b1816965..801bda5102 100755
--- a/services/ssh/home/rsh.sh
+++ b/services/ssh/home/rsh.sh
@@ -16,6 +16,10 @@ USER_SSH_KEY=$2
 REQUESTED_PROJECT=$3
 shift 3
 
+# get the value from an envvar override (can be added to the ssh deployment)
+# default to false so we don't hold up the ssh for a long time
+WAIT_TO_UNIDLE_SERVICES=${WAIT_TO_UNIDLE_SERVICES:-false}
+
 # get the graphql endpoint, if set
 eval "$(grep GRAPHQL_ENDPOINT /authorize.env)"
 
@@ -120,6 +124,30 @@ fi
 
 # If there is a deployment for the given service searching for lagoon.sh labels
 if [[ $($OC get deployment -l "lagoon.sh/service=${SERVICE}" 2> /dev/null) ]]; then
+  # get any other deployments that may have been idled by the idler and unidle them if required
+  # this only needs to be done for kubernetes
+  DEPLOYMENTS=$($OC get deployments -l "idling.amazee.io/watch=true" -o name)
+  if [ ! -z "${DEPLOYMENTS}" ]; then
+    # loop over the deployments and unidle them
+    for DEP in ${DEPLOYMENTS}
+    do
+      # if the deployment is idled, unidle it :)
+      if [[ $($OC get ${DEP} -o json | jq -r '.status.replicas // 0') == "0" ]]; then
+        $OC scale --replicas=1 ${DEP} >/dev/null 2>&1
+        # for unidling an entire environment and waiting for the number of `readyReplicas`
+        # to be 1 for each deployment, could add considerable delays for the ssh connection to establish.
+        # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments
+        # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1
+        if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then
+          while [[ ! $($OC get ${DEP} -o go-template --template='{{.status.readyReplicas}}') == "1" ]]
+          do
+            sleep 1
+          done
+        fi
+      fi
+    done
+  fi
+  # then actually unidle the service that was requested
   DEPLOYMENT=$($OC get deployment -l "lagoon.sh/service=${SERVICE}" -o name)
   # If the deployment is scaled to 0, scale to 1
   # .status.replicas doesn't exist on a scaled to 0 deployment in k8s so assume it is 0 if nothing is returned
@@ -138,6 +166,30 @@ fi
 # If there is a deployment for the given service search for lagoon labels
 # @DEPRECATED: Remove with Lagoon 2.0.0
 if [[ $($OC get deployment -l lagoon/service=${SERVICE} 2> /dev/null) ]]; then
+  # get any other deployments that may have been idled by the idler and unidle them if required
+  # this only needs to be done for kubernetes
+  DEPLOYMENTS=$($OC get deployments -l "idling.amazee.io/watch=true" -o name)
+  if [ ! -z "${DEPLOYMENTS}" ]; then
+    # loop over the deployments and unidle them
+    for DEP in ${DEPLOYMENTS}
+    do
+      # if the deployment is idled, unidle it :)
+      if [[ $($OC get ${DEP} -o json | jq -r '.status.replicas // 0') == "0" ]]; then
+        $OC scale --replicas=1 ${DEP} >/dev/null 2>&1
+        # for unidling an entire environment and waiting for the number of `readyReplicas`
+        # to be 1 for each deployment, could add considerable delays for the ssh connection to establish.
+        # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments
+        # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1
+        if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then
+          while [[ ! $($OC get ${DEP} -o go-template --template='{{.status.readyReplicas}}') == "1" ]]
+          do
+            sleep 1
+          done
+        fi
+      fi
+    done
+  fi
+  # then actually unidle the service that was requested
   DEPLOYMENT=$($OC get deployment -l lagoon/service=${SERVICE} -o name)
   # If the deployment is scaled to 0, scale to 1
   # .status.replicas doesn't exist on a scaled to 0 deployment in k8s so assume it is 0 if nothing is returned

From a17bb3c48606256087ebeacb4b28bdf7064e8c81 Mon Sep 17 00:00:00 2001
From: Ben Jackson <b@benjackson.email>
Date: Wed, 24 Mar 2021 11:48:54 +1100
Subject: [PATCH 2/5] add check for `unidle-replicas` so if it's more than 1 it
 will scale correctly

---
 services/ssh/home/rsh.sh | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/services/ssh/home/rsh.sh b/services/ssh/home/rsh.sh
index 801bda5102..6404ace956 100755
--- a/services/ssh/home/rsh.sh
+++ b/services/ssh/home/rsh.sh
@@ -126,14 +126,20 @@ fi
 if [[ $($OC get deployment -l "lagoon.sh/service=${SERVICE}" 2> /dev/null) ]]; then
   # get any other deployments that may have been idled by the idler and unidle them if required
   # this only needs to be done for kubernetes
+  # we do this first to give the services a bit of time to unidle before starting the one that was requested
   DEPLOYMENTS=$($OC get deployments -l "idling.amazee.io/watch=true" -o name)
   if [ ! -z "${DEPLOYMENTS}" ]; then
     # loop over the deployments and unidle them
     for DEP in ${DEPLOYMENTS}
     do
       # if the deployment is idled, unidle it :)
-      if [[ $($OC get ${DEP} -o json | jq -r '.status.replicas // 0') == "0" ]]; then
-        $OC scale --replicas=1 ${DEP} >/dev/null 2>&1
+      DEP_JSON=$($OC get ${DEP} -o json)
+      if [ $(echo "$DEP_JSON" | jq -r '.status.replicas // 0') == "0" ]; then
+        REPLICAS=$(echo "$DEP_JSON" | jq -r '.metadata.annotations."idling.amazee.io/unidle-replicas" // 1')
+        if [[ ! "$REPLICAS" =~ ^[1-9][0-9]*$ ]]; then # keep the annotated count; fall back to 1 only if it is empty or not a positive integer
+          REPLICAS=1
+        fi
+        $OC scale --replicas=${REPLICAS} ${DEP} >/dev/null 2>&1
         # for unidling an entire environment and waiting for the number of `readyReplicas`
         # to be 1 for each deployment, could add considerable delays for the ssh connection to establish.
         # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments
@@ -168,14 +174,20 @@ fi
 if [[ $($OC get deployment -l lagoon/service=${SERVICE} 2> /dev/null) ]]; then
   # get any other deployments that may have been idled by the idler and unidle them if required
   # this only needs to be done for kubernetes
+  # we do this first to give the services a bit of time to unidle before starting the one that was requested
   DEPLOYMENTS=$($OC get deployments -l "idling.amazee.io/watch=true" -o name)
   if [ ! -z "${DEPLOYMENTS}" ]; then
     # loop over the deployments and unidle them
     for DEP in ${DEPLOYMENTS}
     do
       # if the deployment is idled, unidle it :)
-      if [[ $($OC get ${DEP} -o json | jq -r '.status.replicas // 0') == "0" ]]; then
-        $OC scale --replicas=1 ${DEP} >/dev/null 2>&1
+      DEP_JSON=$($OC get ${DEP} -o json)
+      if [ $(echo "$DEP_JSON" | jq -r '.status.replicas // 0') == "0" ]; then
+        REPLICAS=$(echo "$DEP_JSON" | jq -r '.metadata.annotations."idling.amazee.io/unidle-replicas" // 1')
+        if [[ ! "$REPLICAS" =~ ^[1-9][0-9]*$ ]]; then # keep the annotated count; fall back to 1 only if it is empty or not a positive integer
+          REPLICAS=1
+        fi
+        $OC scale --replicas=${REPLICAS} ${DEP} >/dev/null 2>&1
         # for unidling an entire environment and waiting for the number of `readyReplicas`
         # to be 1 for each deployment, could add considerable delays for the ssh connection to establish.
         # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments

From de98f1813c3b72fd101ccad5d65f7019a549ec11 Mon Sep 17 00:00:00 2001
From: Ben Jackson <b@benjackson.email>
Date: Fri, 26 Mar 2021 08:03:09 +1100
Subject: [PATCH 3/5] support for readyreplicas that could be greater than 1

---
 services/ssh/home/rsh.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/services/ssh/home/rsh.sh b/services/ssh/home/rsh.sh
index 6404ace956..c713db950d 100755
--- a/services/ssh/home/rsh.sh
+++ b/services/ssh/home/rsh.sh
@@ -145,7 +145,7 @@ if [[ $($OC get deployment -l "lagoon.sh/service=${SERVICE}" 2> /dev/null) ]]; t
         # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments
         # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1
         if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then
-          while [[ ! $($OC get ${DEP} -o go-template --template='{{.status.readyReplicas}}') == "1" ]]
+          while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
           do
             sleep 1
           done
@@ -162,7 +162,7 @@ if [[ $($OC get deployment -l "lagoon.sh/service=${SERVICE}" 2> /dev/null) ]]; t
     $OC scale --replicas=1 ${DEPLOYMENT} >/dev/null 2>&1
 
     # Wait until the scaling is done
-    while [[ ! $($OC get ${DEPLOYMENT} -o go-template --template='{{.status.readyReplicas}}') == "1" ]]
+    while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
     do
       sleep 1
     done
@@ -193,7 +193,7 @@ if [[ $($OC get deployment -l lagoon/service=${SERVICE} 2> /dev/null) ]]; then
         # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments
         # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1
         if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then
-          while [[ ! $($OC get ${DEP} -o go-template --template='{{.status.readyReplicas}}') == "1" ]]
+          while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
           do
             sleep 1
           done
@@ -210,7 +210,7 @@ if [[ $($OC get deployment -l lagoon/service=${SERVICE} 2> /dev/null) ]]; then
     $OC scale --replicas=1 ${DEPLOYMENT} >/dev/null 2>&1
 
     # Wait until the scaling is done
-    while [[ ! $($OC get ${DEPLOYMENT} -o go-template --template='{{.status.readyReplicas}}') == "1" ]]
+    while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
     do
       sleep 1
     done

From 08fb46fe2e3abd037ec66027f94dc0641e21080b Mon Sep 17 00:00:00 2001
From: Ben Jackson <b@benjackson.email>
Date: Fri, 26 Mar 2021 08:34:41 +1100
Subject: [PATCH 4/5] scale all services, then if required wait for all
 services

---
 services/ssh/home/rsh.sh | 80 ++++++++++++++++++++++------------------
 1 file changed, 44 insertions(+), 36 deletions(-)

diff --git a/services/ssh/home/rsh.sh b/services/ssh/home/rsh.sh
index c713db950d..76e20970c3 100755
--- a/services/ssh/home/rsh.sh
+++ b/services/ssh/home/rsh.sh
@@ -140,33 +140,37 @@ if [[ $($OC get deployment -l "lagoon.sh/service=${SERVICE}" 2> /dev/null) ]]; t
           REPLICAS=1
         fi
         $OC scale --replicas=${REPLICAS} ${DEP} >/dev/null 2>&1
-        # for unidling an entire environment and waiting for the number of `readyReplicas`
-        # to be 1 for each deployment, could add considerable delays for the ssh connection to establish.
-        # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments
-        # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1
-        if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then
-          while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
-          do
-            sleep 1
-          done
-        fi
+      fi
+    done
+    # then if we have to wait for them to start, do that here
+    for DEP in ${DEPLOYMENTS}
+    do
+      # for unidling an entire environment and waiting for the number of `readyReplicas`
+      # to be 1 for each deployment, could add considerable delays for the ssh connection to establish.
+      # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments
+      # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1
+      if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then
+        while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
+        do
+          sleep 1
+        done
       fi
     done
   fi
-  # then actually unidle the service that was requested
+  # then actually unidle the service that was requested and wait for it to be ready if it wasn't already captured above
+  # doing this means if the service hasn't been idled with the `idling.amazee.io/watch=true` label
+  # we can still establish a connection
   DEPLOYMENT=$($OC get deployment -l "lagoon.sh/service=${SERVICE}" -o name)
   # If the deployment is scaled to 0, scale to 1
   # .status.replicas doesn't exist on a scaled to 0 deployment in k8s so assume it is 0 if nothing is returned
   if [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.replicas // 0') == "0" ]]; then
-
     $OC scale --replicas=1 ${DEPLOYMENT} >/dev/null 2>&1
-
-    # Wait until the scaling is done
-    while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
-    do
-      sleep 1
-    done
   fi
+  # Wait until the scaling is done
+  while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
+  do
+    sleep 1
+  done
 fi
 
 # If there is a deployment for the given service search for lagoon labels
@@ -188,33 +192,37 @@ if [[ $($OC get deployment -l lagoon/service=${SERVICE} 2> /dev/null) ]]; then
           REPLICAS=1
         fi
         $OC scale --replicas=${REPLICAS} ${DEP} >/dev/null 2>&1
-        # for unidling an entire environment and waiting for the number of `readyReplicas`
-        # to be 1 for each deployment, could add considerable delays for the ssh connection to establish.
-        # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments
-        # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1
-        if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then
-          while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
-          do
-            sleep 1
-          done
-        fi
+      fi
+    done
+    # then if we have to wait for them to start, do that here
+    for DEP in ${DEPLOYMENTS}
+    do
+      # for unidling an entire environment and waiting for the number of `readyReplicas`
+      # to be 1 for each deployment, could add considerable delays for the ssh connection to establish.
+      # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments
+      # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1
+      if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then
+        while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
+        do
+          sleep 1
+        done
       fi
     done
   fi
-  # then actually unidle the service that was requested
+  # then actually unidle the service that was requested and wait for it to be ready if it wasn't already captured above
+  # doing this means if the service hasn't been idled with the `idling.amazee.io/watch=true` label
+  # we can still establish a connection
   DEPLOYMENT=$($OC get deployment -l lagoon/service=${SERVICE} -o name)
   # If the deployment is scaled to 0, scale to 1
   # .status.replicas doesn't exist on a scaled to 0 deployment in k8s so assume it is 0 if nothing is returned
   if [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.replicas // 0') == "0" ]]; then
-
     $OC scale --replicas=1 ${DEPLOYMENT} >/dev/null 2>&1
-
-    # Wait until the scaling is done
-    while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
-    do
-      sleep 1
-    done
   fi
+  # Wait until the scaling is done
+  while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
+  do
+    sleep 1
+  done
 fi
 
 

From af788d2be8ee4d8871b388bc885c6194e4896ba7 Mon Sep 17 00:00:00 2001
From: Ben Jackson <b@benjackson.email>
Date: Mon, 29 Mar 2021 14:48:42 +1100
Subject: [PATCH 5/5] add a timeout on waiting for pods to start

---
 services/ssh/home/rsh.sh | 46 +++++++++++++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 8 deletions(-)

diff --git a/services/ssh/home/rsh.sh b/services/ssh/home/rsh.sh
index 76e20970c3..cb39b35c43 100755
--- a/services/ssh/home/rsh.sh
+++ b/services/ssh/home/rsh.sh
@@ -19,6 +19,8 @@ shift 3
 # get the value from an envvar override (can be added to the ssh deployment)
 # default to false so we don't hold up the ssh for a long time
 WAIT_TO_UNIDLE_SERVICES=${WAIT_TO_UNIDLE_SERVICES:-false}
+# set a timeout of 600 for waiting for a pod to start (the waits are 1 second interval, so 10 minutes timeout)
+SSH_CHECK_TIMEOUT=${SSH_CHECK_TIMEOUT:-600}
 
 # get the graphql endpoint, if set
 eval "$(grep GRAPHQL_ENDPOINT /authorize.env)"
@@ -150,9 +152,16 @@ if [[ $($OC get deployment -l "lagoon.sh/service=${SERVICE}" 2> /dev/null) ]]; t
       # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments
       # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1
       if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then
-        while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
+        SSH_CHECK_COUNTER=0
+        until [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
         do
-          sleep 1
+          if [ $SSH_CHECK_COUNTER -lt $SSH_CHECK_TIMEOUT ]; then
+            let SSH_CHECK_COUNTER=SSH_CHECK_COUNTER+1
+            sleep 1
+          else
+            echo "Deployment '${DEP}' took too long to start pods"
+            exit 1
+          fi
         done
       fi
     done
@@ -167,9 +176,16 @@ if [[ $($OC get deployment -l "lagoon.sh/service=${SERVICE}" 2> /dev/null) ]]; t
     $OC scale --replicas=1 ${DEPLOYMENT} >/dev/null 2>&1
   fi
   # Wait until the scaling is done
-  while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
+  SSH_CHECK_COUNTER=0
+  until [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
   do
-    sleep 1
+    if [ $SSH_CHECK_COUNTER -lt $SSH_CHECK_TIMEOUT ]; then
+      let SSH_CHECK_COUNTER=SSH_CHECK_COUNTER+1
+      sleep 1
+    else
+      echo "Pod for ${SERVICE} took too long to start"
+      exit 1
+    fi
   done
 fi
 
@@ -202,9 +218,16 @@ if [[ $($OC get deployment -l lagoon/service=${SERVICE} 2> /dev/null) ]]; then
       # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments
       # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1
       if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then
-        while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
+        SSH_CHECK_COUNTER=0
+        until [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
         do
-          sleep 1
+          if [ $SSH_CHECK_COUNTER -lt $SSH_CHECK_TIMEOUT ]; then
+            let SSH_CHECK_COUNTER=SSH_CHECK_COUNTER+1
+            sleep 1
+          else
+            echo "Deployment '${DEP}' took too long to start pods"
+            exit 1
+          fi
         done
       fi
     done
@@ -219,9 +242,16 @@ if [[ $($OC get deployment -l lagoon/service=${SERVICE} 2> /dev/null) ]]; then
     $OC scale --replicas=1 ${DEPLOYMENT} >/dev/null 2>&1
   fi
   # Wait until the scaling is done
-  while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
+  SSH_CHECK_COUNTER=0
+  until [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]]
   do
-    sleep 1
+    if [ $SSH_CHECK_COUNTER -lt $SSH_CHECK_TIMEOUT ]; then
+      let SSH_CHECK_COUNTER=SSH_CHECK_COUNTER+1
+      sleep 1
+    else
+      echo "Pod for ${SERVICE} took too long to start"
+      exit 1
+    fi
   done
 fi