From 41a8666cd7509153b11171a4069b3576ecd61b54 Mon Sep 17 00:00:00 2001 From: Ben Jackson Date: Wed, 24 Mar 2021 11:20:47 +1100 Subject: [PATCH 1/5] add a check for any idled deployments and unidle them when an ssh connection is made to an environment --- services/ssh/home/rsh.sh | 52 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/services/ssh/home/rsh.sh b/services/ssh/home/rsh.sh index 09b1816965..801bda5102 100755 --- a/services/ssh/home/rsh.sh +++ b/services/ssh/home/rsh.sh @@ -16,6 +16,10 @@ USER_SSH_KEY=$2 REQUESTED_PROJECT=$3 shift 3 +# get the value from an envvar override (can be added to the ssh deployment) +# default to false so we don't hold up the ssh for a long time +WAIT_TO_UNIDLE_SERVICES=${WAIT_TO_UNIDLE_SERVICES:-false} + # get the graphql endpoint, if set eval "$(grep GRAPHQL_ENDPOINT /authorize.env)" @@ -120,6 +124,30 @@ fi # If there is a deployment for the given service searching for lagoon.sh labels if [[ $($OC get deployment -l "lagoon.sh/service=${SERVICE}" 2> /dev/null) ]]; then + # get any other deployments that may have been idled by the idler and unidle them if required + # this only needs to be done for kubernetes + DEPLOYMENTS=$($OC get deployments -l "idling.amazee.io/watch=true" -o name) + if [ ! -z "${DEPLOYMENTS}" ]; then + # loop over the deployments and unidle them + for DEP in ${DEPLOYMENTS} + do + # if the deployment is idled, unidle it :) + if [[ $($OC get ${DEP} -o json | jq -r '.status.replicas // 0') == "0" ]]; then + $OC scale --replicas=1 ${DEP} >/dev/null 2>&1 + # for unidling an entire environment and waiting for the number of `readyReplicas` + # to be 1 for each deployment, could add considerable delays for the ssh connection to establish. 
+ # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments + # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1 + if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then + while [[ ! $($OC get ${DEP} -o go-template --template='{{.status.readyReplicas}}') == "1" ]] + do + sleep 1 + done + fi + fi + done + fi + # then actually unidle the service that was requested DEPLOYMENT=$($OC get deployment -l "lagoon.sh/service=${SERVICE}" -o name) # If the deployment is scaled to 0, scale to 1 # .status.replicas doesn't exist on a scaled to 0 deployment in k8s so assume it is 0 if nothing is returned @@ -138,6 +166,30 @@ fi # If there is a deployment for the given service search for lagoon labels # @DEPRECATED: Remove with Lagoon 2.0.0 if [[ $($OC get deployment -l lagoon/service=${SERVICE} 2> /dev/null) ]]; then + # get any other deployments that may have been idled by the idler and unidle them if required + # this only needs to be done for kubernetes + DEPLOYMENTS=$($OC get deployments -l "idling.amazee.io/watch=true" -o name) + if [ ! -z "${DEPLOYMENTS}" ]; then + # loop over the deployments and unidle them + for DEP in ${DEPLOYMENTS} + do + # if the deployment is idled, unidle it :) + if [[ $($OC get ${DEP} -o json | jq -r '.status.replicas // 0') == "0" ]]; then + $OC scale --replicas=1 ${DEP} >/dev/null 2>&1 + # for unidling an entire environment and waiting for the number of `readyReplicas` + # to be 1 for each deployment, could add considerable delays for the ssh connection to establish. + # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments + # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1 + if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then + while [[ ! 
$($OC get ${DEP} -o go-template --template='{{.status.readyReplicas}}') == "1" ]] + do + sleep 1 + done + fi + fi + done + fi + # then actually unidle the service that was requested DEPLOYMENT=$($OC get deployment -l lagoon/service=${SERVICE} -o name) # If the deployment is scaled to 0, scale to 1 # .status.replicas doesn't exist on a scaled to 0 deployment in k8s so assume it is 0 if nothing is returned From a17bb3c48606256087ebeacb4b28bdf7064e8c81 Mon Sep 17 00:00:00 2001 From: Ben Jackson Date: Wed, 24 Mar 2021 11:48:54 +1100 Subject: [PATCH 2/5] add check for `unidle-replicas` so if its more than 1 it will scale correctly --- services/ssh/home/rsh.sh | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/services/ssh/home/rsh.sh b/services/ssh/home/rsh.sh index 801bda5102..6404ace956 100755 --- a/services/ssh/home/rsh.sh +++ b/services/ssh/home/rsh.sh @@ -126,14 +126,20 @@ fi if [[ $($OC get deployment -l "lagoon.sh/service=${SERVICE}" 2> /dev/null) ]]; then # get any other deployments that may have been idled by the idler and unidle them if required # this only needs to be done for kubernetes + # we do this first to give the services a bit of time to unidle before starting the one that was requested DEPLOYMENTS=$($OC get deployments -l "idling.amazee.io/watch=true" -o name) if [ ! -z "${DEPLOYMENTS}" ]; then # loop over the deployments and unidle them for DEP in ${DEPLOYMENTS} do # if the deployment is idled, unidle it :) - if [[ $($OC get ${DEP} -o json | jq -r '.status.replicas // 0') == "0" ]]; then - $OC scale --replicas=1 ${DEP} >/dev/null 2>&1 + DEP_JSON=$($OC get ${DEP} -o json) + if [ $(echo "$DEP_JSON" | jq -r '.status.replicas // 0') == "0" ]; then + REPLICAS=$(echo "$DEP_JSON" | jq -r '.metadata.annotations."idling.amazee.io/unidle-replicas" // 1') + if [ 
-z "$REPLICAS" ]; then + REPLICAS=1 + fi + $OC scale --replicas=${REPLICAS} ${DEP} >/dev/null 2>&1 # for unidling an entire environment and waiting for the number of `readyReplicas` # to be 1 for each deployment, could add considerable delays for the ssh connection to establish. # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments @@ -168,14 +174,20 @@ fi if [[ $($OC get deployment -l lagoon/service=${SERVICE} 2> /dev/null) ]]; then # get any other deployments that may have been idled by the idler and unidle them if required # this only needs to be done for kubernetes + # we do this first to give the services a bit of time to unidle before starting the one that was requested DEPLOYMENTS=$($OC get deployments -l "idling.amazee.io/watch=true" -o name) if [ ! -z "${DEPLOYMENTS}" ]; then # loop over the deployments and unidle them for DEP in ${DEPLOYMENTS} do # if the deployment is idled, unidle it :) - if [[ $($OC get ${DEP} -o json | jq -r '.status.replicas // 0') == "0" ]]; then - $OC scale --replicas=1 ${DEP} >/dev/null 2>&1 + DEP_JSON=$($OC get ${DEP} -o json) + if [ $(echo "$DEP_JSON" | jq -r '.status.replicas // 0') == "0" ]; then + REPLICAS=$(echo "$DEP_JSON" | jq -r '.metadata.annotations."idling.amazee.io/unidle-replicas" // 1') + if [ -z "$REPLICAS" ]; then + REPLICAS=1 + fi + $OC scale --replicas=${REPLICAS} ${DEP} >/dev/null 2>&1 # for unidling an entire environment and waiting for the number of `readyReplicas` # to be 1 for each deployment, could add considerable delays for the ssh connection to establish. 
# WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments From de98f1813c3b72fd101ccad5d65f7019a549ec11 Mon Sep 17 00:00:00 2001 From: Ben Jackson Date: Fri, 26 Mar 2021 08:03:09 +1100 Subject: [PATCH 3/5] support for readyreplicas that could be greater than 1 --- services/ssh/home/rsh.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/services/ssh/home/rsh.sh b/services/ssh/home/rsh.sh index 6404ace956..c713db950d 100755 --- a/services/ssh/home/rsh.sh +++ b/services/ssh/home/rsh.sh @@ -145,7 +145,7 @@ if [[ $($OC get deployment -l "lagoon.sh/service=${SERVICE}" 2> /dev/null) ]]; t # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1 if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then - while [[ ! $($OC get ${DEP} -o go-template --template='{{.status.readyReplicas}}') == "1" ]] + while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] do sleep 1 done @@ -162,7 +162,7 @@ if [[ $($OC get deployment -l "lagoon.sh/service=${SERVICE}" 2> /dev/null) ]]; t $OC scale --replicas=1 ${DEPLOYMENT} >/dev/null 2>&1 # Wait until the scaling is done - while [[ ! $($OC get ${DEPLOYMENT} -o go-template --template='{{.status.readyReplicas}}') == "1" ]] + while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] do sleep 1 done @@ -193,7 +193,7 @@ if [[ $($OC get deployment -l lagoon/service=${SERVICE} 2> /dev/null) ]]; then # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1 if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then - while [[ ! 
$($OC get ${DEP} -o go-template --template='{{.status.readyReplicas}}') == "1" ]] + while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] do sleep 1 done @@ -210,7 +210,7 @@ if [[ $($OC get deployment -l lagoon/service=${SERVICE} 2> /dev/null) ]]; then $OC scale --replicas=1 ${DEPLOYMENT} >/dev/null 2>&1 # Wait until the scaling is done - while [[ ! $($OC get ${DEPLOYMENT} -o go-template --template='{{.status.readyReplicas}}') == "1" ]] + while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] do sleep 1 done From 08fb46fe2e3abd037ec66027f94dc0641e21080b Mon Sep 17 00:00:00 2001 From: Ben Jackson Date: Fri, 26 Mar 2021 08:34:41 +1100 Subject: [PATCH 4/5] scale all services, then if required wait for all services --- services/ssh/home/rsh.sh | 80 ++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/services/ssh/home/rsh.sh b/services/ssh/home/rsh.sh index c713db950d..76e20970c3 100755 --- a/services/ssh/home/rsh.sh +++ b/services/ssh/home/rsh.sh @@ -140,33 +140,37 @@ if [[ $($OC get deployment -l "lagoon.sh/service=${SERVICE}" 2> /dev/null) ]]; t REPLICAS=1 fi $OC scale --replicas=${REPLICAS} ${DEP} >/dev/null 2>&1 - # for unidling an entire environment and waiting for the number of `readyReplicas` - # to be 1 for each deployment, could add considerable delays for the ssh connection to establish. 
- # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments - # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1 - if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then - while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] - do - sleep 1 - done - fi + fi + done + # then if we have to wait for them to start, do that here + for DEP in ${DEPLOYMENTS} + do + # for unidling an entire environment and waiting for the number of `readyReplicas` + # to be 1 for each deployment, could add considerable delays for the ssh connection to establish. + # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments + # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1 + if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then + while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] + do + sleep 1 + done fi done fi - # then actually unidle the service that was requested + # then actually unidle the service that was requested and wait for it to be ready if it wasn't already captured above + # doing this means if the service hasn't been idled with the `idling.amazee.io/watch=true` label + # we can still establish a connection DEPLOYMENT=$($OC get deployment -l "lagoon.sh/service=${SERVICE}" -o name) # If the deployment is scaled to 0, scale to 1 # .status.replicas doesn't exist on a scaled to 0 deployment in k8s so assume it is 0 if nothing is returned if [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.replicas // 0') == "0" ]]; then - $OC scale --replicas=1 ${DEPLOYMENT} >/dev/null 2>&1 - - # Wait until the scaling is done - while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] - do - sleep 1 - done fi + # Wait until the scaling is done + while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] 
+ do + sleep 1 + done fi # If there is a deployment for the given service search for lagoon labels @@ -188,33 +192,37 @@ if [[ $($OC get deployment -l lagoon/service=${SERVICE} 2> /dev/null) ]]; then REPLICAS=1 fi $OC scale --replicas=${REPLICAS} ${DEP} >/dev/null 2>&1 - # for unidling an entire environment and waiting for the number of `readyReplicas` - # to be 1 for each deployment, could add considerable delays for the ssh connection to establish. - # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments - # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1 - if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then - while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] - do - sleep 1 - done - fi + fi + done + # then if we have to wait for them to start, do that here + for DEP in ${DEPLOYMENTS} + do + # for unidling an entire environment and waiting for the number of `readyReplicas` + # to be 1 for each deployment, could add considerable delays for the ssh connection to establish. 
+ # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments + # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1 + if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then + while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] + do + sleep 1 + done fi done fi - # then actually unidle the service that was requested + # then actually unidle the service that was requested and wait for it to be ready if it wasn't already captured above + # doing this means if the service hasn't been idled with the `idling.amazee.io/watch=true` label + # we can still establish a connection DEPLOYMENT=$($OC get deployment -l lagoon/service=${SERVICE} -o name) # If the deployment is scaled to 0, scale to 1 # .status.replicas doesn't exist on a scaled to 0 deployment in k8s so assume it is 0 if nothing is returned if [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.replicas // 0') == "0" ]]; then - $OC scale --replicas=1 ${DEPLOYMENT} >/dev/null 2>&1 - - # Wait until the scaling is done - while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] - do - sleep 1 - done fi + # Wait until the scaling is done + while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] + do + sleep 1 + done fi From af788d2be8ee4d8871b388bc885c6194e4896ba7 Mon Sep 17 00:00:00 2001 From: Ben Jackson Date: Mon, 29 Mar 2021 14:48:42 +1100 Subject: [PATCH 5/5] add a timeout on waiting for pods to start --- services/ssh/home/rsh.sh | 46 +++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/services/ssh/home/rsh.sh b/services/ssh/home/rsh.sh index 76e20970c3..cb39b35c43 100755 --- a/services/ssh/home/rsh.sh +++ b/services/ssh/home/rsh.sh @@ -19,6 +19,8 @@ shift 3 # get the value from an envvar override (can be added to the ssh deployment) # default to false so we don't hold up the ssh for a 
long time WAIT_TO_UNIDLE_SERVICES=${WAIT_TO_UNIDLE_SERVICES:-false} +# set a timeout of 600 for waiting for a pod to start (the waits are 1 second interval, so 10 minutes timeout) +SSH_CHECK_TIMEOUT=${SSH_CHECK_TIMEOUT:-600} # get the graphql endpoint, if set eval "$(grep GRAPHQL_ENDPOINT /authorize.env)" @@ -150,9 +152,16 @@ if [[ $($OC get deployment -l "lagoon.sh/service=${SERVICE}" 2> /dev/null) ]]; t # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1 if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ [Tt][Rr][Uu][Ee] ]]; then - while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] + SSH_CHECK_COUNTER=0 + until [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] do - sleep 1 + if [ $SSH_CHECK_COUNTER -lt $SSH_CHECK_TIMEOUT ]; then + let SSH_CHECK_COUNTER=SSH_CHECK_COUNTER+1 + sleep 1 + else + echo "Deployment '${DEP}' took too long to start pods" + exit 1 + fi done fi done @@ -167,9 +176,16 @@ if [[ $($OC get deployment -l "lagoon.sh/service=${SERVICE}" 2> /dev/null) ]]; t $OC scale --replicas=1 ${DEPLOYMENT} >/dev/null 2>&1 fi # Wait until the scaling is done - while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] + SSH_CHECK_COUNTER=0 + until [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] do - sleep 1 + if [ $SSH_CHECK_COUNTER -lt $SSH_CHECK_TIMEOUT ]; then + let SSH_CHECK_COUNTER=SSH_CHECK_COUNTER+1 + sleep 1 + else + echo "Pod for ${SERVICE} took too long to start" + exit 1 + fi done fi @@ -202,9 +218,16 @@ if [[ $($OC get deployment -l lagoon/service=${SERVICE} 2> /dev/null) ]]; then # WAIT_TO_UNIDLE_SERVICES will default to false so that it just scales the deployments # and won't wait for them to be ready, but if set to true, it will wait for `readyReplicas` to be 1 if [[ "$WAIT_TO_UNIDLE_SERVICES" =~ 
[Tt][Rr][Uu][Ee] ]]; then - while [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] + SSH_CHECK_COUNTER=0 + until [[ $($OC get ${DEP} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] do - sleep 1 + if [ $SSH_CHECK_COUNTER -lt $SSH_CHECK_TIMEOUT ]; then + let SSH_CHECK_COUNTER=SSH_CHECK_COUNTER+1 + sleep 1 + else + echo "Deployment '${DEP}' took too long to start pods" + exit 1 + fi done fi done @@ -219,9 +242,16 @@ if [[ $($OC get deployment -l lagoon/service=${SERVICE} 2> /dev/null) ]]; then $OC scale --replicas=1 ${DEPLOYMENT} >/dev/null 2>&1 fi # Wait until the scaling is done - while [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] + SSH_CHECK_COUNTER=0 + until [[ $($OC get ${DEPLOYMENT} -o json | jq -r '.status.readyReplicas // 0') -ne "0" ]] do - sleep 1 + if [ $SSH_CHECK_COUNTER -lt $SSH_CHECK_TIMEOUT ]; then + let SSH_CHECK_COUNTER=SSH_CHECK_COUNTER+1 + sleep 1 + else + echo "Pod for ${SERVICE} took too long to start" + exit 1 + fi done fi