Improve resilience of E2E and integration tests (#1406)

pkosiec authored Mar 11, 2024
1 parent 027883f commit 3f92298
Showing 8 changed files with 116 additions and 58 deletions.
2 changes: 2 additions & 0 deletions .github/actions/cloud-slack-e2e/action.yaml
@@ -110,6 +110,8 @@ runs:
- name: Dump cluster state
if: ${{ failure() }}
uses: ./.github/actions/dump-cluster
+ with:
+   name: cloud-slack-e2e

- name: Detect failed jobs
if: ${{ failure() }}
9 changes: 7 additions & 2 deletions .github/actions/dump-cluster/action.yaml
@@ -1,6 +1,11 @@
name: Dump cluster state
description: "Creates an artifacts with cluster dump"

+ inputs:
+   name:
+     description: "Cluster name"
+     required: true
+
runs:
using: "composite"
steps:
@@ -15,8 +20,8 @@ runs:
echo "Dumping cluster info into ${LOGS_DIR}"
kubectl cluster-info dump --all-namespaces --output-directory="${LOGS_DIR}"
- name: Upload artifacts
- uses: actions/upload-artifact@v2
+ uses: actions/upload-artifact@v4
with:
- name: cluster_dump_${{github.sha}}
+ name: cluster_dump_${{github.sha}}_${{ inputs.name }}
path: "output"
retention-days: 5 # Default 90 days
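With this change, the dump-cluster composite action requires a `name` input, so every caller must pass one. A minimal sketch of a consuming workflow step — the input value here is a placeholder; the action path and `with:` shape follow the usage shown in this commit:

```yaml
# Hypothetical caller of the updated composite action.
# "my-suite" is illustrative; real callers pass e.g. the matrix integration name.
- name: Dump cluster state
  if: ${{ failure() }}            # only collect the dump when the job failed
  uses: ./.github/actions/dump-cluster
  with:
    name: my-suite                # appended to the artifact name to keep it unique per job
```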
10 changes: 9 additions & 1 deletion .github/workflows/branch-build.yml
@@ -194,6 +194,12 @@ jobs:
KUBECONFIG=$(k3d kubeconfig write ${{ matrix.integration }}-test-cluster) \
make test-integration-${{ matrix.integration }}
+ - name: Dump cluster state
+   if: ${{ failure() }}
+   uses: ./.github/actions/dump-cluster
+   with:
+     name: ${{ matrix.integration }}
+
- name: Slack Notification
uses: rtCamp/action-slack-notify@v2
if: ${{ failure() }}
@@ -263,7 +269,7 @@ jobs:
KUBECONFIG=$(k3d kubeconfig write cli-migration-e2e-cluster) make test-cli-migration-e2e
- name: Upload artifacts
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
if: ${{ always() }}
with:
name: screenshots_dump_${{github.sha}}
@@ -273,6 +279,8 @@
- name: Dump cluster state
if: ${{ failure() }}
uses: ./.github/actions/dump-cluster
+ with:
+   name: cli-migration-e2e

- name: Slack Notification
uses: rtCamp/action-slack-notify@v2
10 changes: 8 additions & 2 deletions .github/workflows/pr-build.yaml
@@ -69,7 +69,7 @@ jobs:
make save-images
- name: Upload artifact
- uses: actions/upload-artifact@v2
+ uses: actions/upload-artifact@v4
with:
name: botkube-${{github.sha}}
path: ${{ env.IMAGE_SAVE_LOAD_DIR }}
@@ -91,7 +91,7 @@ jobs:
persist-credentials: false

- name: Download artifact
- uses: actions/download-artifact@v2
+ uses: actions/download-artifact@v4
with:
name: botkube-${{github.sha}}
path: ${{ env.IMAGE_SAVE_LOAD_DIR }}
@@ -304,3 +304,9 @@ jobs:
run: |
KUBECONFIG=$(k3d kubeconfig write ${{ matrix.integration }}-test-cluster) \
make test-integration-${{ matrix.integration }}
+ - name: Dump cluster state
+   if: ${{ failure() }}
+   uses: ./.github/actions/dump-cluster
+   with:
+     name: ${{ matrix.integration }}
36 changes: 18 additions & 18 deletions test/cloud-slack-dev-e2e/cloud_slack_dev_e2e_test.go
@@ -11,6 +11,7 @@ import (
"os"
"path/filepath"
"strings"
"sync/atomic"
"testing"
"time"

@@ -88,9 +89,11 @@ func TestCloudSlackE2E(t *testing.T) {
require.NoError(t, err)

cfg.Slack.Tester.CloudBasedTestEnabled = false // override property used only in the Cloud Slack E2E tests
+ cfg.Slack.Tester.RecentMessagesLimit = 4 // this is used effectively only for the Botkube restarts. There are two of them in a short time window, so it shouldn't be higher than 5.

authHeaderValue := ""
- helmChartUninstalled := false
+ var botkubeDeploymentUninstalled atomic.Bool
+ botkubeDeploymentUninstalled.Store(true) // not yet installed
gqlEndpoint := fmt.Sprintf("%s/%s", cfg.BotkubeCloud.APIBaseURL, cfg.BotkubeCloud.APIGraphQLEndpoint)

if cfg.ScreenshotsDir != "" {
@@ -236,7 +239,7 @@ func TestCloudSlackE2E(t *testing.T) {
go router.Run()
defer router.MustStop()

t.Log("Ensuring proper organizaton is selected")
t.Log("Ensuring proper organization is selected")
botkubePage.MustWaitOpen()
screenshotIfShould(t, cfg, botkubePage)
botkubePage.MustElement("a.logo-link")
@@ -299,6 +302,7 @@ func TestCloudSlackE2E(t *testing.T) {
if !cfg.Slack.DisconnectWorkspaceAfterTests {
return
}
t.Log("Disconnecting Slack workspace...")
gqlCli.MustDeleteSlackWorkspace(t, cfg.BotkubeCloud.TeamOrganizationID, slackWorkspace.ID)
})

@@ -319,18 +323,8 @@ func TestCloudSlackE2E(t *testing.T) {
t.Log("Creating deployment...")
deployment := gqlCli.MustCreateBasicDeploymentWithCloudSlack(t, channel.Name(), slackWorkspace.TeamID, channel.Name())
t.Cleanup(func() {
- // We have a glitch on backend side and the logic below is a workaround for that.
- // Tl;dr uninstalling Helm chart reports "DISCONNECTED" status, and deplyment deletion reports "DELETED" status.
- // If we do these two things too quickly, we'll run into resource version mismatch in repository logic.
- // Read more here: https://github.com/kubeshop/botkube-cloud/pull/486#issuecomment-1604333794
-
- for !helmChartUninstalled {
-   t.Log("Waiting for Helm chart uninstallation, in order to proceed with deleting the first deployment...")
-   time.Sleep(1 * time.Second)
- }
-
- t.Log("Helm chart uninstalled. Waiting a bit...")
- time.Sleep(3 * time.Second) // ugly, but at least we will be pretty sure we won't run into the resource version mismatch
+ err := helmx.WaitForUninstallation(context.Background(), t, &botkubeDeploymentUninstalled)
+ assert.NoError(t, err)

t.Log("Deleting first deployment...")
gqlCli.MustDeleteDeployment(t, graphql.ID(deployment.ID))
Expand All @@ -343,6 +337,7 @@ func TestCloudSlackE2E(t *testing.T) {
gqlCli.MustDeleteDeployment(t, graphql.ID(deployment2.ID))
})

+ botkubeDeploymentUninstalled.Store(false) // about to be installed
params := helmx.InstallChartParams{
RepoURL: "https://storage.googleapis.com/botkube-latest-main-charts",
RepoName: "botkube",
@@ -353,9 +348,14 @@ func TestCloudSlackE2E(t *testing.T) {
}
helmInstallCallback := helmx.InstallChart(t, params)
t.Cleanup(func() {
t.Log("Uninstalling Helm chart...")
helmInstallCallback(t)
helmChartUninstalled = true
if t.Failed() {
t.Log("Tests failed, keeping the Botkube instance installed for debugging purposes.")
} else {
t.Log("Uninstalling Helm chart...")
helmInstallCallback(t)
}

botkubeDeploymentUninstalled.Store(true)
})

t.Log("Waiting for help message...")
@@ -472,9 +472,9 @@ func TestCloudSlackE2E(t *testing.T) {
t.Run("Botkube Deployment -> Cloud sync", func(t *testing.T) {
t.Log("Disabling notification...")
tester.PostMessageToBot(t, channel.Identifier(), "disable notifications")

t.Log("Waiting for config reload message...")
expectedReloadMsg := fmt.Sprintf(":arrows_counterclockwise: Configuration reload requested for cluster '%s'. Hold on a sec...", deployment.Name)

err = tester.OnChannel().WaitForMessagePostedRecentlyEqual(tester.BotUserID(), channel.ID(), expectedReloadMsg)
require.NoError(t, err)

69 changes: 35 additions & 34 deletions test/e2e/bots_test.go
@@ -10,10 +10,13 @@ import (
"regexp"
"strconv"
"strings"
"sync/atomic"
"testing"
"time"
"unicode"

"botkube.io/botube/test/helmx"

"botkube.io/botube/test/botkubex"
"botkube.io/botube/test/commplatform"
"botkube.io/botube/test/diff"
@@ -201,7 +204,9 @@ func runBotTest(t *testing.T,
deployEnvSecondaryChannelIDName,
deployEnvRbacChannelIDName string,
) {
- botkubeDeploymentUninstalled := false
+ var botkubeDeploymentUninstalled atomic.Bool
+ botkubeDeploymentUninstalled.Store(true) // not yet installed
+
t.Logf("Creating API client with provided token for %s...", driverType)
botDriver, err := newBotDriver(appCfg, driverType)
require.NoError(t, err)
@@ -259,22 +264,14 @@ func runBotTest(t *testing.T,
gqlCli.MustCreateAlias(t, alias[0], alias[1], alias[2], deployment.ID)
}
t.Cleanup(func() {
- // We have a glitch on backend side and the logic below is a workaround for that.
- // Tl;dr uninstalling Helm chart reports "DISCONNECTED" status, and deployment deletion reports "DELETED" status.
- // If we do these two things too quickly, we'll run into resource version mismatch in repository logic.
- // Read more here: https://github.com/kubeshop/botkube-cloud/pull/486#issuecomment-1604333794
- for !botkubeDeploymentUninstalled {
-   t.Log("Waiting for Helm chart uninstallation, in order to proceed with deleting Botkube Cloud instance...")
-   time.Sleep(1 * time.Second)
- }
-
- t.Log("Helm chart uninstalled. Waiting a bit...")
- time.Sleep(3 * time.Second) // ugly, but at least we will be pretty sure we won't run into the resource version mismatch
+ err := helmx.WaitForUninstallation(context.Background(), t, &botkubeDeploymentUninstalled)
+ assert.NoError(t, err)

t.Log("Deleting Botkube Cloud instance...")
gqlCli.MustDeleteDeployment(t, graphql.ID(deployment.ID))
})

+ botkubeDeploymentUninstalled.Store(false) // about to be installed
err = botkubex.Install(t, botkubex.InstallParams{
BinaryPath: appCfg.ConfigProvider.BotkubeCliBinaryPath,
HelmRepoDirectory: appCfg.ConfigProvider.HelmRepoDirectory,
@@ -289,9 +286,14 @@ func runBotTest(t *testing.T,
})
require.NoError(t, err)
t.Cleanup(func() {
t.Log("Uninstalling Helm chart...")
botkubex.Uninstall(t, appCfg.ConfigProvider.BotkubeCliBinaryPath)
botkubeDeploymentUninstalled = true
if t.Failed() {
t.Log("Tests failed, keeping the Botkube instance installed for debugging purposes.")
} else {
t.Log("Uninstalling Helm chart...")
botkubex.Uninstall(t, appCfg.ConfigProvider.BotkubeCliBinaryPath)
}

botkubeDeploymentUninstalled.Store(true)
})
}

@@ -838,10 +840,13 @@ func runBotTest(t *testing.T,
expectedMessage = fmt.Sprintf("%s\n%s", cmdHeader(command), expectedBody)

botDriver.PostMessageToBot(t, botDriver.SecondChannel().Identifier(), command)

err = botDriver.WaitForMessagePosted(botDriver.BotUserID(), botDriver.SecondChannel().ID(), limitMessages(), botDriver.AssertEquals(expectedMessage))
require.NoError(t, err)

+ if botDriver.Type().IsCloud() {
+   waitForRestart(t, botDriver, botDriver.BotUserID(), botDriver.FirstChannel().ID(), appCfg.ClusterName)
+ }

cfgMapCli := k8sCli.CoreV1().ConfigMaps(appCfg.Deployment.Namespace)

t.Log("Creating ConfigMap...")
@@ -947,12 +952,12 @@ func runBotTest(t *testing.T,
expectedMessage = fmt.Sprintf("%s\n%s", cmdHeader(command), expectedBody)

botDriver.PostMessageToBot(t, botDriver.FirstChannel().Identifier(), command)
- err = botDriver.WaitForMessagePosted(botDriver.BotUserID(), botDriver.FirstChannel().ID(), limitMessages(), botDriver.AssertEquals(expectedMessage))
- assert.NoError(t, err)

if botDriver.Type().IsCloud() {
waitForRestart(t, botDriver, botDriver.BotUserID(), botDriver.FirstChannel().ID(), appCfg.ClusterName)
}
+ err = botDriver.WaitForMessagePosted(botDriver.BotUserID(), botDriver.FirstChannel().ID(), limitMessages(), botDriver.AssertEquals(expectedMessage))
+ assert.NoError(t, err)

t.Log("Getting notifier status from second channel...")
command = "status notifications"
@@ -1078,8 +1083,7 @@ func runBotTest(t *testing.T,

limitMessagesNo := 2
if botDriver.Type().IsCloud() {
- // There are 2 config reload requested after second cm update
- limitMessagesNo = 2 * limitLastMessageAfterCloudReload
+ limitMessagesNo = limitLastMessageAfterCloudReload
}
err = botDriver.OnChannel().WaitForMessagePostedWithAttachment(botDriver.BotUserID(), botDriver.SecondChannel().ID(), limitMessagesNo, secondCMUpdate)
require.NoError(t, err)
@@ -1682,16 +1686,11 @@ func crashConfigMapSourcePlugin(t *testing.T, cfgMapCli corev1.ConfigMapInterface
}

func waitForRestart(t *testing.T, tester commplatform.BotDriver, userID, channel, clusterName string) {
t.Log("Waiting for restart...")
t.Logf("Waiting for restart (timestamp: %s)...", time.Now().Format(time.DateTime))

originalTimeout := tester.Timeout()
- tester.SetTimeout(90 * time.Second)
- if tester.Type() == commplatform.TeamsBot {
-   tester.SetTimeout(120 * time.Second)
- }
- // 2, since time to time latest message becomes upgrade message right after begin message
+ tester.SetTimeout(120 * time.Second)
expMsg := fmt.Sprintf("My watch begins for cluster '%s'! :crossed_swords:", clusterName)

assertFn := tester.AssertEquals(expMsg)
if tester.Type() == commplatform.TeamsBot { // Teams sends JSON (Adaptive Card), so we cannot do equal assertion
expMsg = fmt.Sprintf("My watch begins for cluster '%s'!", clusterName)
@@ -1700,14 +1699,16 @@ func waitForRestart(t *testing.T, tester commplatform.BotDriver, userID, channel
}
}

+ // 2, since from time to time latest message becomes upgrade message right after begin message
err := tester.OnChannel().WaitForMessagePosted(userID, channel, 2, assertFn)
- if err != nil && tester.Type() == commplatform.TeamsBot {
-   // TODO(https://github.com/kubeshop/botkube-cloud/issues/854): for some reason, Teams restarts are not deterministic and sometimes it doesn't happen
-   // We should add fetching Agent logs to see why it happens.
-   t.Logf("⚠️ Teams communication platform didn't restart on time: %v", err)
- } else {
-   assert.NoError(t, err)
- }
+ assert.NoError(t, err)
+
+ t.Logf("Detected a successful restart (timestamp: %s).", time.Now().Format(time.DateTime))
+
+ t.Logf("Waiting a bit longer just to make sure Botkube connects to the Cloud Router...")
+ // Yes, it's ugly but "My watch begins..." doesn't really mean the Slack/Teams gRPC connection has been established.
+ // So we wait a bit longer to avoid a race condition.
+ time.Sleep(3 * time.Second)

tester.SetTimeout(originalTimeout)
}
37 changes: 37 additions & 0 deletions test/helmx/helm_helpers.go
@@ -1,12 +1,17 @@
package helmx

import (
"context"
"encoding/json"
"fmt"
"os/exec"
"regexp"
"strings"
"sync/atomic"
"testing"
"time"

"k8s.io/apimachinery/pkg/util/wait"

"github.com/stretchr/testify/require"
)
@@ -108,3 +113,35 @@ func redactAPIKey(in []string) []string {
}
return dst
}

+ const (
+   uninstallPollInterval = 1 * time.Second
+   uninstallTimeout      = 30 * time.Second
+ )
+
+ // WaitForUninstallation waits until a Helm chart is uninstalled, based on the atomic value.
+ // It's a workaround for the Helm chart uninstallation issue.
+ // We have a glitch on backend side and the logic below is a workaround for that.
+ // Tl;dr uninstalling Helm chart reports "DISCONNECTED" status, and deployment deletion reports "DELETED" status.
+ // If we do these two things too quickly, we'll run into resource version mismatch in repository logic.
+ // Read more here: https://github.com/kubeshop/botkube-cloud/pull/486#issuecomment-1604333794
+ func WaitForUninstallation(ctx context.Context, t *testing.T, alreadyUninstalled *atomic.Bool) error {
+   t.Helper()
+   t.Log("Waiting for Helm chart uninstallation, in order to proceed with deleting Botkube Cloud instance...")
+   err := wait.PollUntilContextTimeout(ctx, uninstallPollInterval, uninstallTimeout, false, func(ctx context.Context) (done bool, err error) {
+     return alreadyUninstalled.Load(), nil
+   })
+   waitInterrupted := wait.Interrupted(err)
+   if err != nil && !waitInterrupted {
+     return err
+   }
+
+   if waitInterrupted {
+     t.Log("Waiting for Helm chart uninstallation timed out. Proceeding with deleting other resources...")
+     return nil
+   }
+
+   t.Log("Waiting a bit more...")
+   time.Sleep(3 * time.Second) // ugly, but at least we will be pretty sure we won't run into the resource version mismatch
+   return nil
+ }
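For context, a sketch of how this helper is meant to be wired into a test, mirroring the bots_test.go changes above: one cleanup flips the atomic flag after uninstalling the chart, and an earlier-registered cleanup (which runs last, since t.Cleanup callbacks run in LIFO order) blocks on WaitForUninstallation before deleting the cloud-side deployment. The install/uninstall and deployment-deletion steps are elided placeholders, not real API calls.

```go
package example

import (
	"context"
	"sync/atomic"
	"testing"

	"botkube.io/botube/test/helmx"
	"github.com/stretchr/testify/assert"
)

// Sketch of the cleanup ordering introduced by this commit.
func TestCleanupOrdering(t *testing.T) {
	var botkubeDeploymentUninstalled atomic.Bool
	botkubeDeploymentUninstalled.Store(true) // not yet installed

	// Registered first => runs last: wait until the Helm chart is gone
	// before deleting the Botkube Cloud deployment.
	t.Cleanup(func() {
		err := helmx.WaitForUninstallation(context.Background(), t, &botkubeDeploymentUninstalled)
		assert.NoError(t, err)
		// ... delete the Botkube Cloud deployment here ...
	})

	botkubeDeploymentUninstalled.Store(false) // about to be installed
	// ... install the Botkube Helm chart here ...

	// Registered second => runs first: uninstall the chart (unless the test
	// failed, in which case the instance is kept for debugging) and flip the flag.
	t.Cleanup(func() {
		if !t.Failed() {
			// ... uninstall the Botkube Helm chart here ...
		}
		botkubeDeploymentUninstalled.Store(true)
	})
}
```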