Skip to content

Commit

Permalink
test: Windows Metrics failure when High CPU (Azure#3962)
Browse files Browse the repository at this point in the history
  • Loading branch information
jsturtevant authored Oct 27, 2020
1 parent be11bef commit 1afec96
Show file tree
Hide file tree
Showing 7 changed files with 287 additions and 4 deletions.
1 change: 1 addition & 0 deletions test/e2e/cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ docker run --rm \
-e ARC_LOCATION=${ARC_LOCATION:-$LOCATION} \
-e LINUX_CONTAINERD_URL=${LINUX_CONTAINERD_URL} \
-e WINDOWS_CONTAINERD_URL=${WINDOWS_CONTAINERD_URL} \
-e VALIDATE_CPU_LOAD=${VALIDATE_CPU_LOAD} \
"${DEV_IMAGE}" make test-kubernetes || tryExit && renameResultsFile "deploy"

if [ "${UPGRADE_CLUSTER}" = "true" ] || [ "${SCALE_CLUSTER}" = "true" ] || [ -n "$ADD_NODE_POOL_INPUT" ] || [ "${GET_CLUSTER_LOGS}" = "true" ] || [ "${ROTATE_CERTS}" = "true" ]; then
Expand Down
1 change: 1 addition & 0 deletions test/e2e/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ type Config struct {
SubscriptionID string `envconfig:"SUBSCRIPTION_ID"`
ClientID string `envconfig:"CLIENT_ID"`
ClientSecret string `envconfig:"CLIENT_SECRET"`
ValidateCPULoad bool `envconfig:"VALIDATE_CPU_LOAD" default:"false"`
*ArcOnboardingConfig
}

Expand Down
117 changes: 116 additions & 1 deletion test/e2e/kubernetes/deployment/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ func CreateDeployFromImageAsync(image, name, namespace, app, role string) GetRes
d, err := CreateLinuxDeploy(image, name, namespace, app, role)
return GetResult{
deployment: d,
err: err,
err: err,
}
}

Expand Down Expand Up @@ -404,6 +404,66 @@ func CreateWindowsDeployWithRetry(image, name, namespace, app, role string, slee
}
}

// CreateDeploymentFromFile applies the manifest at filename with `k apply -f`
// and then fetches the Deployment called name in namespace.
//
// name must match the Deployment name declared inside the manifest, because it
// is only used for the follow-up lookup and for log messages. sleep and timeout
// are forwarded to GetWithRetry. The `k` binary is presumably a kubectl alias
// available on PATH in the test environment (not visible from this file).
//
// It returns the fetched Deployment, or a nil Deployment and the underlying
// error if either the apply or the lookup fails.
func CreateDeploymentFromFile(filename, name, namespace string, sleep, timeout time.Duration) (*Deployment, error) {
	applyCmd := exec.Command("k", "apply", "-f", filename)
	util.PrintCommand(applyCmd)
	if output, applyErr := applyCmd.CombinedOutput(); applyErr != nil {
		// The combined stdout/stderr is logged because the returned error
		// alone does not carry the command output.
		log.Printf("Error trying to create Deployment %s:%s\n", name, string(output))
		return nil, applyErr
	}

	created, getErr := GetWithRetry(name, namespace, sleep, timeout)
	if getErr != nil {
		log.Printf("Error while trying to fetch Deployment %s:%s\n", name, getErr)
		return nil, getErr
	}
	return created, nil
}

// CreateDeploymentFromFileAsync runs CreateDeploymentFromFile and packs its two
// return values into a single GetResult, so the outcome can be sent over a
// channel from a goroutine.
func CreateDeploymentFromFileAsync(filename, name, namespace string, sleep, timeout time.Duration) GetResult {
	var res GetResult
	res.deployment, res.err = CreateDeploymentFromFile(filename, name, namespace, sleep, timeout)
	return res
}

// CreateDeploymentFromFileWithRetry repeatedly applies the manifest at filename
// (via CreateDeploymentFromFileAsync) until an attempt yields a non-nil
// Deployment without error, or until timeout elapses.
//
// sleep is the pause between attempts; sleep and timeout are also forwarded to
// every individual attempt. On timeout it returns the Deployment from the most
// recent attempt (possibly nil) together with an error that embeds the most
// recent attempt's error.
func CreateDeploymentFromFileWithRetry(filename, name, namespace string, sleep, timeout time.Duration) (*Deployment, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	ch := make(chan GetResult)
	var mostRecentCreateDeploymentFromFileWithRetryError error
	var d *Deployment
	go func() {
		for {
			if ctx.Err() != nil {
				return
			}
			result := CreateDeploymentFromFileAsync(filename, name, namespace, sleep, timeout)
			// ch is unbuffered: once the caller has returned nobody receives,
			// so the send must also give up on cancellation or this goroutine
			// would block forever.
			select {
			case ch <- result:
			case <-ctx.Done():
				return
			}
			// Wait between attempts, but wake up immediately on cancellation.
			select {
			case <-time.After(sleep):
			case <-ctx.Done():
				return
			}
		}
	}()
	for {
		select {
		case result := <-ch:
			mostRecentCreateDeploymentFromFileWithRetryError = result.err
			d = result.deployment
			if mostRecentCreateDeploymentFromFileWithRetryError == nil && d != nil {
				return d, nil
			}
		case <-ctx.Done():
			return d, errors.Errorf("CreateDeploymentFromFileWithRetry timed out: %s\n", mostRecentCreateDeploymentFromFileWithRetryError)
		}
	}
}

// CreateWindowsDeploy will create a deployment for a given image with a name in a namespace and create a service mapping a hostPort
func CreateWindowsDeploy(image, name, namespace, app, role string) (*Deployment, error) {
var commandTimeout time.Duration
Expand Down Expand Up @@ -869,3 +929,58 @@ func (d *Deployment) WaitForReplicas(min, max int, sleep, timeout time.Duration)
}
}
}

// WaitForReplicasWithAction polls the running pods whose names are prefixed by
// the deployment name until their count falls inside [min, max], invoking
// action after every poll.
//
// Passing -1 for min removes the lower bound; passing -1 for max removes the
// upper bound. Each cycle fetches the pods, waits for sleep, then calls action
// (skipped when action is nil). The wait ends as soon as one of these happens:
//   - a poll succeeds with a pod count in range: the pods are returned with a
//     nil error;
//   - action returns an error: the deployment is described and the most
//     recently observed pods are returned with an error wrapping the action
//     failure;
//   - timeout elapses: the deployment is described and the most recently
//     observed pods are returned with a timeout error that embeds the last
//     poll error.
func (d *Deployment) WaitForReplicasWithAction(min, max int, sleep, timeout time.Duration, action func() error) ([]pod.Pod, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	ch := make(chan pod.GetAllByPrefixResult)
	// Buffered so the poller can report an action failure and exit even if the
	// receiving loop has already returned.
	actionErrCh := make(chan error, 1)
	var mostRecentWaitForReplicasError error
	var pods []pod.Pod
	go func() {
		for {
			if ctx.Err() != nil {
				return
			}
			result := pod.GetAllRunningByPrefixAsync(d.Metadata.Name, d.Metadata.Namespace)
			// ch is unbuffered: also watch for cancellation so this goroutine
			// cannot block forever once the caller has returned.
			select {
			case ch <- result:
			case <-ctx.Done():
				return
			}
			select {
			case <-time.After(sleep):
			case <-ctx.Done():
				return
			}
			if action == nil {
				continue
			}
			// The failure is handed over through a channel rather than a
			// shared variable, which both goroutines would otherwise write
			// concurrently.
			if err := action(); err != nil {
				actionErrCh <- err
				return
			}
		}
	}()
	for {
		select {
		case result := <-ch:
			mostRecentWaitForReplicasError = result.Err
			pods = result.Pods
			if mostRecentWaitForReplicasError != nil {
				continue
			}
			var inRange bool
			switch {
			case min == -1:
				inRange = len(pods) <= max
			case max == -1:
				inRange = len(pods) >= min
			default:
				inRange = len(pods) >= min && len(pods) <= max
			}
			if inRange {
				return pods, nil
			}
		case actionErr := <-actionErrCh:
			if err := d.Describe(); err != nil {
				log.Printf("Unable to describe deployment %s: %s", d.Metadata.Name, err)
			}
			return pods, errors.Errorf("WaitForReplicasWithAction action failed: %s", actionErr)
		case <-ctx.Done():
			err := d.Describe()
			if err != nil {
				log.Printf("Unable to describe deployment %s: %s", d.Metadata.Name, err)
			}
			return pods, errors.Errorf("WaitForReplicas timed out: %s\n", mostRecentWaitForReplicasError)
		}
	}
}
58 changes: 58 additions & 0 deletions test/e2e/kubernetes/kubernetes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"fmt"
"io"
"log"
"math"
"math/rand"
"os"
"os/exec"
Expand Down Expand Up @@ -2369,6 +2370,63 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
Expect(err).NotTo(HaveOccurred())
Expect(logsRotated).To(Equal(true))
})

// metrics endpoints failing in 1.18+
// https://github.com/kubernetes/kubernetes/issues/95735
It("windows should be able to get node metrics when high cpu", func() {
	if !eng.HasWindowsAgents() || !cfg.ValidateCPULoad {
		Skip("Will not validate effects of CPU load against nodes")
	}

	windowsImages, err := eng.GetWindowsTestImages()
	// Checked here because the next call reuses err and would otherwise
	// hide a failure to resolve the test images.
	Expect(err).NotTo(HaveOccurred())
	cpuConsumptionDeploymentFile, err := pod.ReplaceContainerImageFromFile(filepath.Join(WorkloadDir, "validate-windows-cpu-consumption.yaml"), windowsImages.ServerCore)
	Expect(err).NotTo(HaveOccurred())
	defer os.Remove(cpuConsumptionDeploymentFile)

	By("launching a deployment that consumes too much CPU")
	deploymentName := "validate-windows-cpu-consumption" // should be the same as in yaml
	cpuDeployment, err := deployment.CreateDeploymentFromFileWithRetry(cpuConsumptionDeploymentFile, deploymentName, "default", 1*time.Second, cfg.Timeout)
	Expect(err).NotTo(HaveOccurred())
	running, err := pod.WaitOnSuccesses(deploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
	Expect(err).NotTo(HaveOccurred())
	Expect(running).To(Equal(true))

	By("Scaling deployment to consuming allocatable")
	nodeList, err := node.GetWithRetry(1*time.Second, cfg.Timeout)
	Expect(err).NotTo(HaveOccurred())
	// Sum the CPU capacity reported by every Windows node.
	cpuCapacity := 0
	for _, n := range nodeList {
		if n.IsWindows() {
			c, err := strconv.Atoi(n.Status.Capacity.CPU)
			Expect(err).NotTo(HaveOccurred())
			cpuCapacity += c
		}
	}
	// With zero capacity the replica targets below would all be 0 and the
	// spec would pass without exercising anything.
	Expect(cpuCapacity).To(BeNumerically(">", 0))

	// scale over allocatable for windows to make sure it's packed (.25 is limit on deployment)
	deployCount := int(math.Round((float64(cpuCapacity) / 0.25)))
	err = cpuDeployment.ScaleDeployment(deployCount * 2)
	Expect(err).NotTo(HaveOccurred())

	By("should be able to get nodes metrics")
	// Return the result directly: this closure is invoked from another
	// goroutine by WaitForReplicasWithAction, so it must not write to the
	// spec's shared err variable.
	checkMetrics := func() error {
		log.Printf("running top nodes")
		return node.TopNodes()
	}
	_, err = cpuDeployment.WaitForReplicasWithAction(deployCount, deployCount*2, 2*time.Second, cfg.Timeout, checkMetrics)
	Expect(err).NotTo(HaveOccurred())
	cpuPods, err := cpuDeployment.PodsRunning()
	Expect(err).NotTo(HaveOccurred())
	Expect(len(cpuPods)).To(BeNumerically(">=", deployCount))

	By("should be able to get nodes metrics")
	err = node.TopNodesWithRetry(1*time.Second, cfg.Timeout)
	Expect(err).NotTo(HaveOccurred())

	By("Verifying pods & services can be deleted")
	err = cpuDeployment.Delete(util.DefaultDeleteRetries)
	Expect(err).NotTo(HaveOccurred())
})
})

Describe("after the cluster has been up for a while", func() {
Expand Down
19 changes: 16 additions & 3 deletions test/e2e/kubernetes/node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@ package node
import (
"context"
"encoding/json"
"github.com/Azure/aks-engine/test/e2e/kubernetes/pod"
"log"
"os/exec"
"regexp"
"strings"
"time"

"github.com/Azure/aks-engine/test/e2e/kubernetes/pod"

"github.com/Azure/aks-engine/test/e2e/kubernetes/util"
"github.com/pkg/errors"
)
Expand Down Expand Up @@ -56,6 +57,7 @@ type Status struct {
NodeInfo Info `json:"nodeInfo"`
NodeAddresses []Address `json:"addresses"`
Conditions []Condition `json:"conditions"`
// Capacity is decoded from the "capacity" object of the node status.
Capacity Capacity `json:"capacity"`
}

// Address contains an address and a type
Expand All @@ -73,6 +75,10 @@ type Info struct {
OSImage string `json:"osImage"`
}

// Capacity contains the resource capacity reported in a node's status.
type Capacity struct {
// CPU is the raw "cpu" capacity value, kept as a string.
// NOTE(review): callers appear to expect a whole number of cores that
// strconv.Atoi can parse — confirm that no node reports a fractional value.
CPU string `json:"cpu"`
}

// Condition contains various status information
type Condition struct {
LastHeartbeatTime time.Time `json:"lastHeartbeatTime"`
Expand All @@ -96,7 +102,7 @@ type GetNodesResult struct {

// TopNodesResult is the result type for TopNodesAsync
type TopNodesResult struct {
	// Err is the error returned by TopNodes, or nil when it succeeded.
	Err error
}

// GetNodesAsync wraps Get with a struct response for goroutine + channel usage
Expand All @@ -117,7 +123,7 @@ func GetNodesAsync() GetNodesResult {
// TopNodesAsync wraps TopNodes with a struct response for goroutine + channel usage
func TopNodesAsync() TopNodesResult {
	err := TopNodes()
	return TopNodesResult{
		Err: err,
	}
}

Expand Down Expand Up @@ -432,6 +438,13 @@ func TopNodes() error {
pod.PrintPodsLogs("metrics-server", "kube-system", 5*time.Second, 1*time.Minute)
return err
}

if strings.Contains(string(out), "<unknown>") {
log.Printf("\n - %s", string(out))
pod.PrintPodsLogs("metrics-server", "kube-system", 5*time.Second, 1*time.Minute)
return errors.Errorf("Node contained unknown value")
}

return nil
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Deployment used by the e2e suite to put sustained CPU load on Windows nodes.
# Each pod runs a long PowerShell multiplication loop. The test scales the
# replica count based on the .25 CPU set per pod below, so keep that value in
# sync with the test.
apiVersion: apps/v1
kind: Deployment
metadata:
  # Must match the deployment name the e2e test looks up.
  name: validate-windows-cpu-consumption
  labels:
    app: validate-windows-cpu-consumption
spec:
  replicas: 1
  template:
    metadata:
      name: iis-2019
      labels:
        app: iis-2019
    spec:
      containers:
      - name: iis
        # The e2e test substitutes this image before applying the manifest.
        image: mcr.microsoft.com/windows/servercore/iis:windowsservercore-ltsc2019
        command:
        - powershell.exe
        - "-command"
        - "$result = 1; foreach ($number in 1..2147483647) {$result = $result * $number};"
        resources:
          limits:
            cpu: .25
            # "Mi" = mebibytes. A lowercase "m" suffix means milli-units,
            # which would make this a request for under one byte.
            memory: 800Mi
          requests:
            cpu: .25
            memory: 300Mi
        ports:
        - containerPort: 80
      nodeSelector:
        "kubernetes.io/os": windows
  selector:
    matchLabels:
      app: iis-2019
60 changes: 60 additions & 0 deletions test/e2e/test_cluster_configs/windows/reproductions/high-cpu.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
{
"env": {
"VALIDATE_CPU_LOAD": true
},
"options": {
"allowedOrchestratorVersions": ["1.17", "1.18", "1.19"]
},
"apiModel": {
"apiVersion": "vlabs",
"properties": {
"orchestratorProfile": {
"orchestratorType": "Kubernetes",
"kubernetesConfig": {
"useManagedIdentity": false
}
},
"masterProfile": {
"count": 1,
"dnsPrefix": "",
"vmSize": "Standard_D2_v3"
},
"agentPoolProfiles": [
{
"name": "linuxpool1",
"count": 1,
"vmSize": "Standard_D2_v3",
"availabilityProfile": "VirtualMachineScaleSets"
},
{
"name": "agentwin",
"count": 1,
"vmSize": "Standard_D2_v3",
"osType": "Windows",
"availabilityProfile": "VirtualMachineScaleSets",
"scalesetPriority": "Spot"
}
],
"windowsProfile": {
"adminUsername": "azureuser",
"adminPassword": "replacepassword1234$",
"enableAutomaticUpdates": false,
"sshEnabled": true,
"windowsPublisher": "microsoft-aks",
"windowsOffer": "aks-windows",
"windowsSku": "2019-datacenter-core-smalldisk-2008",
"imageVersion": "latest"
},
"linuxProfile": {
"adminUsername": "azureuser",
"ssh": {
"publicKeys": [
{
"keyData": ""
}
]
}
}
}
}
}

0 comments on commit 1afec96

Please sign in to comment.