From e9b446a31f82cb272df0b2022a3463b2254d94fd Mon Sep 17 00:00:00 2001
From: Florian Stadler
Date: Thu, 2 Jan 2025 10:26:40 +0100
Subject: [PATCH] Fix ManagedNodeGroup ignoring custom AMI if no user data is
 configured (#1562)

When users specified a custom AMI for an EKS Managed Node Group, the
custom AMI only took effect if their configuration already triggered
custom user data generation; otherwise the node group fell back to the
default AMI.

To fix this, I modified the logic to generate custom user data whenever
a custom AMI is configured. This ensures the node group launches
successfully with proper bootstrap settings regardless of other
configuration options, because AWS EKS does not provide default user
data if an AMI ID is specified in the launch template. A minimal
example of the affected configuration is included after the diff.

The previous E2E tests only passed because they coincidentally used the
same AMI ID as the default AMI. The updated test intentionally chooses
the AMI of the previous minor version and verifies that the EC2
instances are using this exact AMI.

Fixes #1550
---
 nodejs/eks/nodegroup.ts                       |   6 +-
 tests/internal/utils.go                       | 121 +++++++++++++++---
 tests/nodejs_test.go                          |  18 +++
 .../testdata/programs/managed-ng-os/index.ts  |  23 +++-
 4 files changed, 142 insertions(+), 26 deletions(-)

diff --git a/nodejs/eks/nodegroup.ts b/nodejs/eks/nodegroup.ts
index 3de9fd4e6..35fc7e025 100644
--- a/nodejs/eks/nodegroup.ts
+++ b/nodejs/eks/nodegroup.ts
@@ -2113,7 +2113,7 @@ export function createManagedNodeGroupInternal(
     // amiType, releaseVersion and version cannot be set if an AMI ID is set in a custom launch template.
     // The AMI ID is set in the launch template if custom user data is required.
     // See https://docs.aws.amazon.com/eks/latest/userguide/launch-templates.html#mng-ami-id-conditions
-    if (requiresCustomUserData(userDataArgs) || args.userData) {
+    if (requiresCustomUserData(userDataArgs) || args.userData || args.amiId) {
         delete nodeGroupArgs.version;
         delete nodeGroupArgs.releaseVersion;
         delete nodeGroupArgs.amiType;
@@ -2228,7 +2228,9 @@ function createMNGCustomLaunchTemplate(
     });
 
     let userData: pulumi.Output | undefined;
-    if (requiresCustomUserData(customUserDataArgs) || args.userData) {
+    // When amiId is provided, we need to create a custom user data script because
+    // EKS will not provide default user data when an AMI ID is set in the launch template.
+    if (requiresCustomUserData(customUserDataArgs) || args.userData || args.amiId) {
         userData = pulumi
             .all([
                 clusterMetadata,
diff --git a/tests/internal/utils.go b/tests/internal/utils.go
index 071cd39ef..d06468d98 100644
--- a/tests/internal/utils.go
+++ b/tests/internal/utils.go
@@ -565,7 +565,7 @@ func validateCluster(t *testing.T, ctx context.Context, clusterName string, auth
 	err := RetryWithExponentialBackoff(ctx, 2*time.Second, func() error {
 		err := ValidateDeploymentHealth(t, clientset, deployment.Namespace, deployment.Name)
 		if err != nil {
-			t.Logf("Detected unhelathy deployment %q in namespace %q of cluster %s: %v", deployment.Name, deployment.Namespace, clusterName, err)
+			t.Logf("Detected unhealthy deployment %q in namespace %q of cluster %s: %v", deployment.Name, deployment.Namespace, clusterName, err)
 			labelSelector, selectorErr := findDeploymentLabelSelector(clientset, deployment.Namespace, deployment.Name, clusterName)
 			if selectorErr != nil {
 				return errors.Join(err, selectorErr)
 			}
@@ -586,7 +586,7 @@ func validateCluster(t *testing.T, ctx context.Context, clusterName string, auth
 	err := RetryWithExponentialBackoff(ctx, 2*time.Second, func() error {
 		err := ValidateDaemonSetHealth(t, clientset, daemonset.Namespace, daemonset.Name)
 		if err != nil {
-			t.Logf("Detected unhelathy daemonset %q in namespace %q of cluster %s: %v", daemonset.Name, daemonset.Namespace, clusterName, err)
+			t.Logf("Detected unhealthy daemonset %q in namespace %q of cluster %s: %v", daemonset.Name, daemonset.Namespace, clusterName, err)
 			labelSelector, selectorErr := findDaemonSetLabelSelector(clientset, daemonset.Namespace, daemonset.Name, clusterName)
 			if selectorErr != nil {
 				return errors.Join(err, selectorErr)
 			}
@@ -736,6 +736,36 @@ func mapClusterToNodeGroups(resources []apitype.ResourceV3) (clusterNodeGroupMap
 // Examples: i-1234567f, i-0123456789abcdef0
 var eC2InstanceIDPattern = regexp.MustCompile(`^i-[a-f0-9]{8,17}$`)
 
+// GetEC2InstanceIDs extracts the EC2 instance IDs from the given list of nodes.
+// It returns a map of instance IDs to nodes.
+func GetEC2InstanceIDs(nodes *corev1.NodeList) map[string]*corev1.Node {
+	if nodes == nil {
+		return nil
+	}
+
+	instanceIDToNode := make(map[string]*corev1.Node)
+
+	// Extract instance IDs from all nodes
+	for i := range nodes.Items {
+		node := &nodes.Items[i]
+		// Extract EC2 instance ID from node provider ID
+		// Provider ID format: aws:///az/i-1234567890abcdef0
+		providerID := node.Spec.ProviderID
+		instanceID := strings.TrimPrefix(providerID, "aws:///")
+		instanceID = strings.TrimPrefix(instanceID, strings.Split(instanceID, "/")[0]+"/")
+
+		// the list of k8s nodes can include nodes that are not EC2 instances (e.g. Fargate),
+		// those are not part of any ASGs. We already validated that all nodes are ready, so we can
+		// skip the nodes that are not EC2 instances. Their ID is not valid input to the
+		// DescribeAutoScalingInstances API used later and would cause an error.
+		if eC2InstanceIDPattern.MatchString(instanceID) {
+			instanceIDToNode[instanceID] = node
+		}
+	}
+
+	return instanceIDToNode
+}
+
 // ValidateNodeGroupInstances validates that all nodes in the cluster are healthy and have successfully joined.
 // It checks that each ASG has the expected number of instances and that all instances are properly registered as nodes.
 func ValidateNodeGroupInstances(t *testing.T, clientset *kubernetes.Clientset, asgClient *autoscaling.Client, ec2Client *ec2.Client, clusterName string, expectedNodeGroups map[string]nodeGroupData) error {
@@ -774,26 +804,11 @@ func ValidateNodeGroupInstances(t *testing.T, clientset *kubernetes.Clientset, a
 	}
 
 	asgInstanceCounts := make(map[string]int)
 
-	instanceIDs := make([]string, 0, len(nodes.Items))
-	instanceIDToNode := make(map[string]*corev1.Node)
-	// Extract instance IDs from all nodes
-	for i := range nodes.Items {
-		node := &nodes.Items[i]
-		// Extract EC2 instance ID from node provider ID
-		// Provider ID format: aws:///az/i-1234567890abcdef0
-		providerID := node.Spec.ProviderID
-		instanceID := strings.TrimPrefix(providerID, "aws:///")
-		instanceID = strings.TrimPrefix(instanceID, strings.Split(instanceID, "/")[0]+"/")
-
-		// the list of k8s nodes can include nodes that are not EC2 instances (e.g. Fargate),
-		// those are not part of any ASGs. We already validated that all nodes are ready, so we can
-		// skip the nodes that are not EC2 instances. Their ID is not valid input to the
-		// DescribeAutoScalingInstances API used later and would cause an error.
-		if eC2InstanceIDPattern.MatchString(instanceID) {
-			instanceIDs = append(instanceIDs, instanceID)
-			instanceIDToNode[instanceID] = node
-		}
+	instanceIDToNode := GetEC2InstanceIDs(nodes)
+	instanceIDs := make([]string, 0, len(instanceIDToNode))
+	for instanceID := range instanceIDToNode {
+		instanceIDs = append(instanceIDs, instanceID)
 	}
 
 	// For each node, get its EC2 instance ID, find which ASG it belongs to and verify that EC2 reports the instance as running.
@@ -864,6 +879,70 @@ func ValidateNodeGroupInstances(t *testing.T, clientset *kubernetes.Clientset, a
 	return nil
 }
 
+// FindNodesWithAmiID finds the nodes in a cluster that have the given AMI ID.
+// It returns a list of EC2 instance IDs.
+func FindNodesWithAmiID(t *testing.T, kubeconfig any, amiID string) ([]string, error) {
+	clientSet, err := clientSetFromKubeconfig(kubeconfig)
+	if err != nil {
+		return nil, err
+	}
+
+	nodes, err := clientSet.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
+	if err != nil {
+		return nil, err
+	}
+
+	instanceIDToNode := GetEC2InstanceIDs(nodes)
+	if len(instanceIDToNode) == 0 {
+		return nil, nil
+	}
+
+	var region string
+	var profile string
+	if p, ok := os.LookupEnv("AWS_PROFILE"); ok {
+		profile = p
+	}
+	if r, ok := os.LookupEnv("AWS_REGION"); ok {
+		region = r
+	}
+	awsConfig := loadAwsDefaultConfig(t, region, profile)
+	ec2Client := ec2.NewFromConfig(awsConfig)
+
+	nodesWithAmiID := make([]string, 0, len(instanceIDToNode))
+	// Collect the instance IDs to look up
+	instanceIDs := make([]string, 0, len(instanceIDToNode))
+	for id := range instanceIDToNode {
+		instanceIDs = append(instanceIDs, id)
+	}
+
+	// Describe instances in batches of 255 (AWS API limit)
+	for i := 0; i < len(instanceIDs); i += 255 {
+		end := i + 255
+		if end > len(instanceIDs) {
+			end = len(instanceIDs)
+		}
+
+		batch := instanceIDs[i:end]
+		describeInput := &ec2.DescribeInstancesInput{
+			InstanceIds: batch,
+		}
+		describeResult, err := ec2Client.DescribeInstances(context.TODO(), describeInput)
+		if err != nil {
+			return nil, err
+		}
+
+		for _, reservation := range describeResult.Reservations {
+			for _, instance := range reservation.Instances {
+				if *instance.ImageId == amiID {
+					nodesWithAmiID = append(nodesWithAmiID, *instance.InstanceId)
+				}
+			}
+		}
+	}
+
+	return nodesWithAmiID, nil
+}
+
 // ValidateNodes validates the nodes in a cluster contain the expected values.
 func ValidateNodes(t *testing.T, kubeconfig any, validateFn func(*corev1.NodeList)) error {
 	clientSet, err := clientSetFromKubeconfig(kubeconfig)
diff --git a/tests/nodejs_test.go b/tests/nodejs_test.go
index 16a5feb36..b227a6ef0 100644
--- a/tests/nodejs_test.go
+++ b/tests/nodejs_test.go
@@ -17,6 +17,7 @@ package tests
 
 import (
+	"context"
 	"fmt"
 	"os"
 	"path"
@@ -651,6 +652,23 @@ func TestAccManagedNodeGroupOS(t *testing.T) {
 			}
 			assert.Equal(t, 2, foundNodes, "Expected %s nodes with Nvidia GPUs", foundNodes)
 		}))
+
+		// Validate that the nodes with the custom AMI ID are present
+		customAmiId := info.Outputs["customAmiId"].(string)
+		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+		defer cancel()
+		err := utils.RetryWithExponentialBackoff(ctx, 10*time.Second, func() error {
+			nodes, err := utils.FindNodesWithAmiID(t, info.Outputs["kubeconfig"], customAmiId)
+			if err != nil {
+				return err
+			}
+			t.Logf("Found %d nodes with custom AMI ID: %v", len(nodes), nodes)
+			if len(nodes) != 2 {
+				return fmt.Errorf("expected 2 nodes with custom AMI ID, got %d", len(nodes))
+			}
+			return nil
+		})
+		require.NoError(t, err)
 	},
 })
diff --git a/tests/testdata/programs/managed-ng-os/index.ts b/tests/testdata/programs/managed-ng-os/index.ts
index 42f159e46..45d90e36b 100644
--- a/tests/testdata/programs/managed-ng-os/index.ts
+++ b/tests/testdata/programs/managed-ng-os/index.ts
@@ -306,8 +306,15 @@ const managedNodeGroupBottlerocketTaints = eks.createManagedNodeGroup("bottleroc
     },
 });
 
-// Create a simple AL2023 node group with a custom user data and a custom AMI
-const amiId = pulumi.interpolate`/aws/service/eks/optimized-ami/${cluster.core.cluster.version}/amazon-linux-2023/x86_64/standard/recommended/image_id`.apply(name =>
+// Fetch the AMI of the previous minor version. This way we test that the user-specified AMI is used instead of the default one.
+// MNGs tolerate a skew of one minor version.
+export const customAmiId = pulumi.all([cluster.eksCluster.version]).apply(([version]) => {
+    const versionParts = version.split('.');
+    const majorVersion = parseInt(versionParts[0], 10);
+    const minorVersion = parseInt(versionParts[1], 10) - 1;
+    const versionString = `${majorVersion}.${minorVersion}`;
+    return pulumi.interpolate`/aws/service/eks/optimized-ami/${versionString}/amazon-linux-2023/x86_64/standard/recommended/image_id`;
+}).apply(name =>
     aws.ssm.getParameter({ name }, { async: true })
 ).apply(result => result.value);
 
@@ -318,6 +325,7 @@ const customUserData = userdata.createUserData({
         serviceCidr: cluster.core.cluster.kubernetesNetworkConfig.serviceIpv4Cidr,
     },
     `--max-pods=${increasedPodCapacity} --node-labels=increased-pod-capacity=true`);
+// Create a simple AL2023 node group with custom user data and a custom AMI
 const managedNodeGroupAL2023CustomUserData = eks.createManagedNodeGroup("al-2023-mng-custom-userdata", {
     ...baseConfig,
     operatingSystem: eks.OperatingSystem.AL2023,
@@ -326,7 +334,16 @@ const managedNodeGroupAL2023CustomUserData = eks.createManagedNodeGroup("al-2023
     nodeRole: role,
     diskSize: 100,
     userData: customUserData,
-    amiId,
+    amiId: customAmiId,
+});
+
+const managedNodeGroupAL2023CustomAmi = eks.createManagedNodeGroup("al-2023-mng-custom-ami", {
+    ...baseConfig,
+    operatingSystem: eks.OperatingSystem.AL2023,
+    cluster: cluster,
+    instanceTypes: ["t3.medium"],
+    nodeRole: role,
+    amiId: customAmiId,
 });
 
 const managedNodeGroupAL2023NodeadmExtraOptions = eks.createManagedNodeGroup("al-2023-mng-extra-options", {
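
---

Reviewer note: below is a minimal, self-contained sketch of the configuration
this patch fixes: a managed node group that sets a custom amiId but no custom
user data. It is illustrative only; the resource names, the hard-coded 1.31
SSM parameter path, and the IAM role setup are assumptions and not part of
this PR.

    import * as aws from "@pulumi/aws";
    import * as eks from "@pulumi/eks";

    // A node role with the standard managed policies for EKS worker nodes.
    const role = new aws.iam.Role("example-ng-role", {
        assumeRolePolicy: aws.iam.assumeRolePolicyForPrincipal({ Service: "ec2.amazonaws.com" }),
        managedPolicyArns: [
            "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy",
            "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy",
            "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly",
        ],
    });

    const cluster = new eks.Cluster("example-cluster", {
        skipDefaultNodeGroup: true,
        instanceRoles: [role],
    });

    // Resolve an EKS-optimized AL2023 AMI via SSM. The Kubernetes version is
    // hard-coded here purely for illustration.
    const amiId = aws.ssm
        .getParameter({
            name: "/aws/service/eks/optimized-ami/1.31/amazon-linux-2023/x86_64/standard/recommended/image_id",
        })
        .then((result) => result.value);

    // No userData is configured. Before this patch, amiId was silently ignored
    // here and the node group launched with the default AMI; with this patch
    // the provider generates the required user data, so the custom AMI takes
    // effect (EKS provides no default user data when a launch template pins
    // an AMI ID).
    const nodeGroup = eks.createManagedNodeGroup("example-custom-ami-ng", {
        cluster: cluster,
        nodeRole: role,
        amiId: amiId,
        operatingSystem: eks.OperatingSystem.AL2023,
        instanceTypes: ["t3.medium"],
        scalingConfig: { minSize: 2, desiredSize: 2, maxSize: 2 },
    });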