diff --git a/cli/src/pcluster/config/cluster_config.py b/cli/src/pcluster/config/cluster_config.py
index 42c7cfc16f..b6f535e041 100644
--- a/cli/src/pcluster/config/cluster_config.py
+++ b/cli/src/pcluster/config/cluster_config.py
@@ -87,6 +87,7 @@
     ExistingFsxNetworkingValidator,
     FsxArchitectureOsValidator,
     HeadNodeImdsValidator,
+    HeadNodeInstanceTypeValidator,
     HeadNodeLaunchTemplateValidator,
     HostedZoneValidator,
     InstanceArchitectureCompatibilityValidator,
@@ -108,6 +109,7 @@
     SchedulerDisableSudoAccessForDefaultUserValidator,
     SchedulerOsValidator,
     SchedulerValidator,
+    SharedEbsClusterSizeValidator,
     SharedFileCacheNotHomeValidator,
     SharedStorageMountDirValidator,
     SharedStorageNameValidator,
@@ -3030,6 +3032,7 @@ def _register_validators(self, context: ValidatorContext = None):  # noqa: C901
         self._register_validator(MultiNetworkInterfacesInstancesValidator, queues=self.scheduling.queues)
         checked_images = []
         capacity_reservation_id_max_count_map = {}
+        total_max_compute_nodes = 0
         for index, queue in enumerate(self.scheduling.queues):
             queue_image = self.image_dict[queue.name]
             if index == 0:
@@ -3064,6 +3067,7 @@ def _register_validators(self, context: ValidatorContext = None):  # noqa: C901
                 self._register_validator(AmiOsCompatibleValidator, os=self.image.os, image_id=queue_image)
 
             for compute_resource in queue.compute_resources:
+                total_max_compute_nodes += compute_resource.max_count
                 self._register_validator(
                     InstanceArchitectureCompatibilityValidator,
                     instance_type_info_list=list(compute_resource.instance_type_info_map.values()),
@@ -3180,6 +3184,17 @@ def _register_validators(self, context: ValidatorContext = None):  # noqa: C901
                     compute_resource_tags=compute_resource.get_tags(),
                 )
 
+        self._register_validator(
+            HeadNodeInstanceTypeValidator,
+            head_node_instance_type=self.head_node.instance_type,
+            total_max_compute_nodes=total_max_compute_nodes,
+        )
+        # Register the EBS warning once, no matter how many EBS volumes are shared
+        if any(isinstance(storage, SharedEbs) for storage in self.shared_storage or []):
+            self._register_validator(
+                SharedEbsClusterSizeValidator,
+                total_max_compute_nodes=total_max_compute_nodes,
+            )
         for capacity_reservation_id, num_of_instances in capacity_reservation_id_max_count_map.items():
             self._register_validator(
                 CapacityReservationSizeValidator,
diff --git a/cli/src/pcluster/validators/cluster_validators.py b/cli/src/pcluster/validators/cluster_validators.py
index 548db963c4..d3854b1626 100644
--- a/cli/src/pcluster/validators/cluster_validators.py
+++ b/cli/src/pcluster/validators/cluster_validators.py
@@ -1311,6 +1311,41 @@ def _validate(self, imds_secured: bool, scheduler: str):
             )
 
 
+class HeadNodeInstanceTypeValidator(Validator):
+    """
+    Head Node Instance Type Validator.
+
+    Verify if the Head Node has enough memory to manage compute nodes.
+    """
+
+    def _validate(self, head_node_instance_type: str, total_max_compute_nodes: int):
+        head_node_memory = (
+            AWSApi.instance().ec2.get_instance_type_info(head_node_instance_type).ec2memory_size_in_mib() / 1024
+        )
+        # Assume 0.6 GB for the OS plus 1 GB per 25 compute nodes. Cap at 16 GB so only small instance types fail.
+        required_memory = min(total_max_compute_nodes / 25 + 0.6, 16)
+        if head_node_memory < required_memory:
+            self._add_failure(
+                f"Head node instance type {head_node_instance_type} has {round(head_node_memory, 2)} GB of memory. "
+                f"Please choose a head node instance type with at least {round(required_memory, 2)} GB of memory"
+                f" to manage {total_max_compute_nodes} compute nodes.",
+                FailureLevel.ERROR,
+            )
+
+
+class SharedEbsClusterSizeValidator(Validator):
+    """Warn potential performance bottleneck of using Shared EBS."""
+
+    def _validate(self, total_max_compute_nodes: int):
+        if total_max_compute_nodes > 100:
+            self._add_failure(
+                "EBS shared storage is mounted on the head node and shared to the compute nodes. "
+                "This is a performance bottle neck if the compute nodes rely on this shared storage. "
+                "Please use FSx and EFS for better performance.",
+                FailureLevel.WARNING,
+            )
+
+
 class ComputeResourceLaunchTemplateValidator(_LaunchTemplateValidator):
     """Try to launch the requested instances (in dry-run mode) to verify configuration parameters."""
 