Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add validators to check head node instance type and shared storage type w.r.t cluster size #6623

Merged
merged 2 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions cli/src/pcluster/config/cluster_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
FsxArchitectureOsValidator,
HeadNodeImdsValidator,
HeadNodeLaunchTemplateValidator,
HeadNodeMemorySizeValidator,
HostedZoneValidator,
InstanceArchitectureCompatibilityValidator,
IntelHpcArchitectureValidator,
Expand All @@ -108,6 +109,7 @@
SchedulerDisableSudoAccessForDefaultUserValidator,
SchedulerOsValidator,
SchedulerValidator,
SharedEbsPerformanceBottleNeckValidator,
SharedFileCacheNotHomeValidator,
SharedStorageMountDirValidator,
SharedStorageNameValidator,
Expand Down Expand Up @@ -3030,6 +3032,7 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
self._register_validator(MultiNetworkInterfacesInstancesValidator, queues=self.scheduling.queues)
checked_images = []
capacity_reservation_id_max_count_map = {}
total_max_compute_nodes = 0
for index, queue in enumerate(self.scheduling.queues):
queue_image = self.image_dict[queue.name]
if index == 0:
Expand Down Expand Up @@ -3064,6 +3067,7 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
self._register_validator(AmiOsCompatibleValidator, os=self.image.os, image_id=queue_image)

for compute_resource in queue.compute_resources:
total_max_compute_nodes += compute_resource.max_count
self._register_validator(
InstanceArchitectureCompatibilityValidator,
instance_type_info_list=list(compute_resource.instance_type_info_map.values()),
Expand Down Expand Up @@ -3180,6 +3184,18 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
compute_resource_tags=compute_resource.get_tags(),
)

self._register_validator(
HeadNodeMemorySizeValidator,
head_node_instance_type=self.head_node.instance_type,
total_max_compute_nodes=total_max_compute_nodes,
)
if self.shared_storage:
for storage in self.shared_storage:
if isinstance(storage, SharedEbs):
self._register_validator(
SharedEbsPerformanceBottleNeckValidator,
total_max_compute_nodes=total_max_compute_nodes,
)
for capacity_reservation_id, num_of_instances in capacity_reservation_id_max_count_map.items():
self._register_validator(
CapacityReservationSizeValidator,
Expand Down
36 changes: 36 additions & 0 deletions cli/src/pcluster/validators/cluster_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1311,6 +1311,42 @@ def _validate(self, imds_secured: bool, scheduler: str):
)


class HeadNodeMemorySizeValidator(Validator):
    """
    Head Node Memory Size Validator.

    Verify if the Head Node has enough memory to manage compute nodes.
    """

    def _validate(self, head_node_instance_type: str, total_max_compute_nodes: int):
        # Instance memory is reported by EC2 in MiB; convert to GB for the check below.
        instance_type_info = AWSApi.instance().ec2.get_instance_type_info(head_node_instance_type)
        head_node_memory = instance_type_info.ec2memory_size_in_mib() / 1024
        # Rule of thumb: ~0.6 GB for the OS plus 1 GB per 25 compute nodes, capped at
        # 16 GB so very large clusters do not demand unbounded head-node memory.
        required_memory = min(total_max_compute_nodes / 25 + 0.6, 16)
        if head_node_memory >= required_memory:
            return
        self._add_failure(
            f"Head node instance type {head_node_instance_type} has {head_node_memory} GB of memory. "
            f"Please choose a head node instance type with at least {required_memory} GB of memory"
            f" to manage {total_max_compute_nodes} compute nodes.",
            FailureLevel.ERROR,
        )


class SharedEbsPerformanceBottleNeckValidator(Validator):
    """Warn potential performance bottleneck of using Shared EBS."""

    # Clusters at or below this size are unlikely to saturate the head node's
    # network bandwidth through the shared EBS mount.
    _MAX_NODES_WITHOUT_WARNING = 100

    def _validate(self, total_max_compute_nodes: int):
        if total_max_compute_nodes <= self._MAX_NODES_WITHOUT_WARNING:
            return
        self._add_failure(
            "EBS shared storage is mounted on the head node and shared to the compute nodes. "
            "Therefore, the head node network bandwidth is a network performance bottle neck "
            "if the compute nodes rely on this shared storage. "
            "Please use FSx and EFS for better performance.",
            FailureLevel.WARNING,
        )


class ComputeResourceLaunchTemplateValidator(_LaunchTemplateValidator):
"""Try to launch the requested instances (in dry-run mode) to verify configuration parameters."""

Expand Down
Loading