Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add validators to check head node instance type and shared storage type w.r.t cluster size #6623

Merged
merged 2 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions cli/src/pcluster/config/cluster_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
FsxArchitectureOsValidator,
HeadNodeImdsValidator,
HeadNodeLaunchTemplateValidator,
HeadNodeMemorySizeValidator,
HostedZoneValidator,
InstanceArchitectureCompatibilityValidator,
IntelHpcArchitectureValidator,
Expand All @@ -108,6 +109,7 @@
SchedulerDisableSudoAccessForDefaultUserValidator,
SchedulerOsValidator,
SchedulerValidator,
SharedEbsPerformanceBottleNeckValidator,
SharedFileCacheNotHomeValidator,
SharedStorageMountDirValidator,
SharedStorageNameValidator,
Expand Down Expand Up @@ -3030,6 +3032,7 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
self._register_validator(MultiNetworkInterfacesInstancesValidator, queues=self.scheduling.queues)
checked_images = []
capacity_reservation_id_max_count_map = {}
total_max_compute_nodes = 0
for index, queue in enumerate(self.scheduling.queues):
queue_image = self.image_dict[queue.name]
if index == 0:
Expand Down Expand Up @@ -3064,6 +3067,7 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
self._register_validator(AmiOsCompatibleValidator, os=self.image.os, image_id=queue_image)

for compute_resource in queue.compute_resources:
total_max_compute_nodes += compute_resource.max_count
self._register_validator(
InstanceArchitectureCompatibilityValidator,
instance_type_info_list=list(compute_resource.instance_type_info_map.values()),
Expand Down Expand Up @@ -3180,6 +3184,18 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
compute_resource_tags=compute_resource.get_tags(),
)

self._register_validator(
HeadNodeMemorySizeValidator,
head_node_instance_type=self.head_node.instance_type,
total_max_compute_nodes=total_max_compute_nodes,
)
if self.shared_storage:
for storage in self.shared_storage:
if isinstance(storage, SharedEbs):
self._register_validator(
SharedEbsPerformanceBottleNeckValidator,
total_max_compute_nodes=total_max_compute_nodes,
)
for capacity_reservation_id, num_of_instances in capacity_reservation_id_max_count_map.items():
self._register_validator(
CapacityReservationSizeValidator,
Expand Down
36 changes: 36 additions & 0 deletions cli/src/pcluster/validators/cluster_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1311,6 +1311,42 @@ def _validate(self, imds_secured: bool, scheduler: str):
)


class HeadNodeMemorySizeValidator(Validator):
    """
    Head Node Memory Size Validator.

    Verify if the Head Node has enough memory to manage compute nodes.
    """

    def _validate(self, head_node_instance_type: str, total_max_compute_nodes: int):
        # Instance memory is reported by EC2 in MiB; convert to GB for the check below.
        instance_type_info = AWSApi.instance().ec2.get_instance_type_info(head_node_instance_type)
        head_node_memory = instance_type_info.ec2memory_size_in_mib() / 1024
        # Rule of thumb: ~0.6 GB for the OS plus 1 GB per 25 compute nodes, capped at
        # 16 GB so very large clusters do not demand unbounded head-node memory.
        required_memory = min(total_max_compute_nodes / 25 + 0.6, 16)
        if head_node_memory >= required_memory:
            return
        self._add_failure(
            f"Head node instance type {head_node_instance_type} has {head_node_memory} GB of memory. "
            f"Please choose a head node instance type with at least {required_memory} GB of memory"
            f" to manage {total_max_compute_nodes} compute nodes.",
            FailureLevel.ERROR,
        )


class SharedEbsPerformanceBottleNeckValidator(Validator):
    """Warn potential performance bottleneck of using Shared EBS."""

    # Clusters at or below this size are unlikely to saturate the head node's
    # network bandwidth through the shared EBS mount.
    _MAX_NODES_WITHOUT_WARNING = 100

    def _validate(self, total_max_compute_nodes: int):
        if total_max_compute_nodes <= self._MAX_NODES_WITHOUT_WARNING:
            return
        self._add_failure(
            "EBS shared storage is mounted on the head node and shared to the compute nodes. "
            "Therefore, the head node network bandwidth is a network performance bottle neck "
            "if the compute nodes rely on this shared storage. "
            "Please use FSx and EFS for better performance.",
            FailureLevel.WARNING,
        )


class ComputeResourceLaunchTemplateValidator(_LaunchTemplateValidator):
"""Try to launch the requested instances (in dry-run mode) to verify configuration parameters."""

Expand Down
Loading