Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add validators to check head node instance type and shared storage type w.r.t cluster size #6623

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions cli/src/pcluster/config/cluster_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@
ExistingFsxNetworkingValidator,
FsxArchitectureOsValidator,
HeadNodeImdsValidator,
HeadNodeInstanceTypeValidator,
HeadNodeLaunchTemplateValidator,
HostedZoneValidator,
InstanceArchitectureCompatibilityValidator,
Expand All @@ -108,6 +109,7 @@
SchedulerDisableSudoAccessForDefaultUserValidator,
SchedulerOsValidator,
SchedulerValidator,
SharedEbsClusterSizeValidator,
SharedFileCacheNotHomeValidator,
SharedStorageMountDirValidator,
SharedStorageNameValidator,
Expand Down Expand Up @@ -3030,6 +3032,7 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
self._register_validator(MultiNetworkInterfacesInstancesValidator, queues=self.scheduling.queues)
checked_images = []
capacity_reservation_id_max_count_map = {}
total_max_compute_nodes = 0
for index, queue in enumerate(self.scheduling.queues):
queue_image = self.image_dict[queue.name]
if index == 0:
Expand Down Expand Up @@ -3064,6 +3067,7 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
self._register_validator(AmiOsCompatibleValidator, os=self.image.os, image_id=queue_image)

for compute_resource in queue.compute_resources:
total_max_compute_nodes += compute_resource.max_count
self._register_validator(
InstanceArchitectureCompatibilityValidator,
instance_type_info_list=list(compute_resource.instance_type_info_map.values()),
Expand Down Expand Up @@ -3180,6 +3184,18 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
compute_resource_tags=compute_resource.get_tags(),
)

self._register_validator(
HeadNodeInstanceTypeValidator,
head_node_instance_type=self.head_node.instance_type,
total_max_compute_nodes=total_max_compute_nodes,
)
if self.shared_storage:
for storage in self.shared_storage:
if isinstance(storage, SharedEbs):
self._register_validator(
SharedEbsClusterSizeValidator,
total_max_compute_nodes=total_max_compute_nodes,
)
for capacity_reservation_id, num_of_instances in capacity_reservation_id_max_count_map.items():
self._register_validator(
CapacityReservationSizeValidator,
Expand Down
35 changes: 35 additions & 0 deletions cli/src/pcluster/validators/cluster_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1311,6 +1311,41 @@ def _validate(self, imds_secured: bool, scheduler: str):
)


class HeadNodeInstanceTypeValidator(Validator):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should probably be more specific that just instance type and mention memory in the name so folks know what its checking

"""
Head Node Instance Type Validator.
Verify if the Head Node has enough memory to manage compute nodes.
"""

def _validate(self, head_node_instance_type: str, total_max_compute_nodes: int):
head_node_memory = (
AWSApi.instance().ec2.get_instance_type_info(head_node_instance_type).ec2memory_size_in_mib() / 1024
)
# Assume OS takes up 0.6GB memory. Only check upto 16GB memory to prevent usage of small instance types.
required_memory = min(total_max_compute_nodes / 25 + 0.6, 16)
if head_node_memory < required_memory:
self._add_failure(
f"Head node instance type {head_node_instance_type} has {head_node_memory} GB of memory. "
f"Please choose a head node instance type with at least {required_memory} GB of memory"
f" to manage {total_max_compute_nodes} compute nodes.",
FailureLevel.ERROR,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why decide to use FailureLevel.Error here but not a Warning? I think customers have the right to decide what instance type they pay for, but we have a responsibility to warn them.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because the requirement is very relaxed.
For example, the validator allows t3.micro to manage 10 nodes, t3.medium to manage 85 nodes, t3.xlarge to manage infinite nodes. If the requirement is violated, it is very likely the scaling up would fail.
Also, the customer could suppress the validation anyway

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Convinced me, I agree.

)


class SharedEbsClusterSizeValidator(Validator):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same naming comment as above, we should be able to understand what it does by it's name

"""Warn potential performance bottleneck of using Shared EBS."""

def _validate(self, total_max_compute_nodes: int):
if total_max_compute_nodes > 100:
self._add_failure(
"EBS shared storage is mounted on the head node and shared to the compute nodes. "
"This is a performance bottle neck if the compute nodes rely on this shared storage. "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not clear why this is a performance bottleneck. Performance of what components?

"Please use FSx and EFS for better performance.",
FailureLevel.WARNING,
)


class ComputeResourceLaunchTemplateValidator(_LaunchTemplateValidator):
"""Try to launch the requested instances (in dry-run mode) to verify configuration parameters."""

Expand Down
Loading