Skip to content

Commit

Permalink
Add options to install FSx Lustre and Nvidia software
Browse files Browse the repository at this point in the history
Signed-off-by: Hanwen <[email protected]>
  • Loading branch information
hanwen-cluster committed Nov 11, 2024
1 parent c389847 commit e3369f4
Show file tree
Hide file tree
Showing 9 changed files with 195 additions and 49 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ CHANGELOG
3.12.0
------

**ENHANCEMENTS**
- Add new build image configuration section `Build/Installation` to turn on/off Nvidia software and Lustre client installations. By default, Nvidia software, although included in official ParallelCluster AMIs, is not installed by `build-image`. By default, Lustre client is installed.

**CHANGES**
- The CLI commands `export-cluster-logs` and `export-image-logs` can now by default export the logs to the default ParallelCluster bucket or to the CustomS3Bucket if specified in the config.

Expand Down
56 changes: 51 additions & 5 deletions cli/src/pcluster/config/imagebuilder_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from pcluster.validators.imagebuilder_validators import (
AMIVolumeSizeValidator,
ComponentsValidator,
InstanceTypeSoftwareValidator,
SecurityGroupsAndSubnetValidator,
)
from pcluster.validators.kms_validators import KmsKeyIdEncryptedValidator, KmsKeyValidator
Expand Down Expand Up @@ -143,6 +144,41 @@ def __init__(
self.enabled = enabled


class LustreClient(Resource):
    """
    Represent the LustreClient configuration for the ImageBuilder.

    The ``enabled`` flag is routed through ``Resource.init_param`` with ``True`` as the default value,
    matching the documented behaviour that the Lustre client is installed unless turned off.
    """

    def __init__(self, enabled: bool = None):
        super().__init__()
        # True is handed to init_param as the default for a missing value.
        self.enabled = Resource.init_param(enabled, default=True)


class NvidiaSoftware(Resource):
    """
    Represent the NvidiaSoftware configuration for the ImageBuilder.

    The ``enabled`` flag is routed through ``Resource.init_param`` with ``False`` as the default value,
    matching the documented behaviour that Nvidia software is not installed unless turned on.
    """

    def __init__(self, enabled: bool = None):
        super().__init__()
        # False is handed to init_param as the default for a missing value.
        self.enabled = Resource.init_param(enabled, default=False)


class Installation(Resource):
    """
    Represent the installation configuration for the ImageBuilder.

    Groups the optional software toggles of a build: the Lustre client and the Nvidia software.
    A sub-section that is not supplied is replaced by a default-constructed instance, so both
    attributes are always populated after construction.
    """

    def __init__(self, lustre_client: LustreClient = None, nvidia_software: NvidiaSoftware = None):
        super().__init__()
        # A falsy argument (e.g. None) falls back to the sub-section's own defaults.
        self.lustre_client = lustre_client if lustre_client else LustreClient()
        self.nvidia_software = nvidia_software if nvidia_software else NvidiaSoftware()


class Build(Resource):
"""Represent the build configuration for the ImageBuilder."""

Expand All @@ -157,6 +193,7 @@ def __init__(
components: List[Component] = None,
update_os_packages: UpdateOsPackages = None,
imds: Imds = None,
installation: Installation = None,
):
super().__init__()
self.instance_type = Resource.init_param(instance_type)
Expand All @@ -168,13 +205,19 @@ def __init__(
self.components = components
self.update_os_packages = update_os_packages
self.imds = imds or Imds(implied="v2.0")
self.installation = installation or Installation()

def _register_validators(self, context: ValidatorContext = None): # noqa: D102 #pylint: disable=unused-argument
self._register_validator(
InstanceTypeBaseAMICompatibleValidator,
instance_type=self.instance_type,
image=self.parent_image,
)
self._register_validator(
InstanceTypeSoftwareValidator,
instance_type=self.instance_type,
nvidia=self.installation.nvidia_software.enabled,
)
self._register_validator(
ComponentsValidator,
components=self.components,
Expand Down Expand Up @@ -282,21 +325,24 @@ def lambda_functions_vpc_config(self):
class ImageBuilderExtraChefAttributes(ExtraChefAttributes):
"""Extra Attributes for ImageBuilder Chef Client."""

def __init__(self, dev_settings: ImagebuilderDevSettings):
super().__init__(dev_settings)
def __init__(self, config: ImageBuilderConfig):
super().__init__(config.dev_settings)
self.region = None
self.nvidia = None
self.lustre = None
self.is_official_ami_build = None
self.custom_node_package = None
self.custom_awsbatchcli_package = None
self.base_os = None
self.disable_kernel_update = None
self.slurm_patches_s3_archive = None
self._set_default(dev_settings)
self._set_default(config)

def _set_default(self, dev_settings: ImagebuilderDevSettings):
def _set_default(self, config: ImageBuilderConfig):
dev_settings = config.dev_settings
self.region = "{{ build.AWSRegion.outputs.stdout }}"
self.nvidia = {"enabled": "no"}
self.nvidia = {"enabled": "yes"} if config.build.installation.nvidia_software.enabled else {"enabled": "no"}
self.lustre = {"enabled": "yes"} if config.build.installation.lustre_client.enabled else {"enabled": "no"}
self.is_official_ami_build = "false"
self.custom_node_package = dev_settings.node_package if dev_settings and dev_settings.node_package else ""
self.custom_awsbatchcli_package = (
Expand Down
38 changes: 38 additions & 0 deletions cli/src/pcluster/schemas/imagebuilder_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
ImageBuilderConfig,
ImagebuilderDeploymentSettings,
ImagebuilderDevSettings,
Installation,
LustreClient,
NvidiaSoftware,
UpdateOsPackages,
Volume,
)
Expand Down Expand Up @@ -167,6 +170,40 @@ def make_resource(self, data, **kwargs):
return UpdateOsPackages(**data)


class LustreClientSchema(BaseSchema):
    """Represent the schema of the ImageBuilder LustreClient."""

    # Optional boolean toggle; when omitted, LustreClient applies its own default.
    enabled = fields.Bool()

    @post_load
    def make_resource(self, data, **kwargs):
        """Generate the LustreClient resource from the loaded data."""
        return LustreClient(**data)


class NvidiaSoftwareSchema(BaseSchema):
    """Represent the schema of the ImageBuilder NvidiaSoftware."""

    # Optional boolean toggle; when omitted, NvidiaSoftware applies its own default.
    enabled = fields.Bool()

    @post_load
    def make_resource(self, data, **kwargs):
        """Generate the NvidiaSoftware resource from the loaded data."""
        resource = NvidiaSoftware(**data)
        return resource


class InstallationSchema(BaseSchema):
    """Represent the schema of the ImageBuilder Installation."""

    # Each sub-section is optional and is loaded through its own nested schema.
    lustre_client = fields.Nested(LustreClientSchema)
    nvidia_software = fields.Nested(NvidiaSoftwareSchema)

    @post_load
    def make_resource(self, data, **kwargs):
        """Generate the Installation resource from the loaded data."""
        resource = Installation(**data)
        return resource


class BuildSchema(BaseSchema):
"""Represent the schema of the ImageBuilder Build."""

Expand All @@ -179,6 +216,7 @@ class BuildSchema(BaseSchema):
subnet_id = fields.Str(validate=get_field_validator("subnet_id"))
update_os_packages = fields.Nested(UpdateOsPackagesSchema)
imds = fields.Nested(ImdsSchema)
installation = fields.Nested(InstallationSchema)

@post_load
def make_resource(self, data, **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion cli/src/pcluster/templates/imagebuilder_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def _add_cfn_parameters(self):
self,
"CfnParamChefDnaJson",
type="String",
default=ImageBuilderExtraChefAttributes(self.config.dev_settings).dump_json(),
default=ImageBuilderExtraChefAttributes(self.config).dump_json(),
description="ChefAttributes",
)
CfnParameter(
Expand Down
14 changes: 14 additions & 0 deletions cli/src/pcluster/validators/imagebuilder_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,17 @@ def _validate(self, security_group_ids: list, subnet_id: str):
"Subnet id {0} is specified, security groups is required.".format(subnet_id),
FailureLevel.ERROR,
)


class InstanceTypeSoftwareValidator(Validator):
    """
    Validate software compatibility with instance type.

    When NVIDIA software installation is requested, the build instance type must report at least one GPU.
    """

    def _validate(self, instance_type: str, nvidia: bool):
        # Nothing to verify (and no instance type lookup needed) unless NVIDIA software is requested.
        if not nvidia:
            return

        gpu_count = AWSApi.instance().ec2.get_instance_type_info(instance_type).gpu_count()
        if gpu_count != 0:
            return

        message = (
            f"Instance type {instance_type} does not have GPU. "
            "NVIDIA software can only be installed on GPU instances."
        )
        self._add_failure(message, FailureLevel.ERROR)
6 changes: 6 additions & 0 deletions cli/tests/pcluster/config/dummy_imagebuilder_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
ImageBuilderConfig,
ImagebuilderDeploymentSettings,
ImagebuilderDevSettings,
Installation,
LustreClient,
NvidiaSoftware,
UpdateOsPackages,
Volume,
)
Expand All @@ -39,6 +42,9 @@
"additional_iam_policies": AdditionalIamPolicy,
"update_os_packages": UpdateOsPackages,
"imds": Imds,
"installation": Installation,
"lustre_client": LustreClient,
"nvidia_software": NvidiaSoftware,
}


Expand Down
Loading

0 comments on commit e3369f4

Please sign in to comment.