diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 94be047f96..aa9f635263 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -202,6 +202,7 @@ jobs: python-version: '3.12' - name: Install AWS CDK run: | + pip install typeguard~=2.13 npm install -g aws-cdk pip install -r cloudformation/external-slurmdbd/requirements.txt - working-directory: cloudformation/external-slurmdbd diff --git a/CHANGELOG.md b/CHANGELOG.md index 6db8905465..12460f866a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,24 @@ CHANGELOG ========= +3.12.0 +------ + +**BUG FIXES** +- When mounting an external OpenZFS file system, it is no longer required to set outbound rules for ports 111, 2049, 20001, 20002, 20003. + +3.11.1 +------ + +**CHANGES** +- Pyxis is now disabled by default, so it must be manually enabled as documented in the product documentation. +- Upgrade Python runtime to version 3.12 in ParallelCluster Lambda Layer. +- Remove the pinning of setuptools to versions prior to 70.0.0. +- Upgrade libjwt to version 1.17.0. + +**BUG FIXES** +- Fix an issue in the way we configure the Pyxis Slurm plugin in ParallelCluster that can lead to job submission failures. + https://github.com/aws/aws-parallelcluster/issues/6459 +- Add missing permissions required by login nodes to the public template of policies. 3.11.0 ------ diff --git a/api/infrastructure/parallelcluster-api.yaml b/api/infrastructure/parallelcluster-api.yaml index 80f98364f4..f2f8050b6c 100644 --- a/api/infrastructure/parallelcluster-api.yaml +++ b/api/infrastructure/parallelcluster-api.yaml @@ -200,7 +200,7 @@ Resources: - parallelcluster/${Version}/layers/aws-parallelcluster/lambda-layer.zip - { Version: !FindInMap [ParallelCluster, Constants, Version]} CompatibleRuntimes: - - python3.9 + - python3.12 # We need to define three AWS::Serverless::Api due to an issue with the handling of AWS::NoValue # See related GitHub issue: https://github.com/aws/serverless-application-model/issues/1435 @@ -294,7 +294,7 @@ Resources: Value: api - Key: 'parallelcluster:version' Value: !FindInMap [ParallelCluster, Constants, Version] - Runtime: python3.9 + Runtime: python3.12 Handler: pcluster.api.awslambda.entrypoint.lambda_handler Layers: - !Ref PclusterLayer diff --git a/awsbatch-cli/setup.py b/awsbatch-cli/setup.py index cc26e96b9c..0d7c51033d 100644 --- a/awsbatch-cli/setup.py +++ b/awsbatch-cli/setup.py @@ -22,7 +22,7 @@ def readme(): VERSION = "1.4.0" REQUIRES = [ - "setuptools<70.0.0", + "setuptools", "boto3>=1.16.14", "tabulate>=0.8.8,<=0.8.10", ] diff --git a/awsbatch-cli/src/awsbatch/awsbhosts.py b/awsbatch-cli/src/awsbatch/awsbhosts.py index 5eb52e75a1..31afcae5f1 100644 --- a/awsbatch-cli/src/awsbatch/awsbhosts.py +++ b/awsbatch-cli/src/awsbatch/awsbhosts.py @@ -66,7 +66,7 @@ def __init__( mem_registered, cpu_avail, mem_avail, - ): + ): # pylint: disable=too-many-positional-arguments """Initialize the object.""" self.container_instance_arn = container_instance_arn self.status = status diff --git a/awsbatch-cli/src/awsbatch/awsbout.py b/awsbatch-cli/src/awsbatch/awsbout.py index 5b511c5966..d228656b43 100644 --- a/awsbatch-cli/src/awsbatch/awsbout.py +++ b/awsbatch-cli/src/awsbatch/awsbout.py @@ -81,7 +81,9 @@ def __init__(self, log, boto3_factory): self.log = log self.boto3_factory = boto3_factory - def run(self, job_id, head=None, tail=None, stream=None, stream_period=None): + def run( + self, job_id, head=None, tail=None, stream=None, stream_period=None + ): # pylint:
disable=too-many-positional-arguments """Print job output.""" log_stream = self.__get_log_stream(job_id) if log_stream: @@ -124,7 +126,9 @@ def __get_log_stream(self, job_id): fail("Error listing jobs from AWS Batch. Failed with exception: %s" % e) return log_stream - def __print_log_stream(self, log_stream, head=None, tail=None, stream=None, stream_period=None): # noqa: C901 FIXME + def __print_log_stream( # noqa: C901 FIXME + self, log_stream, head=None, tail=None, stream=None, stream_period=None + ): # pylint:disable=too-many-positional-arguments """ Ask for log stream and print it. diff --git a/awsbatch-cli/src/awsbatch/awsbqueues.py b/awsbatch-cli/src/awsbatch/awsbqueues.py index bd53f72f74..2d2518deb3 100644 --- a/awsbatch-cli/src/awsbatch/awsbqueues.py +++ b/awsbatch-cli/src/awsbatch/awsbqueues.py @@ -50,7 +50,7 @@ def _get_parser(): class Queue: """Generic queue object.""" - def __init__(self, arn, name, priority, status, status_reason): + def __init__(self, arn, name, priority, status, status_reason): # pylint: disable=too-many-positional-arguments """Initialize the object.""" self.arn = arn self.name = name diff --git a/awsbatch-cli/src/awsbatch/awsbstat.py b/awsbatch-cli/src/awsbatch/awsbstat.py index ae02b3e863..5ddb62e55e 100644 --- a/awsbatch-cli/src/awsbatch/awsbstat.py +++ b/awsbatch-cli/src/awsbatch/awsbstat.py @@ -94,7 +94,7 @@ def __init__( log_stream, log_stream_url, s3_folder_url, - ): + ): # pylint: disable=too-many-positional-arguments """Initialize the object.""" self.id = job_id self.name = name @@ -282,7 +282,9 @@ def __init__(self, log, boto3_factory): self.boto3_factory = boto3_factory self.batch_client = boto3_factory.get_client("batch") - def run(self, job_status, expand_children, job_queue=None, job_ids=None, show_details=False): + def run( + self, job_status, expand_children, job_queue=None, job_ids=None, show_details=False + ): # pylint: disable=too-many-positional-arguments """Print list of jobs, by filtering by queue or by ids.""" if job_ids: self.__populate_output_by_job_ids(job_ids, show_details or len(job_ids) == 1, include_parents=True) diff --git a/awsbatch-cli/src/awsbatch/awsbsub.py b/awsbatch-cli/src/awsbatch/awsbsub.py index fbb29a4c30..f1797e4fd2 100644 --- a/awsbatch-cli/src/awsbatch/awsbsub.py +++ b/awsbatch-cli/src/awsbatch/awsbsub.py @@ -444,7 +444,7 @@ def run( # noqa: C901 FIXME timeout=None, dependencies=None, env=None, - ): + ): # pylint: disable=too-many-positional-arguments """Submit the job.""" try: # array properties diff --git a/cli/setup.py b/cli/setup.py index 21691d4651..8dae4724c0 100644 --- a/cli/setup.py +++ b/cli/setup.py @@ -23,7 +23,7 @@ def readme(): VERSION = "3.12.0" CDK_VERSION = "1.164" REQUIRES = [ - "setuptools<70.0.0", + "setuptools", "boto3>=1.16.14", "tabulate>=0.8.8,<=0.8.10", "PyYAML>=5.3.1,!=5.4", diff --git a/cli/src/pcluster/validators/cluster_validators.py b/cli/src/pcluster/validators/cluster_validators.py index c17328ca32..f77050fa08 100644 --- a/cli/src/pcluster/validators/cluster_validators.py +++ b/cli/src/pcluster/validators/cluster_validators.py @@ -462,7 +462,9 @@ def _validate( # --------------- Storage validators --------------- # -def _is_access_allowed(security_groups_ids, subnets, port, security_groups_by_nodes, protocol="tcp"): +def _is_access_allowed( + security_groups_ids, subnets, port, security_groups_by_nodes, protocol="tcp", check_outbound=True +): """ Verify given list of security groups to check if they allow in and out access on the given port. 
@@ -508,7 +510,9 @@ def _is_access_allowed(security_groups_ids, subnets, port, security_groups_by_no out_access = out_access or _are_ip_ranges_and_sg_accessible( security_groups_by_nodes, dst_ip_ranges, dst_security_groups, subnets ) - return in_access and out_access + if check_outbound: + return in_access and out_access + return in_access def _are_ip_ranges_and_sg_accessible(security_groups_by_nodes, allowed_ip_ranges, allowed_security_groups, subnets): @@ -654,31 +658,45 @@ def _check_file_storage(self, security_groups_by_nodes, file_storages, subnet_id for protocol, ports in FSX_PORTS[file_storage.file_storage_type].items(): missing_ports = self._get_missing_ports( - security_groups_by_nodes, subnet_ids, network_interfaces, ports, protocol + security_groups_by_nodes, + subnet_ids, + network_interfaces, + ports, + protocol, + file_storage.file_storage_type, ) if missing_ports: + direction = "inbound and outbound" + if file_storage.file_storage_type == "OPENZFS": + direction = "inbound" self._add_failure( f"The current security group settings on file storage '{file_storage_id}' does not" " satisfy mounting requirement. The file storage must be associated to a security group" - f" that allows inbound and outbound {protocol.upper()} traffic through ports {ports}. " + f" that allows {direction} {protocol.upper()} traffic through ports {ports}. " f"Missing ports: {missing_ports}", FailureLevel.ERROR, ) - def _get_missing_ports(self, security_groups_by_nodes, subnet_ids, network_interfaces, ports, protocol): + def _get_missing_ports( + self, security_groups_by_nodes, subnet_ids, network_interfaces, ports, protocol, storage_type + ): missing_ports = [] for port in ports: fs_access = False for network_interface in network_interfaces: # Get list of security group IDs sg_ids = [sg.get("GroupId") for sg in network_interface.get("Groups")] + check_outbound = True + if storage_type == "OPENZFS": + check_outbound = False if _is_access_allowed( sg_ids, subnet_ids, port=port, security_groups_by_nodes=security_groups_by_nodes, protocol=protocol, + check_outbound=check_outbound, ): fs_access = True break diff --git a/cli/tests/pcluster/validators/test_cluster_validators.py b/cli/tests/pcluster/validators/test_cluster_validators.py index df565b1b05..54cad0fb02 100644 --- a/cli/tests/pcluster/validators/test_cluster_validators.py +++ b/cli/tests/pcluster/validators/test_cluster_validators.py @@ -1072,12 +1072,14 @@ def test_queue_name_validator(name, expected_message): @pytest.mark.parametrize( - "fsx_file_system_type, fsx_vpc, ip_permissions, nodes_security_groups, network_interfaces, " "expected_message", + "fsx_file_system_type, fsx_vpc, ip_permissions, ip_permissions_egress, nodes_security_groups, network_interfaces, " + "expected_message", [ ( # working case, right vpc and sg, multiple network interfaces "LUSTRE", "vpc-06e4ab6c6cEXAMPLE", [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}], + [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}], {frozenset({"sg-12345678"}), frozenset({"sg-12345678", "sg-23456789"})}, ["eni-09b9460295ddd4e5f", "eni-001b3cef7c78b45c4"], None, @@ -1086,6 +1088,7 @@ def test_queue_name_validator(name, expected_message): "LUSTRE", "vpc-06e4ab6c6cEXAMPLE", [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}], + [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}], {frozenset({"sg-12345678"}),
frozenset({"sg-12345678", "sg-23456789"})}, ["eni-09b9460295ddd4e5f"], None, @@ -1094,6 +1097,7 @@ def test_queue_name_validator(name, expected_message): "LUSTRE", "vpc-06e4ab6c6cEXAMPLE", [{"IpProtocol": "-1", "IpRanges": [{"CidrIp": "0.0.0.0/0"}]}], + [{"IpProtocol": "-1", "IpRanges": [{"CidrIp": "0.0.0.0/0"}]}], {frozenset({None}), frozenset({None})}, ["eni-09b9460295ddd4e5f"], None, @@ -1107,6 +1111,10 @@ def test_queue_name_validator(name, expected_message): {"IpProtocol": "-1", "IpRanges": [{"CidrIp": "10.0.1.0/25"}]}, {"IpProtocol": "tcp", "FromPort": 988, "ToPort": 988, "IpRanges": [{"CidrIp": "10.0.1.128/25"}]}, ], + [ + {"IpProtocol": "-1", "IpRanges": [{"CidrIp": "10.0.1.0/25"}]}, + {"IpProtocol": "tcp", "FromPort": 988, "ToPort": 988, "IpRanges": [{"CidrIp": "10.0.1.128/25"}]}, + ], {frozenset({None}), frozenset({None})}, ["eni-09b9460295ddd4e5f"], None, @@ -1119,6 +1127,10 @@ def test_queue_name_validator(name, expected_message): {"IpProtocol": "-1", "IpRanges": [{"CidrIp": "10.1.1.0/25"}]}, {"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}, ], + [ + {"IpProtocol": "-1", "IpRanges": [{"CidrIp": "10.1.1.0/25"}]}, + {"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}, + ], {frozenset({"sg-12345678"}), frozenset({"sg-12345678", "sg-23456789"})}, ["eni-09b9460295ddd4e5f"], None, @@ -1131,14 +1143,19 @@ def test_queue_name_validator(name, expected_message): {"IpProtocol": "-1", "IpRanges": [{"CidrIp": "10.0.0.0/23"}]}, {"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-34567890"}]}, ], + [ + {"IpProtocol": "-1", "IpRanges": [{"CidrIp": "10.0.0.0/23"}]}, + {"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-34567890"}]}, + ], {frozenset({"sg-12345678"}), frozenset({"sg-12345678", "sg-23456789"})}, ["eni-09b9460295ddd4e5f"], None, ), - ( # working case (OPENZFS), CIDR specified in the security group through ip ranges + ( # working case (OPENZFS), CIDR specified in the security group through ip ranges, no egress rules "OPENZFS", "vpc-06e4ab6c6cEXAMPLE", [{"IpProtocol": "-1", "IpRanges": [{"CidrIp": "0.0.0.0/0"}]}], + [], {frozenset({None}), frozenset({None})}, ["eni-09b9460295ddd4e5f"], None, @@ -1147,6 +1164,7 @@ def test_queue_name_validator(name, expected_message): "ONTAP", "vpc-06e4ab6c6cEXAMPLE", [{"IpProtocol": "-1", "IpRanges": [{"CidrIp": "0.0.0.0/0"}]}], + [{"IpProtocol": "-1", "IpRanges": [{"CidrIp": "0.0.0.0/0"}]}], {frozenset({None}), frozenset({None})}, ["eni-09b9460295ddd4e5f"], None, @@ -1155,6 +1173,7 @@ def test_queue_name_validator(name, expected_message): "LUSTRE", "vpc-06e4ab6c6cEXAMPLE", [{"IpProtocol": "-1", "PrefixListIds": [{"PrefixListId": "pl-12345"}]}], + [{"IpProtocol": "-1", "PrefixListIds": [{"PrefixListId": "pl-12345"}]}], {frozenset({None}), frozenset({None})}, ["eni-09b9460295ddd4e5f"], None, @@ -1164,6 +1183,7 @@ def test_queue_name_validator(name, expected_message): "LUSTRE", "vpc-06e4ab6c6cEXAMPLE", [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}], + [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}], {frozenset({None}), frozenset({None})}, ["eni-09b9460295ddd4e5f"], "The current security group settings on file storage .* does not satisfy mounting requirement. 
" @@ -1175,6 +1195,7 @@ def test_queue_name_validator(name, expected_message): "LUSTRE", "vpc-06e4ab6c6cEXAMPLE", [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-23456789"}]}], + [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-23456789"}]}], {frozenset({"sg-12345678"}), frozenset({"sg-12345678", "sg-23456789"})}, ["eni-09b9460295ddd4e5f"], "The current security group settings on file storage .* does not satisfy mounting requirement. " @@ -1186,17 +1207,19 @@ def test_queue_name_validator(name, expected_message): "OPENZFS", "vpc-06e4ab6c6cEXAMPLE", [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}], + [], {frozenset({None}), frozenset({None})}, ["eni-09b9460295ddd4e5f"], "The current security group settings on file storage .* does not satisfy mounting requirement. " "The file storage must be associated to a security group that " - r"allows inbound and outbound TCP traffic through ports \[111, 2049, 20001, 20002, 20003\].", + r"allows inbound TCP traffic through ports \[111, 2049, 20001, 20002, 20003\].", ), ( # not working case, wrong security group. Ontap # Security group without CIDR cannot work with clusters containing pcluster created security group. "ONTAP", "vpc-06e4ab6c6cEXAMPLE", [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}], + [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}], {frozenset({None}), frozenset({None})}, ["eni-09b9460295ddd4e5f"], "The current security group settings on file storage .* does not satisfy mounting requirement. " @@ -1207,6 +1230,7 @@ def test_queue_name_validator(name, expected_message): "LUSTRE", "vpc-06e4ab6c6cEXAMPLE", [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}], + [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}], {frozenset({"sg-12345678"}), frozenset({"sg-12345678", "sg-23456789"})}, [], "doesn't have Elastic Network Interfaces attached", @@ -1215,6 +1239,7 @@ def test_queue_name_validator(name, expected_message): "LUSTRE", "vpc-06e4ab6c6ccWRONG", [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}], + [{"IpProtocol": "-1", "UserIdGroupPairs": [{"UserId": "123456789012", "GroupId": "sg-12345678"}]}], {frozenset({"sg-12345678"}), frozenset({"sg-12345678", "sg-23456789"})}, ["eni-09b9460295ddd4e5f"], "only support using FSx file storage that is in the same VPC as the cluster", @@ -1232,6 +1257,16 @@ def test_queue_name_validator(name, expected_message): "UserIdGroupPairs": [], } ], + [ + { + "PrefixListIds": [], + "FromPort": 22, + "IpRanges": [{"CidrIp": "203.0.113.0/24"}], + "ToPort": 22, + "IpProtocol": "tcp", + "UserIdGroupPairs": [], + } + ], {frozenset({"sg-12345678"}), frozenset({"sg-12345678", "sg-23456789"})}, ["eni-09b9460295ddd4e5f"], [ @@ -1239,6 +1274,27 @@ def test_queue_name_validator(name, expected_message): "does not satisfy mounting requirement", ], ), + ( # not working case (OPENZFS), ingress rules don't cover all ports required + "OPENZFS", + "vpc-06e4ab6c6cEXAMPLE", + [ + { + "PrefixListIds": [], + "FromPort": 111, + "IpRanges": [{"CidrIp": "0.0.0.0/0"}], + "ToPort": 111, + "IpProtocol": "tcp", + "UserIdGroupPairs": [], + }, + ], + [], + {frozenset({None}), frozenset({None})}, + ["eni-09b9460295ddd4e5f"], + "The current security group settings on file storage .* 
does not satisfy mounting requirement. " + "The file storage must be associated to a security group that " + r"allows inbound TCP traffic through ports \[111, 2049, 20001, 20002, 20003\]. Missing ports: " + r"\[2049, 20001, 20002, 20003\]", + ), ], ) def test_fsx_network_validator( @@ -1246,6 +1302,7 @@ def test_fsx_network_validator( fsx_file_system_type, fsx_vpc, ip_permissions, + ip_permissions_egress, nodes_security_groups, network_interfaces, expected_message, @@ -1406,7 +1463,7 @@ def test_fsx_network_validator( describe_security_groups_response = { "SecurityGroups": [ { - "IpPermissionsEgress": ip_permissions, + "IpPermissionsEgress": ip_permissions_egress, "Description": "My security group", "IpPermissions": ip_permissions, "GroupName": "MySecurityGroup", diff --git a/cloudformation/custom_resource/cluster.yaml b/cloudformation/custom_resource/cluster.yaml index 1b03fa2225..192ff3863e 100644 --- a/cloudformation/custom_resource/cluster.yaml +++ b/cloudformation/custom_resource/cluster.yaml @@ -45,7 +45,7 @@ Resources: - parallelcluster/${Version}/layers/aws-parallelcluster/lambda-layer.zip - { Version: !FindInMap [ParallelCluster, Constants, Version] } CompatibleRuntimes: - - python3.9 + - python3.12 PclusterPolicies: Condition: UsePCPolicies @@ -341,7 +341,7 @@ Resources: helper(event, context) Handler: index.handler - Runtime: python3.9 + Runtime: python3.12 Role: !If [CustomRoleCondition, !Ref CustomLambdaRole, !GetAtt PclusterLambdaRole.Arn] Layers: - !Ref PclusterLayer @@ -395,7 +395,7 @@ Resources: reason = str(e) cfnresponse.send(event, context, response_status, {}, event.get('PhysicalResourceId', 'CleanupS3bucketCustomResource'), reason) Handler: index.handler - Runtime: python3.9 + Runtime: python3.12 Role: !If [CustomRoleCondition, !Ref CustomLambdaRole, !GetAtt PclusterLambdaRole.Arn] CleanupS3bucketCustomResource: diff --git a/cloudformation/external-slurmdbd/requirements.txt b/cloudformation/external-slurmdbd/requirements.txt index 765549ebd5..198ff634fa 100644 --- a/cloudformation/external-slurmdbd/requirements.txt +++ b/cloudformation/external-slurmdbd/requirements.txt @@ -1,3 +1,3 @@ -setuptools<70.0.0 +setuptools aws-cdk-lib~=2.105 constructs>=10.0.0,<11.0.0 diff --git a/cloudformation/policies/parallelcluster-policies.yaml b/cloudformation/policies/parallelcluster-policies.yaml index 1193f32654..9bd217d173 100644 --- a/cloudformation/policies/parallelcluster-policies.yaml +++ b/cloudformation/policies/parallelcluster-policies.yaml @@ -572,11 +572,13 @@ Resources: - autoscaling:DeleteAutoScalingGroup - autoscaling:DeleteLifecycleHook - autoscaling:DescribeAutoScalingGroups + - autoscaling:DescribeLifecycleHooks - autoscaling:DescribeScalingActivities - autoscaling:PutLifecycleHook - autoscaling:UpdateAutoScalingGroup - elasticloadbalancing:CreateListener - elasticloadbalancing:CreateTargetGroup + - elasticloadbalancing:DescribeTags - elasticloadbalancing:DeleteListener - elasticloadbalancing:DeleteLoadBalancer - elasticloadbalancing:DeleteTargetGroup diff --git a/cloudformation/proxy/proxy.yaml b/cloudformation/proxy/proxy.yaml index 837cb08bc3..6889021f66 100644 --- a/cloudformation/proxy/proxy.yaml +++ b/cloudformation/proxy/proxy.yaml @@ -294,7 +294,6 @@ Resources: Properties: GroupSet: - !Ref ProxySecurityGroup - InterfaceType: interface SourceDestCheck: false SubnetId: !Ref PublicSubnet diff --git a/tests/integration-tests/README.md b/tests/integration-tests/README.md index e0ba76bdcb..040da8f0fe 100644 --- a/tests/integration-tests/README.md +++ 
b/tests/integration-tests/README.md @@ -504,13 +504,13 @@ Here is how to define a simple parametrized test case: ```python def test_case_1(region, instance, os, scheduler): ``` This test case will be automatically parametrized and executed for all combination of input dimensions. -For example, given as input dimensions `--regions "eu-west-1" --instances "c4.xlarge" --oss "alinux2" +For example, given as input dimensions `--regions "eu-west-1" --instances "c5.xlarge" --oss "alinux2" "ubuntu1804" --scheduler "awsbatch" "slurm"`, the following tests will run: ``` -test_case_1[eu-west-1-c4.xlarge-alinux2-awsbatch] -test_case_1[eu-west-1-c4.xlarge-ubuntu1804-awsbatch] -test_case_1[eu-west-1-c4.xlarge-alinux2-slurm] -test_case_1[eu-west-1-c4.xlarge-ubuntu1804-slurm] +test_case_1[eu-west-1-c5.xlarge-alinux2-awsbatch] +test_case_1[eu-west-1-c5.xlarge-ubuntu1804-awsbatch] +test_case_1[eu-west-1-c5.xlarge-alinux2-slurm] +test_case_1[eu-west-1-c5.xlarge-ubuntu1804-slurm] ``` If you don't need to reference the parametrized arguments in your test case you can simply replace the @@ -529,7 +529,7 @@ test cases then you can do it in the following way: ```python @pytest.mark.usefixtures("region", "os", "instance", "scheduler") -@pytest.mark.parametrized("cluster_max_size", [5, 10]) +@pytest.mark.parametrize("cluster_max_size", [5, 10]) def test_case_2(cluster_max_size): ``` @@ -569,13 +569,13 @@ While the following test case: ```python @pytest.mark.skip_regions(["us-east-1", "eu-west-1"]) @pytest.mark.skip_dimensions("*", "c5.xlarge", "alinux2", "awsbatch") -@pytest.mark.skip_dimensions("*", "c4.xlarge", "centos7", "slurm") +@pytest.mark.skip_dimensions("*", "c5.xlarge", "centos7", "slurm") def test_case_2(region, instance, os, scheduler): ``` is allowed to run only if: * region is not `["us-east-1", "eu-west-1"]` * the triplet (instance, os, scheduler) is not `("c5.xlarge", "alinux2", "awsbatch")` or -`("c4.xlarge", "ubuntu2004", "slurm")` +`("c5.xlarge", "centos7", "slurm")` #### Default Invalid Dimensions diff --git a/tests/integration-tests/configs/common.jinja2 b/tests/integration-tests/configs/common.jinja2 index 2d73390d1f..0b1f0c89f3 100644 --- a/tests/integration-tests/configs/common.jinja2 +++ b/tests/integration-tests/configs/common.jinja2 @@ -16,7 +16,7 @@ {%- set INSTANCES_DEFAULT_ARM = ["m6g.xlarge"] -%} # m6g.xlarge is not supported in af-south-1, eu-south-1, eu-west-3, me-south-1 {%- set INSTANCES_DEFAULT = ["c5.xlarge", "m6g.xlarge"] -%} {%- set INSTANCES_EFA_SUPPORTED_X86 = ["c5n.9xlarge"] -%} -{%- set INSTANCES_EFA_UNSUPPORTED_X86 = ["t2.micro"] -%} +{%- set INSTANCES_EFA_UNSUPPORTED_X86 = ["t3.micro"] -%} {%- set NOT_RELEASED_OSES = ["rocky8", "rocky9"] -%} {%- macro instance(instance_key) -%} diff --git a/tests/integration-tests/configs/develop.yaml b/tests/integration-tests/configs/develop.yaml index 45abaea782..2a7ed624f1 100644 --- a/tests/integration-tests/configs/develop.yaml +++ b/tests/integration-tests/configs/develop.yaml @@ -825,3 +825,10 @@ test-suites: instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: [{{ OS_X86_7 }}] schedulers: ["slurm"] + pyxis: + test_pyxis.py::test_pyxis: + dimensions: + - regions: ["eu-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu2204"] + schedulers: ["slurm"] diff --git a/tests/integration-tests/configs/new_os.yaml b/tests/integration-tests/configs/new_os.yaml index 80fc58d22a..49bf724582 100644 --- a/tests/integration-tests/configs/new_os.yaml +++ b/tests/integration-tests/configs/new_os.yaml @@ -226,7 +226,7 @@ test-suites:
test_ephemeral.py::test_head_node_stop: dimensions: - regions: ["use1-az4"] - instances: ["m5d.xlarge", "d2.2xlarge"] + instances: ["m5d.xlarge"] oss: {{ NEW_OS }} schedulers: ["slurm"] update: diff --git a/tests/integration-tests/conftest_networking.py b/tests/integration-tests/conftest_networking.py index 43a0c51bf9..7643ba46ef 100644 --- a/tests/integration-tests/conftest_networking.py +++ b/tests/integration-tests/conftest_networking.py @@ -38,12 +38,8 @@ "ap-southeast-2": ["apse2-az1", "apse2-az2"], # FSx for Luster is not supported in apne1-az1 "ap-northeast-1": ["apne1-az4", "apne1-az2"], - # c4.xlarge is not supported in apne2-az2 - "ap-northeast-2": ["apne2-az1", "apne2-az3"], # c5.xlarge is not supported in apse1-az3 "ap-southeast-1": ["apse1-az2", "apse1-az1"], - # c4.xlarge is not supported in aps1-az2 - "ap-south-1": ["aps1-az1", "aps1-az3"], # NAT Gateway not available in sae1-az2 , c5n.18xlarge is not supported in sae1-az3 "sa-east-1": ["sae1-az1"], # m6g.xlarge instances not available in euw1-az3 diff --git a/tests/integration-tests/conftest_resource_bucket.py b/tests/integration-tests/conftest_resource_bucket.py index df76516508..954d1ddc6e 100644 --- a/tests/integration-tests/conftest_resource_bucket.py +++ b/tests/integration-tests/conftest_resource_bucket.py @@ -30,7 +30,7 @@ from tests.common.utils import get_installed_parallelcluster_version logger = logging.getLogger() -NODE_VERSION = "v16.19.0" # maintenance version compatible with alinux2's GLIBC +NODE_VERSION = "v18.20.3" def install_pc(basepath, pc_version): @@ -40,7 +40,9 @@ def install_pc(basepath, pc_version): cli_dir = root / "cli" try: logger.info("installing ParallelCluster packages...") - subprocess.check_call([sys.executable, "-m", "pip", "install", f"{cli_dir}[awslambda]", "-t", tempdir]) + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "jsonschema==4.17.3", f"{cli_dir}[awslambda]", "-t", tempdir] + ) # The following are provided by the lambda runtime shutil.rmtree(tempdir / "botocore") shutil.rmtree(tempdir / "boto3") diff --git a/tests/integration-tests/tests/pyxis/test_pyxis.py b/tests/integration-tests/tests/pyxis/test_pyxis.py new file mode 100644 index 0000000000..d25b2e4710 --- /dev/null +++ b/tests/integration-tests/tests/pyxis/test_pyxis.py @@ -0,0 +1,82 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import logging + +import boto3 +import pytest +from assertpy import assert_that +from remote_command_executor import RemoteCommandExecutor + +from tests.common.schedulers_common import SlurmCommands + + +@pytest.mark.parametrize("scale_up_fleet", [False]) +@pytest.mark.usefixtures("region", "os", "instance", "scheduler") +def test_pyxis(pcluster_config_reader, clusters_factory, test_datadir, s3_bucket_factory, region, scale_up_fleet): + """ + Test Pyxis and Enroot functionality after configuration. + + This test creates a cluster with the necessary custom actions to configure Pyxis and Enroot.
+ It submits two consecutive containerized jobs and verifies that they run successfully + and that their output contains the expected messages. + """ + # Set max_queue_size based on scale_up_fleet + max_queue_size = 1000 if scale_up_fleet else 3 + + # Create an S3 bucket for custom action scripts + bucket_name = s3_bucket_factory() + bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name) + + # Pre-upload to S3 the custom scripts that set up Pyxis + bucket.upload_file(str(test_datadir / "head_node_configure.sh"), "head_node_configure.sh") + bucket.upload_file(str(test_datadir / "compute_node_start.sh"), "compute_node_start.sh") + + cluster_config = pcluster_config_reader(bucket_name=bucket_name, max_queue_size=max_queue_size) + cluster = clusters_factory(cluster_config) + + remote_command_executor = RemoteCommandExecutor(cluster) + slurm_commands = SlurmCommands(remote_command_executor) + + # Submit the first containerized job on 3 or 1000 dynamic nodes + logging.info("Submitting first containerized job") + + result = slurm_commands.submit_command( + command="srun --container-image docker://ubuntu:22.04 hostname", + nodes=max_queue_size, + ) + job_id = slurm_commands.assert_job_submitted(result.stdout) + slurm_commands.wait_job_completed(job_id, timeout=30 if scale_up_fleet else 12) + slurm_commands.assert_job_succeeded(job_id) + + # Fetch the job output and check for the expected messages + logging.info("Checking output of the first job") + slurm_out_1 = remote_command_executor.run_remote_command("cat slurm-1.out").stdout + + logging.info("Checking for expected messages in first job output") + assert_that(slurm_out_1).contains("pyxis: imported docker image: docker://ubuntu:22.04") + + # Submit the second containerized job on a fixed count of 3 nodes after the first one completes + logging.info("Submitting second containerized job") + result = slurm_commands.submit_command( + command="srun --container-image docker://ubuntu:22.04 hostname", + nodes=3, + ) + job_id = slurm_commands.assert_job_submitted(result.stdout) + slurm_commands.wait_job_completed(job_id) + slurm_commands.assert_job_succeeded(job_id) + + # Fetch the job output and check for the expected messages + logging.info("Checking output of the second job") + slurm_out_2 = remote_command_executor.run_remote_command("cat slurm-2.out").stdout + + logging.info("Checking for expected messages in second job output") + assert_that(slurm_out_2).contains("pyxis: imported docker image: docker://ubuntu:22.04") diff --git a/tests/integration-tests/tests/pyxis/test_pyxis/test_pyxis/compute_node_start.sh b/tests/integration-tests/tests/pyxis/test_pyxis/test_pyxis/compute_node_start.sh new file mode 100644 index 0000000000..f03a12d65f --- /dev/null +++ b/tests/integration-tests/tests/pyxis/test_pyxis/test_pyxis/compute_node_start.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -e + +echo "Executing $0" + +# Configure Enroot +ENROOT_PERSISTENT_DIR="/var/enroot" +ENROOT_VOLATILE_DIR="/run/enroot" + +sudo mkdir -p $ENROOT_PERSISTENT_DIR +sudo chmod 1777 $ENROOT_PERSISTENT_DIR +sudo mkdir -p $ENROOT_VOLATILE_DIR +sudo chmod 1777 $ENROOT_VOLATILE_DIR +sudo mv /opt/parallelcluster/examples/enroot/enroot.conf /etc/enroot/enroot.conf +sudo chmod 0644 /etc/enroot/enroot.conf + +# Configure Pyxis +PYXIS_RUNTIME_DIR="/run/pyxis" + +sudo mkdir -p $PYXIS_RUNTIME_DIR +sudo chmod 1777 $PYXIS_RUNTIME_DIR diff --git a/tests/integration-tests/tests/pyxis/test_pyxis/test_pyxis/head_node_configure.sh
b/tests/integration-tests/tests/pyxis/test_pyxis/test_pyxis/head_node_configure.sh new file mode 100644 index 0000000000..f8560f480a --- /dev/null +++ b/tests/integration-tests/tests/pyxis/test_pyxis/test_pyxis/head_node_configure.sh @@ -0,0 +1,26 @@ +#!/bin/bash +set -e + +echo "Executing $0" + +# Configure Enroot +ENROOT_PERSISTENT_DIR="/var/enroot" +ENROOT_VOLATILE_DIR="/run/enroot" + +sudo mkdir -p $ENROOT_PERSISTENT_DIR +sudo chmod 1777 $ENROOT_PERSISTENT_DIR +sudo mkdir -p $ENROOT_VOLATILE_DIR +sudo chmod 1777 $ENROOT_VOLATILE_DIR +sudo mv /opt/parallelcluster/examples/enroot/enroot.conf /etc/enroot/enroot.conf +sudo chmod 0644 /etc/enroot/enroot.conf + +# Configure Pyxis +PYXIS_RUNTIME_DIR="/run/pyxis" + +sudo mkdir -p $PYXIS_RUNTIME_DIR +sudo chmod 1777 $PYXIS_RUNTIME_DIR + +sudo mkdir -p /opt/slurm/etc/plugstack.conf.d/ +sudo mv /opt/parallelcluster/examples/spank/plugstack.conf /opt/slurm/etc/ +sudo mv /opt/parallelcluster/examples/pyxis/pyxis.conf /opt/slurm/etc/plugstack.conf.d/ +sudo -i scontrol reconfigure diff --git a/tests/integration-tests/tests/pyxis/test_pyxis/test_pyxis/pcluster.config.yaml b/tests/integration-tests/tests/pyxis/test_pyxis/test_pyxis/pcluster.config.yaml new file mode 100644 index 0000000000..d6c2ea9e08 --- /dev/null +++ b/tests/integration-tests/tests/pyxis/test_pyxis/test_pyxis/pcluster.config.yaml @@ -0,0 +1,33 @@ +Image: + Os: {{ os }} +HeadNode: + InstanceType: {{ instance }} + Networking: + SubnetId: {{ public_subnet_id }} + Ssh: + KeyName: {{ key_name }} + CustomActions: + OnNodeConfigured: + Script: s3://{{ bucket_name }}/head_node_configure.sh + Iam: + S3Access: + - BucketName: {{ bucket_name }} +Scheduling: + Scheduler: {{ scheduler }} + SlurmQueues: + - Name: queue-0 + ComputeResources: + - Name: compute-resource-0 + Instances: + - InstanceType: t3.small + MinCount: 0 + MaxCount: {{ max_queue_size }} + Networking: + SubnetIds: + - {{ private_subnet_id }} + CustomActions: + OnNodeStart: + Script: s3://{{ bucket_name }}/compute_node_start.sh + Iam: + S3Access: + - BucketName: {{ bucket_name }} diff --git a/tests/integration-tests/tests/schedulers/test_awsbatch/test_awsbatch/pcluster.config.yaml b/tests/integration-tests/tests/schedulers/test_awsbatch/test_awsbatch/pcluster.config.yaml index a26cd9647e..ed5c4f7265 100644 --- a/tests/integration-tests/tests/schedulers/test_awsbatch/test_awsbatch/pcluster.config.yaml +++ b/tests/integration-tests/tests/schedulers/test_awsbatch/test_awsbatch/pcluster.config.yaml @@ -19,7 +19,7 @@ Scheduling: - Name: compute-resource-11 InstanceTypes: - {{ instance }} - # we usually use c4.xlarge and c5.xlarge for test, the min vcpus for one instance is 4. + # we usually use c5.xlarge for tests, the min vcpus for one instance is 4. MinvCpus: 4 DesiredvCpus: 8 MaxvCpus: 64
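Note on the validator change in `cli/src/pcluster/validators/cluster_validators.py`: the new `check_outbound` flag makes `_is_access_allowed` skip the egress-rule check for FSx for OpenZFS, which is why the test cases above pass an empty `ip_permissions_egress` list for OPENZFS and still expect success. The sketch below is a minimal, self-contained illustration of that behavior, not the actual implementation: the simplified `{"FromPort", "ToPort"}` rule dicts and the `has_port_access`/`missing_ports` helpers are assumptions standing in for the real `_are_ip_ranges_and_sg_accessible` logic, which also resolves CIDR ranges, prefix lists, and security-group references against the cluster subnets.

```python
# Illustrative sketch only (assumed simplification of the validators touched in
# this diff): rules are reduced to {"FromPort": int, "ToPort": int} dicts, and
# the OpenZFS port list is reused for every storage type purely to contrast the
# two code paths.

OPENZFS_PORTS = [111, 2049, 20001, 20002, 20003]


def has_port_access(rules, port):
    """Return True if any rule's port range covers the given port."""
    return any(rule["FromPort"] <= port <= rule["ToPort"] for rule in rules)


def is_access_allowed(ingress_rules, egress_rules, port, check_outbound=True):
    """Mirror the new `_is_access_allowed` contract: when `check_outbound`
    is False (the OPENZFS case), only ingress rules are inspected."""
    in_access = has_port_access(ingress_rules, port)
    if not check_outbound:
        return in_access
    return in_access and has_port_access(egress_rules, port)


def missing_ports(ingress_rules, egress_rules, storage_type):
    """Collect the ports the security group fails to open, the way
    `_get_missing_ports` does after this change."""
    check_outbound = storage_type != "OPENZFS"
    return [
        port
        for port in OPENZFS_PORTS
        if not is_access_allowed(ingress_rules, egress_rules, port, check_outbound)
    ]


if __name__ == "__main__":
    ingress = [{"FromPort": 111, "ToPort": 20003}]  # covers all required ports
    egress = []  # no outbound rules configured, as in the new OPENZFS test cases

    # OPENZFS no longer inspects egress, so nothing is reported missing:
    assert missing_ports(ingress, egress, "OPENZFS") == []
    # Other storage types still require matching outbound rules:
    assert missing_ports(ingress, egress, "LUSTRE") == OPENZFS_PORTS
    print("sketch OK")
```

This also matches the relaxed error message in the diff, which asks only for inbound TCP traffic on ports [111, 2049, 20001, 20002, 20003] when the storage type is OPENZFS, and keeps the "inbound and outbound" wording for the other FSx types.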