diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 42c81e6f73..a5550fac75 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -94,6 +94,13 @@ jobs:
         run: pip install tox
       - name: Run Tox
         run: cd ${{ matrix.toxdir }} && tox -e ${{ matrix.toxenv }}
+      - name: Upload code coverage report to Codecov
+        uses: codecov/codecov-action@v3
+        if: ${{ endsWith(matrix.toxenv, '-cov') }}
+        with:
+          files: cli/coverage.xml
+          flags: unittests
+          verbose: true
   awsbatch-cli-tests:
     name: AWS Batch CLI Tests
     runs-on: ${{ matrix.os }}
@@ -169,7 +176,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
-      - uses: mikefarah/yq@v4.6.3
+      - uses: mikefarah/yq@v4.32.2
      - run: api/docker/awslambda/docker-build.sh
  shellcheck:
    name: Shellcheck
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 3d7f9d77f6..47f800c001 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -22,9 +22,9 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v2
       - name: Initialize CodeQL
-        uses: github/codeql-action/init@v1
+        uses: github/codeql-action/init@v2
         with:
           languages: ${{ matrix.language }}
           queries: +security-and-quality
       - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@v1
+        uses: github/codeql-action/analyze@v2
diff --git a/.gitignore b/.gitignore
index 9dcf45faa6..605549a914 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,4 @@ report.html
 tests_outputs/
 .python-version
 test.yaml
+.vscode
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d65f699fb9..19070325f7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,6 +16,7 @@ repos:
       - id: check-symlinks
       - id: end-of-file-fixer
       - id: pretty-format-json
+        args: ['--autofix']
       - id: requirements-txt-fixer
       - id: mixed-line-ending
         args: ['--fix=no']
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a084117416..e19994adf8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,16 +4,58 @@ CHANGELOG
 3.6.0
 ----
 **ENHANCEMENTS**
+- Add a CloudFormation custom resource for creating and managing clusters from CloudFormation.
 - Add `mem_used_percent` and `disk_used_percent` metrics for head node memory and root volume disk utilization tracking on the ParallelCluster CloudWatch dashboard, and set up alarms for monitoring these metrics.
-
-**ENHANCEMENTS**
 - Add log rotation support for ParallelCluster managed logs.
+- Track common errors of compute nodes on the CloudWatch dashboard.
+- Increase the limit on the maximum number of queues per cluster from 10 to 100. Each cluster can, however, have a maximum of 150 compute resources, and each queue can have a maximum of 40 compute resources.
+- Allow specifying a sequence of multiple custom action scripts per event.
+- Add support for customizing the cluster Slurm configuration via the ParallelCluster configuration YAML file.
+- Track the longest dynamic node idle time in the CloudWatch dashboard.
+- Add new configuration section `HealthChecks/Gpu` for enabling the GPU health check on compute nodes before job execution.
+- Add support for `DetailedMonitoring` in the `Monitoring` section.
+- Add support for `Tags` in the `SlurmQueues` and `SlurmQueues/ComputeResources` sections.
+- Build Slurm with support for LUA.

 **CHANGES**
 - Increase the default `RetentionInDays` of CloudWatch logs from 14 to 180 days.
+- Set Slurm prolog and epilog configurations to target the directories /opt/slurm/etc/scripts/prolog.d/ and /opt/slurm/etc/scripts/epilog.d/, respectively.
+- Upgrade Slurm to version 23.02.1.
+- Upgrade munge to version 0.5.15.
+- Upgrade image used by CodeBuild environment when building container images for AWS Batch clusters, from
+  `aws/codebuild/amazonlinux2-x86_64-standard:3.0` to `aws/codebuild/amazonlinux2-x86_64-standard:4.0` and from
+  `aws/codebuild/amazonlinux2-aarch64-standard:1.0` to `aws/codebuild/amazonlinux2-aarch64-standard:2.0`.

 **BUG FIXES**
 - Fix EFS, FSx network security groups validators to avoid reporting false errors.
+- Fix missing tagging of resources created by ImageBuilder during the `build-image` operation.
+- Fix the update policy for MaxCount to always perform numerical comparisons on the MaxCount property.
+- Fix IP association on instances with multiple network cards.
+- Fix replacement of StoragePass in slurm_parallelcluster_slurmdbd.conf when a queue parameter update is performed and the Slurm accounting configurations are not updated.
+
+3.5.1
+-----
+**ENHANCEMENTS**
+- Add a new way to distribute ParallelCluster as a self-contained executable shipped with a dedicated installer.
+- Add support for US isolated region us-isob-east-1.
+
+**CHANGES**
+- Upgrade EFA installer to `1.22.0`
+  - Efa-driver: `efa-2.1.1g`
+  - Efa-config: `efa-config-1.13-1`
+  - Efa-profile: `efa-profile-1.5-1`
+  - Libfabric-aws: `libfabric-aws-1.17.0-1`
+  - Rdma-core: `rdma-core-43.0-1`
+  - Open MPI: `openmpi40-aws-4.1.5-1`
+- Upgrade NICE DCV to version `2022.2-14521`.
+  - server: `2022.2.14521-1`
+  - xdcv: `2022.2.519-1`
+  - gl: `2022.2.1012-1`
+  - web_viewer: `2022.2.14521-1`
+
+**BUG FIXES**
+- Fix cluster update to remove shared EBS volumes, which could potentially cause node launch failures if `MountDir` matched the same pattern in `/etc/exports`.
+- Fix the compute_console_output log file being truncated at every clustermgtd iteration.

 3.5.0
 -----
@@ -23,7 +65,6 @@ CHANGELOG
 - Add a Python library to allow customers to use ParallelCluster functionalities in their own code.
 - Add logging of compute node console output to CloudWatch on compute node bootstrap failure.
 - Add failures field containing failure code and reason to `describe-cluster` output when cluster creation fails.
-- Add support for US isolated regions: us-iso-* and us-isob-*.

 **CHANGES**
 - Upgrade Slurm to version 22.05.8.
@@ -204,6 +245,25 @@ CHANGELOG
 - Fix ParallelCluster API stack update failure when upgrading from a previus version. Add resource pattern used for the `ListImagePipelineImages` action in the `EcrImageDeletionLambdaRole`.
 - Fix ParallelCluster API adding missing permissions needed to import/export from S3 when creating an FSx for Lustre storage.
+3.1.5
+------
+
+**CHANGES**
+- Upgrade EFA installer to `1.18.0`
+  - Efa-driver: `efa-1.16.0-1`
+  - Efa-config: `efa-config-1.11-1`
+  - Efa-profile: `efa-profile-1.5-1`
+  - Libfabric-aws: `libfabric-aws-1.16.0~amzn4.0-1`
+  - Rdma-core: `rdma-core-41.0-2`
+  - Open MPI: `openmpi40-aws-4.1.4-2`
+- Add `lambda:ListTags` and `lambda:UntagResource` to `ParallelClusterUserRole` used by ParallelCluster API stack for cluster update.
+- Upgrade Intel MPI Library to 2021.6.0.602.
+- Upgrade NVIDIA driver to version 470.141.03.
+- Upgrade NVIDIA Fabric Manager to version 470.141.03.
+
+**BUG FIXES**
+- Fix Slurm issue that prevents termination of idle nodes.
+
 3.1.4
 ------
@@ -680,7 +740,7 @@ CHANGELOG
 - Improve retrieval of instance type info by using `DescribeInstanceType` API.
 - Remove `custom_awsbatch_template_url` configuration parameter.
 - Upgrade `pip` to latest version in virtual environments.
-- Upgrade image used by CodeBuild environment when building container images for Batch clusters, from
+- Upgrade image used by CodeBuild environment when building container images for AWS Batch clusters, from
   `aws/codebuild/amazonlinux2-x86_64-standard:1.0` to `aws/codebuild/amazonlinux2-x86_64-standard:3.0`.

 **BUG FIXES**
diff --git a/api/README.md b/api/README.md
index 0d5eb2f474..1eb8387e64 100644
--- a/api/README.md
+++ b/api/README.md
@@ -91,98 +91,11 @@ correctness of the API model evey time a PR is opened.
 The ParallelCluster OpenAPI Generator workflow (`workdlows/openapi_generator.yml`) defines a `generate-openapi-model`
 build step that automatically adds to the PR the generated OpenAPI model in case this was not included in the commit.

-## Packaging the API as an AWS Lambda container
+## Testing

-The `docker/awslambda` directory contains the definition of a Dockerfile that is used to package the ParallelCluster
-API as an AWS Lambda function. Running the `docker/awslambda/docker-build.sh` script will produce a `pcluster-lambda`
-Docker container that packages and exposes the ParallelCluster API in a format which is compatible with the AWS Lambda runtime.
+The API is a facade on top of the controllers (as well as the CLI), so much of the underlying functionality can be tested
+through unit tests and integration tests that exercise the operations.

-### Running Testing and Debugging the API locally
+To test the API specifically, there are integration tests which deploy the API and test the functionality using
+the generated client.

-Once the Docker image has been successfully built you have the following options:
-
-#### Run a shell in the container
-Use the following to run a shell in the container: `docker run -it --entrypoint /bin/bash pcluster-lambda`.
-
-This is particularly useful to debug issues with the container runtime.
-
-#### Run a local AWS Lambda endpoint
-Use the following to run a local AWS Lambda endpoint hosting the API: `docker run -e POWERTOOLS_TRACE_DISABLED=1 -e AWS_REGION=eu-west-1 -p 9000:8080 pcluster-lambda`
-
-Then you can use the following to send requests to the local endpoint:
-`curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d @docker/awslambda/test-events/event.json`
-
-This is useful to test the integration with AWS Lambda.
-
-#### Run the Flask development server
-Use the following to run a local Flask development server hosting the API: `docker run -p 8080:8080 --entrypoint python pcluster-lambda -m pcluster.api.flask_app`
-
-Then you can navigate to the following url to test the API: `http://0.0.0.0:8080/ui`
-Note that to enable swagger-ui you have to build the docker with `--build-arg PROFILE=dev`.
-
-This is particularly useful to ignore the AWS Lambda layer and directly hit the Flask application with plain HTTP requests.
-An even simpler way to do this which also offers live reloading of the API code, is to just ignore the Docker container
-and run a local Flask server on your host by executing `cd ../cli/src && python -m pcluster.api.flask_app`
-
-## Deploy the API test infrastructure with SAM cli (API Gateway + Lambda)
-The Serverless Application Model Command Line Interface (SAM CLI) is an extension of the AWS CLI that adds functionality
-for building and testing Lambda applications. It uses Docker to run your functions in an Amazon Linux environment that
-matches Lambda. It can also emulate your application's build environment and API.
-
-To use the SAM CLI, you need the following tools.
- -* SAM CLI - [Install the SAM CLI](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-install.html) -* Docker - [Install Docker community edition](https://hub.docker.com/search/?type=edition&offering=community) - -You may need the following for local testing. -* [Python 3 installed](https://www.python.org/downloads/) - -The `docker/awslambda/sam` directory contains a sample [SAM](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/what-is-sam.html) -template that can be used to test the ParallelCluster API. - -### Run a local AWS APIGateway endpoint with SAM -The SAM template can be used together with the SAM CLI to locally test the ParallelCluster API as if it were hosted -behind an API Gateway endpoint. - -To do so move to the `docker/awslambda/sam` directory and run: - -```bash -sam build -sam local start-api -``` - -To only invoke the AWS Lambda function locally you can run: -```bash -sam build -sam local invoke ParallelClusterFunction --event ../test-events/event.json -``` - -For further details and -to review all the testing features available through SAM please refer to the official -[SAM docs](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-test-and-debug.html). - -### Deploy the API test infrastructure -To build and deploy your application for the first time, run the following in your shell: - -```bash -sam build -sam deploy --guided -``` - -The first command will build a docker image from a Dockerfile and then copy the source of your application inside the Docker image. -The second command will package and deploy your application to AWS, with a series of prompts. - -#### Fetch, tail, and filter Lambda function logs - -To simplify troubleshooting, SAM CLI has a command called `sam logs`. `sam logs` lets you fetch logs generated by your -deployed Lambda function from the command line. In addition to printing the logs on the terminal, this command has -several nifty features to help you quickly find the bug. - -NOTE: This command works for all AWS Lambda functions; not just the ones you deploy using SAM. - -```bash -sam logs -n ParallelClusterFunction --stack-name pcluster-lambda --tail -``` - -You can find more information and examples about filtering Lambda function logs in the -[SAM CLI Documentation](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-logging.html). diff --git a/api/client/patch-client.sh b/api/client/patch-client.sh index 726ddf3d14..a269d561f0 100755 --- a/api/client/patch-client.sh +++ b/api/client/patch-client.sh @@ -7,6 +7,8 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. 
+set -ex + cp client/resources/sigv4_auth.py client/src/pcluster_client patch -u -N client/src/pcluster_client/api_client.py < client/resources/api_client.py.patch patch -u -N client/src/requirements.txt < client/resources/client-requirements.txt.patch diff --git a/api/client/resources/api_client.py.patch b/api/client/resources/api_client.py.patch index 244ce972dc..0ce2092d95 100644 --- a/api/client/resources/api_client.py.patch +++ b/api/client/resources/api_client.py.patch @@ -8,10 +8,10 @@ class ApiClient(object): -@@ -603,6 +604,9 @@ - if not auth_settings: +@@ -633,6 +634,9 @@ class ApiClient(object): + headers, queries, resource_path, method, body, auth_setting) return - + + if 'aws.auth.sigv4' in auth_settings: + sigv4_auth(method, self.configuration.host, resource_path, queries, body, headers) + diff --git a/api/docker/awslambda/sam/template.yaml b/api/docker/awslambda/sam/template.yaml deleted file mode 100644 index cd4973b8cb..0000000000 --- a/api/docker/awslambda/sam/template.yaml +++ /dev/null @@ -1,72 +0,0 @@ -AWSTemplateFormatVersion: '2010-09-09' -Transform: AWS::Serverless-2016-10-31 -Description: > - python3.8 - - Sample SAM Template for pcluster-lambda - -# More info about Globals: https://github.com/awslabs/serverless-application-model/blob/master/docs/globals.rst -Globals: - Function: - Timeout: 30 - MemorySize: 256 - -Resources: - ApiGatewayApi: - Type: AWS::Serverless::Api - Properties: - StageName: prod - Auth: - DefaultAuthorizer: AWS_IAM - TracingEnabled: True - EndpointConfiguration: - Type: REGIONAL - DefinitionBody: - Fn::Transform: - Name: AWS::Include - Parameters: - Location: ../../../spec/openapi/ParallelCluster.openapi.yaml - APIGatewayExecutionRole: - Type: AWS::IAM::Role - Properties: - AssumeRolePolicyDocument: - Version: 2012-10-17 - Statement: - - Effect: Allow - Principal: - Service: - - apigateway.amazonaws.com - Action: - - 'sts:AssumeRole' - Policies: - - PolicyName: lambda-invoke - PolicyDocument: - Version: "2012-10-17" - Statement: - - Effect: Allow - Action: lambda:InvokeFunction - Resource: !GetAtt ParallelClusterFunction.Arn - ParallelClusterFunction: - Type: AWS::Serverless::Function # More info about Function Resource: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#awsserverlessfunction - Properties: - PackageType: Image - Tracing: Active - Policies: - - AWSLambdaBasicExecutionRole - - AWSXRayDaemonWriteAccess - - AdministratorAccess # TODO: replace with less permissive - Metadata: - Dockerfile: ../api/docker/awslambda/Dockerfile - DockerContext: ../../../../cli - DockerTag: pcluster-api-sam - -Outputs: - # ServerlessRestApi is an implicit API created out of Events key under Serverless::Function - # Find out more about other implicit resources you can reference within SAM - # https://github.com/awslabs/serverless-application-model/blob/master/docs/internals/generated_resources.rst#api - ParallelClusterFunction: - Description: "ParallelCluster Lambda Function ARN" - Value: !GetAtt ParallelClusterFunction.Arn - ParallelClusterFunctionIamRole: - Description: "Implicit IAM Role created for ParallelCluster function" - Value: !GetAtt ParallelClusterFunctionRole.Arn diff --git a/api/docker/awslambda/test-events/event.json b/api/docker/awslambda/test-events/event.json deleted file mode 100644 index 9c9f8e8934..0000000000 --- a/api/docker/awslambda/test-events/event.json +++ /dev/null @@ -1,65 +0,0 @@ -{ - "body": "{\"message\": \"hello world\"}", - "resource": "/{proxy+}", - "path": "/v3/clusters", - 
"httpMethod": "GET", - "isBase64Encoded": false, - "queryStringParameters": { - "region": "us-east-1", - "foo": "bar" - }, - "pathParameters": { - "proxy": "/path/to/resource" - }, - "stageVariables": { - "baz": "qux" - }, - "headers": { - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - "Accept-Encoding": "gzip, deflate, sdch", - "Accept-Language": "en-US,en;q=0.8", - "Cache-Control": "max-age=0", - "CloudFront-Forwarded-Proto": "https", - "CloudFront-Is-Desktop-Viewer": "true", - "CloudFront-Is-Mobile-Viewer": "false", - "CloudFront-Is-SmartTV-Viewer": "false", - "CloudFront-Is-Tablet-Viewer": "false", - "CloudFront-Viewer-Country": "US", - "Content-Type": "application/json", - "Host": "1234567890.execute-api.us-east-1.amazonaws.com", - "Upgrade-Insecure-Requests": "1", - "User-Agent": "Custom User Agent String", - "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", - "X-Amz-Cf-Id": "cDehVQoZnx43VYQb9j2-nvCh-9z396Uhbp027Y2JvkCPNLmGJHqlaA==", - "X-Forwarded-For": "127.0.0.1, 127.0.0.2", - "X-Forwarded-Port": "443", - "X-Forwarded-Proto": "https", - "Authorization": "test" - }, - "requestContext": { - "accountId": "123456789012", - "resourceId": "123456", - "stage": "prod", - "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", - "requestTime": "09/Apr/2015:12:34:56 +0000", - "requestTimeEpoch": 1428582896000, - "identity": { - "cognitoIdentityPoolId": null, - "accountId": null, - "cognitoIdentityId": null, - "caller": null, - "accessKey": null, - "sourceIp": "127.0.0.1", - "cognitoAuthenticationType": null, - "cognitoAuthenticationProvider": null, - "userArn": null, - "userAgent": "Custom User Agent String", - "user": null - }, - "path": "/v3/clusters", - "resourcePath": "/{proxy+}", - "httpMethod": "GET", - "apiId": "1234567890", - "protocol": "HTTP/1.1" - } - } diff --git a/api/infrastructure/deploy-api.sh b/api/infrastructure/deploy-api.sh index 896892dcf3..309b7ce5a2 100755 --- a/api/infrastructure/deploy-api.sh +++ b/api/infrastructure/deploy-api.sh @@ -1,5 +1,4 @@ -#!/bin/bash -set -ex +#!/bin/bash -ex # Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance @@ -8,18 +7,14 @@ set -ex # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. 
-usage="$(basename "$0") [-h] --s3-bucket bucket-name --ecr-repo repo-name --region aws-region [--stack-name name] [--enable-iam-admin true|false] [--create-api-user true|false] [--skip-image-import])" +usage="$(basename "$0") [-h] --s3-bucket bucket-name --region aws-region [--stack-name name] [--enable-iam-admin true|false] [--create-api-user true|false])" SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" S3_BUCKET= -ECR_REPO= -IMAGE_BUILDER_VPC_ID= -IMAGE_BUILDER_SUBNET_ID= STACK_NAME="ParallelClusterApi" ENABLE_IAM_ADMIN="true" CREATE_API_USER="false" -SKIP_IMAGE_IMPORT="false" # if "true", uses the image uploaded to ECR directly, without creating a private copy while [[ $# -gt 0 ]] do key="$1" @@ -34,11 +29,6 @@ case $key in shift # past argument shift # past value ;; - --ecr-repo) - ECR_REPO=$2 - shift # past argument - shift # past value - ;; --region) export AWS_DEFAULT_REGION=$2 shift # past argument @@ -59,20 +49,6 @@ case $key in shift # past argument shift # past value ;; - --skip-image-import) - export SKIP_IMAGE_IMPORT="true" - shift # past argument - ;; - --image-builder-vpc-id) - export IMAGE_BUILDER_VPC_ID=$2 - shift # past argument - shift # past value - ;; - --image-builder-subnet-id) - export IMAGE_BUILDER_SUBNET_ID=$2 - shift # past argument - shift # past value - ;; --enable-fsx-s3-access) export ENABLE_FSX_S3_ACCESS=$2 shift # past argument @@ -90,35 +66,21 @@ case $key in esac done -if [ -z "${S3_BUCKET}" ] || [ -z "${ECR_REPO}" ] || [ -z "${AWS_DEFAULT_REGION}" ] ; then +if [ -z "${S3_BUCKET}" ] || [ -z "${AWS_DEFAULT_REGION}" ] ; then echo "$usage" >&2 exit 1 fi ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text) -ECR_ENDPOINT="${ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" S3_UPLOAD_URI="s3://${S3_BUCKET}/api/ParallelCluster.openapi.yaml" - -echo "Building docker image" -"${SCRIPT_DIR}/../docker/awslambda/docker-build.sh" - -echo "Pushing docker image to ${ECR_ENDPOINT}/${ECR_REPO}" -aws ecr get-login-password | docker login --username AWS --password-stdin "${ECR_ENDPOINT}" -docker tag pcluster-lambda:latest "${ECR_ENDPOINT}/${ECR_REPO}:latest" -docker push "${ECR_ENDPOINT}/${ECR_REPO}:latest" +POLICIES_S3_URI="s3://${S3_BUCKET}/stacks/parallelcluster-policies.yaml" +POLICIES_TEMPLATE_URI="http://${S3_BUCKET}.s3.${AWS_DEFAULT_REGION}.amazonaws.com/stacks/parallelcluster-policies.yaml" echo "Publishing OpenAPI specs to S3" aws s3 cp "${SCRIPT_DIR}/../spec/openapi/ParallelCluster.openapi.yaml" "${S3_UPLOAD_URI}" -echo "" -if [ "$SKIP_IMAGE_IMPORT" = "true" ]; then - echo "Uploaded image will be used directly" - OVERRIDE_IMAGE_URI_PRM="EcrImageUri" - ECR_IMAGE_URI="${ECR_ENDPOINT}/${ECR_REPO}:latest" -else - echo "Uploaded image will simulate a public image and will be therefore imported into a new private image" - OVERRIDE_IMAGE_URI_PRM="PublicEcrImageUri" -fi +echo "Publishing policies CloudFormation stack to S3" +aws s3 cp "${SCRIPT_DIR}/../../cloudformation/policies/parallelcluster-policies.yaml" "${POLICIES_S3_URI}" echo "Deploying API template" aws cloudformation deploy \ @@ -126,17 +88,7 @@ aws cloudformation deploy \ --template-file "${SCRIPT_DIR}/parallelcluster-api.yaml" \ --s3-bucket "${S3_BUCKET}" \ --s3-prefix "api/" \ - --parameter-overrides ApiDefinitionS3Uri="${S3_UPLOAD_URI}" "${OVERRIDE_IMAGE_URI_PRM}"="${ECR_ENDPOINT}/${ECR_REPO}:latest" \ + --parameter-overrides ApiDefinitionS3Uri="${S3_UPLOAD_URI}" \ + PoliciesTemplateUri="${POLICIES_TEMPLATE_URI}" \ 
EnableIamAdminAccess="${ENABLE_IAM_ADMIN}" CreateApiUserRole="${CREATE_API_USER}" \ - ImageBuilderVpcId="${IMAGE_BUILDER_VPC_ID}" \ - ImageBuilderSubnetId="${IMAGE_BUILDER_SUBNET_ID}" \ --capabilities CAPABILITY_NAMED_IAM - -echo "Updating API Lambda since updates are not fully automated yet" -LAMBDA_FUNCTION_ARN=$(aws cloudformation describe-stacks --stack-name "${STACK_NAME}" --query "Stacks[0].Outputs[?OutputKey=='ParallelClusterLambdaArn'].OutputValue" --output text) -IMPORTED_IMAGE_URI=$(aws cloudformation describe-stacks --stack-name "${STACK_NAME}" --query "Stacks[0].Outputs[?OutputKey=='UriOfCopyOfPublicEcrImage'].OutputValue" --output text) - -aws lambda update-function-code \ - --function-name "${LAMBDA_FUNCTION_ARN}" \ - --image-uri "${ECR_IMAGE_URI:-${IMPORTED_IMAGE_URI}}" \ - --publish diff --git a/api/infrastructure/parallelcluster-api.yaml b/api/infrastructure/parallelcluster-api.yaml index 027ab05ded..76fd956342 100644 --- a/api/infrastructure/parallelcluster-api.yaml +++ b/api/infrastructure/parallelcluster-api.yaml @@ -1,6 +1,6 @@ AWSTemplateFormatVersion: '2010-09-09' Transform: AWS::Serverless-2016-10-31 -Description: 'Template for the ParallelCluster API' +Description: 'AWS ParallelCluster API' Parameters: Region: @@ -8,11 +8,6 @@ Parameters: Type: String Default: '*' - EcrImageUri: - Description: When specified use this image for the Lambda function and skip the import phase - Type: String - Default: '' - ParallelClusterFunctionRole: Description: | When specified, the ARN of the execution role for the ParallelCluster Lambda function @@ -39,10 +34,10 @@ Parameters: Type: String Default: '' - PublicEcrImageUri: - Description: When specified, the URI of the Docker image for the Lambda of the ParallelCluster API + PoliciesTemplateUri: + Description: "S3 URI of the ParallelCluster Policies Template. Defaults to: s3://-aws-parallelcluster/parallelcluster/3.6.0/templates/policies/policies.yaml" Type: String - Default: public.ecr.aws/parallelcluster/pcluster-api:3.6.0 + Default: '' VpcEndpointId: Description: When specified, configure a private API with the specified endpoint @@ -95,13 +90,8 @@ Parameters: - true - false - ImageBuilderVpcId: - Description: Optional. Provide a specific vpc id to use for building the container images. Use this if you don't have a default vpc available. - Type: String - Default: '' - - ImageBuilderSubnetId: - Description: Optional. Provide a specific public subnet id to use for building the container images. Use this if you don't have a default vpc available. + CustomBucket: + Description: (Debug only) bucket to retrieve S3 artifacts for internal resources. 
Type: String Default: '' @@ -149,22 +139,13 @@ Globals: MetricsEnabled: true Conditions: - UseCustomEcrImageUri: !Not [!Equals [!Ref EcrImageUri, '']] UseCustomParallelClusterFunctionRole: !Not [!Equals [!Ref ParallelClusterFunctionRole, '']] CreateIamResources: !Not [!Condition UseCustomParallelClusterFunctionRole] - DoNotUseCustomEcrImageUri: !Not [!Condition UseCustomEcrImageUri] UseCustomDomain: !Not [!Equals [!Ref CustomDomainName, '']] UseRoute53Configuration: !Not [!Equals [!Ref CustomDomainHostedZoneId, '']] UseCustomDomainAndRoute53Configuration: !And - !Condition UseCustomDomain - !Condition UseRoute53Configuration - IsMultiRegion: !Equals [!Ref Region, '*'] - EnableIamPolicy: !And - - !Or - - !Equals [!Ref EnableIamAdminAccess, true] - - !Condition EnablePermissionsBoundary - - !Not [!Condition UseCustomParallelClusterFunctionRole] - EnablePermissionsBoundary: !Not [!Equals [!Ref PermissionsBoundaryPolicy, '']] UsePrivateVpcEndpoint: !And - !Not [!Condition UseCustomDomainAndRoute53Configuration] - !Not [!Equals [!Ref VpcEndpointId, '']] @@ -172,16 +153,44 @@ Conditions: - !Not [!Condition UsePrivateVpcEndpoint] - !Not [!Condition UseCustomDomainAndRoute53Configuration] CreateApiUserRoleCondition: !Equals [!Ref CreateApiUserRole, true] - NonDefaultVpc: - Fn::And: - - !Not [!Equals [!Ref ImageBuilderVpcId, ""]] - - !Not [!Equals [!Ref ImageBuilderSubnetId, ""]] - EnableFSxS3AccessCondition: !And - - !Equals [!Ref EnableFSxS3Access, true] - - !Condition CreateIamResources - UseAllBucketsForFSxS3: !Equals [!Ref FsxS3Buckets, "*"] + UseCustomBucket: !Not [!Equals [!Ref CustomBucket, '']] + UseCustomPoliciesTemplateUri: !Not [!Equals [!Ref PoliciesTemplateUri, '']] Resources: + # Policies nested stack + PclusterPolicies: + Type: AWS::CloudFormation::Stack + Condition: CreateIamResources + Properties: + TemplateURL: !If + - UseCustomPoliciesTemplateUri + - !Ref PoliciesTemplateUri + - !Sub + - "s3://${AWS::Region}-aws-parallelcluster/parallelcluster/${Version}/templates/policies/policies.yaml" + - { Version: !FindInMap [ParallelCluster, Constants, Version] } + TimeoutInMinutes: 10 + Parameters: + Region: !Ref Region + EnableFSxS3Access: !Ref EnableFSxS3Access + EnableIamAdminAccess: !Ref EnableIamAdminAccess + FsxS3Buckets: !Ref FsxS3Buckets + PermissionsBoundaryPolicy: !Ref PermissionsBoundaryPolicy + EnableBatchAccess: true + + PclusterLayer: + Type: AWS::Lambda::LayerVersion + Properties: + LayerName: !Sub + - PCLayer-${StackIdSuffix} + - { StackIdSuffix: !Select [2, !Split ['/', !Ref 'AWS::StackId']] } + Description: Library which contains aws-parallelcluster python package and dependencies + Content: + S3Bucket: !If [ UseCustomBucket, !Ref CustomBucket, !Sub "${AWS::Region}-aws-parallelcluster" ] + S3Key: !Sub + - parallelcluster/${Version}/layers/aws-parallelcluster/lambda-layer.zip + - { Version: !FindInMap [ParallelCluster, Constants, Version]} + CompatibleRuntimes: + - python3.9 # We need to define three AWS::Serverless::Api due to an issue with the handling of AWS::NoValue # See related GitHub issue: https://github.com/aws/serverless-application-model/issues/1435 @@ -259,23 +268,25 @@ Resources: Resource: !GetAtt ParallelClusterFunction.Arn ParallelClusterFunction: - Type: AWS::Serverless::Function + Type: AWS::Lambda::Function Properties: - Tracing: Active - PackageType: Image + TracingConfig: + Mode: Active MemorySize: 2048 - Role: !If [UseCustomParallelClusterFunctionRole, !Ref ParallelClusterFunctionRole, !GetAtt ParallelClusterUserRole.Arn] + Timeout: 30 + Role: !If 
[UseCustomParallelClusterFunctionRole, !Ref ParallelClusterFunctionRole, !GetAtt [ PclusterPolicies, Outputs.ParallelClusterLambdaRoleArn ]] Tags: - 'parallelcluster:resource': api - ImageUri: !If - - UseCustomEcrImageUri - - !Ref EcrImageUri - - !Sub - - ${AWS::AccountId}.dkr.ecr.${AWS::Region}.${AWS::URLSuffix}/${Repository}:${Version} - - Repository: !Ref PrivateEcrRepository - Version: !Join - - '-' - - [!Select [2, !Split ['/', !Ref EcrImage]], !Select [3, !Split ['/', !Ref EcrImage]]] + - Key: 'parallelcluster:resource' + Value: api + - Key: 'parallelcluster:version' + Value: !FindInMap [ParallelCluster, Constants, Version] + Runtime: python3.9 + Handler: pcluster.api.awslambda.entrypoint.lambda_handler + Layers: + - !Ref PclusterLayer + # Lambda fails creation without specifying a non-empty code or container + Code: + ZipFile: " " ParallelClusterApiUserRole: Type: AWS::IAM::Role @@ -309,716 +320,6 @@ Resources: - !Ref ApiGatewayApiWithoutCustomDomain Version: '2012-10-17' - ParallelClusterUserRole: - Type: AWS::IAM::Role - Condition: CreateIamResources - Properties: - AssumeRolePolicyDocument: - Statement: - - Effect: Allow - Action: sts:AssumeRole - Principal: - Service: lambda.amazonaws.com - ManagedPolicyArns: - # Required for Lambda logging and XRay - - !Sub arn:${AWS::Partition}:iam::aws:policy/AWSXRayDaemonWriteAccess - - !Sub arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole - # Required to run ParallelCluster functionalities - - !Ref ParallelClusterClusterPolicy - - !Ref ParallelClusterClusterPolicyBatch - - !Ref ParallelClusterBuildImageManagedPolicy - - !Ref ParallelClusterDeleteImageManagedPolicy - - !Ref ParallelClusterListImagesManagedPolicy - - !Ref ParallelClusterDescribeImageManagedPolicy - - !Ref ParallelClusterLogRetrievalPolicy - - ### IAM POLICIES - - DefaultParallelClusterIamAdminPolicy: - Type: AWS::IAM::ManagedPolicy - Condition: EnableIamPolicy - Properties: - Roles: - - !Ref ParallelClusterUserRole - PolicyDocument: - Version: '2012-10-17' - Statement: - - Action: - - iam:CreateServiceLinkedRole - - iam:DeleteRole - - iam:TagRole - Resource: - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/parallelcluster/* - Effect: Allow - Sid: IamRole - - Action: - - iam:CreateRole - Resource: - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/parallelcluster/* - Effect: Allow - Condition: !If - - EnablePermissionsBoundary - - StringEquals: - iam:PermissionsBoundary: - - !Ref PermissionsBoundaryPolicy - - !Ref AWS::NoValue - Sid: IamCreateRole - - Action: - - iam:PutRolePolicy - - iam:DeleteRolePolicy - Resource: !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/parallelcluster/* - Effect: Allow - Sid: IamInlinePolicy - Condition: !If - - EnablePermissionsBoundary - - StringEquals: - iam:PermissionsBoundary: - - !Ref PermissionsBoundaryPolicy - - !Ref AWS::NoValue - - Action: - - iam:AttachRolePolicy - - iam:DetachRolePolicy - Resource: !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/parallelcluster/* - Condition: - ArnLike: - iam:PolicyARN: - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/parallelcluster* - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/parallelcluster/* - - !Sub arn:${AWS::Partition}:iam::aws:policy/CloudWatchAgentServerPolicy - - !Sub arn:${AWS::Partition}:iam::aws:policy/AmazonSSMManagedInstanceCore - - !Sub arn:${AWS::Partition}:iam::aws:policy/AWSBatchFullAccess - - !Sub arn:${AWS::Partition}:iam::aws:policy/AmazonS3ReadOnlyAccess - - !Sub 
arn:${AWS::Partition}:iam::aws:policy/service-role/AWSBatchServiceRole - - !Sub arn:${AWS::Partition}:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role - - !Sub arn:${AWS::Partition}:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy - - !Sub arn:${AWS::Partition}:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole - - !Sub arn:${AWS::Partition}:iam::aws:policy/EC2InstanceProfileForImageBuilder - - !Sub arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole - StringEquals: !If - - EnablePermissionsBoundary - - iam:PermissionsBoundary: - - !Ref PermissionsBoundaryPolicy - - !Ref AWS::NoValue - Effect: Allow - Sid: IamPolicy - - ### CLUSTER ACTIONS POLICIES - - ParallelClusterClusterPolicyBatch: - Type: AWS::IAM::ManagedPolicy - Condition: CreateIamResources - Properties: - PolicyDocument: - Version: '2012-10-17' - Statement: - - Action: - - iam:PassRole - Resource: - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/parallelcluster/* - Effect: Allow - Condition: - StringEqualsIfExists: - iam:PassedToService: - - ecs-tasks.amazonaws.com - - batch.amazonaws.com - - codebuild.amazonaws.com - Sid: IamPassRole - - Action: - - iam:CreateServiceLinkedRole - - iam:DeleteServiceLinkedRole - Resource: - # AWS Batch creates a service linked role automatically for the ComputeEnvironment - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/aws-service-role/batch.amazonaws.com/* - Effect: Allow - Condition: - StringEquals: - iam:AWSServiceName: - - batch.amazonaws.com - - Action: - - codebuild:* - Resource: !Sub arn:${AWS::Partition}:codebuild:${Region}:${AWS::AccountId}:project/pcluster-* - Effect: Allow - - Action: - - ecr:* - Resource: '*' - Effect: Allow - Condition: !If - - IsMultiRegion - - !Ref AWS::NoValue - - StringEquals: - aws:RequestedRegion: - - !Ref Region - Sid: ECR - - Action: - - batch:* - Resource: '*' - Effect: Allow - Condition: !If - - IsMultiRegion - - !Ref AWS::NoValue - - StringEquals: - aws:RequestedRegion: - - !Ref Region - Sid: Batch - - Action: - - events:* - Effect: Allow - Condition: !If - - IsMultiRegion - - !Ref AWS::NoValue - - StringEquals: - aws:RequestedRegion: - - !Ref Region - Resource: '*' - Sid: AmazonCloudWatchEvents - - Action: - - ecs:DescribeContainerInstances - - ecs:ListContainerInstances - Resource: '*' - Effect: Allow - Condition: !If - - IsMultiRegion - - !Ref AWS::NoValue - - StringEquals: - aws:RequestedRegion: - - !Ref Region - Sid: ECS - - FSxS3AccessPolicy: - Type: AWS::IAM::Policy - Condition: EnableFSxS3AccessCondition - Properties: - PolicyName: FSxS3AccessPolicy - PolicyDocument: - Version: '2012-10-17' - Statement: - - Action: - - iam:CreateServiceLinkedRole - - iam:AttachRolePolicy - - iam:PutRolePolicy - Resource: !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/aws-service-role/s3.data-source.lustre.fsx.amazonaws.com/* - Effect: Allow - Sid: FSxS3PoliciesAttach - - Action: - - s3:Get* - - s3:List* - - s3:PutObject - Resource: !Split - - "," - - !If - - UseAllBucketsForFSxS3 - - "*" - - !Sub ["${FsxS3Buckets},${FsxS3BucketsObjects}", FsxS3BucketsObjects: !Join ["/*,", !Split [",", !Sub "${FsxS3Buckets}/*"]]] - Effect: Allow - Sid: EnableFSxS3Access - Roles: - - !Ref ParallelClusterUserRole - - ParallelClusterClusterPolicy: - Type: AWS::IAM::ManagedPolicy - Condition: CreateIamResources - Properties: - PolicyDocument: - Version: '2012-10-17' - Statement: - - Action: - - ec2:Describe* - Resource: '*' - Effect: Allow - Condition: !If - - IsMultiRegion - - !Ref AWS::NoValue 
- - StringEquals: - aws:RequestedRegion: - - !Ref Region - Sid: EC2Read - - Action: - - ec2:AllocateAddress - - ec2:AssociateAddress - - ec2:AttachNetworkInterface - - ec2:AuthorizeSecurityGroupEgress - - ec2:AuthorizeSecurityGroupIngress - - ec2:CreateFleet - - ec2:CreateLaunchTemplate - - ec2:CreateLaunchTemplateVersion - - ec2:CreateNetworkInterface - - ec2:CreatePlacementGroup - - ec2:CreateSecurityGroup - - ec2:CreateSnapshot - - ec2:CreateTags - - ec2:CreateVolume - - ec2:DeleteLaunchTemplate - - ec2:DeleteNetworkInterface - - ec2:DeletePlacementGroup - - ec2:DeleteSecurityGroup - - ec2:DeleteVolume - - ec2:DisassociateAddress - - ec2:ModifyLaunchTemplate - - ec2:ModifyNetworkInterfaceAttribute - - ec2:ModifyVolume - - ec2:ModifyVolumeAttribute - - ec2:ReleaseAddress - - ec2:RevokeSecurityGroupEgress - - ec2:RevokeSecurityGroupIngress - - ec2:RunInstances - - ec2:TerminateInstances - Resource: '*' - Effect: Allow - Condition: !If - - IsMultiRegion - - !Ref AWS::NoValue - - StringEquals: - aws:RequestedRegion: - - !Ref Region - Sid: EC2Write - - Action: - - dynamodb:DescribeTable - - dynamodb:ListTagsOfResource - - dynamodb:CreateTable - - dynamodb:DeleteTable - - dynamodb:GetItem - - dynamodb:PutItem - - dynamodb:UpdateItem - - dynamodb:Query - - dynamodb:TagResource - Resource: !Sub arn:${AWS::Partition}:dynamodb:${Region}:${AWS::AccountId}:table/parallelcluster-* - Effect: Allow - Sid: DynamoDB - - Action: - - route53:ChangeResourceRecordSets - - route53:ChangeTagsForResource - - route53:CreateHostedZone - - route53:DeleteHostedZone - - route53:GetChange - - route53:GetHostedZone - - route53:ListResourceRecordSets - - route53:ListQueryLoggingConfigs - Resource: '*' - Effect: Allow - Sid: Route53HostedZones - - Action: - - cloudformation:* - Resource: '*' - Effect: Allow - Condition: !If - - IsMultiRegion - - !Ref AWS::NoValue - - StringEquals: - aws:RequestedRegion: - - !Ref Region - Sid: CloudFormation - - Action: - - cloudwatch:PutDashboard - - cloudwatch:ListDashboards - - cloudwatch:DeleteDashboards - - cloudwatch:GetDashboard - - cloudwatch:PutMetricAlarm - - cloudwatch:DeleteAlarms - - cloudwatch:DescribeAlarms - Resource: '*' - Effect: Allow - Condition: !If - - IsMultiRegion - - !Ref AWS::NoValue - - StringEquals: - aws:RequestedRegion: - - !Ref Region - Sid: CloudWatch - - Action: - - iam:GetRole - - iam:GetRolePolicy - - iam:GetPolicy - - iam:SimulatePrincipalPolicy - - iam:GetInstanceProfile - Resource: - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/* - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/* - - !Sub arn:${AWS::Partition}:iam::aws:policy/* - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:instance-profile/* - Effect: Allow - Sid: IamRead - - Action: - - iam:CreateInstanceProfile - - iam:DeleteInstanceProfile - - iam:AddRoleToInstanceProfile - - iam:RemoveRoleFromInstanceProfile - Resource: - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:instance-profile/parallelcluster/* - Effect: Allow - Sid: IamInstanceProfile - - Action: - - iam:PassRole - Resource: - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/parallelcluster/* - Effect: Allow - Condition: - StringEqualsIfExists: - iam:PassedToService: - - lambda.amazonaws.com - - ec2.amazonaws.com - - ec2.amazonaws.com.cn - - spotfleet.amazonaws.com - Sid: IamPassRole - - Action: - - iam:CreateServiceLinkedRole - - iam:DeleteServiceLinkedRole - Resource: '*' - Effect: Allow - Condition: - StringEquals: - iam:AWSServiceName: - - fsx.amazonaws.com - - 
s3.data-source.lustre.fsx.amazonaws.com - - Action: - - lambda:CreateFunction - - lambda:TagResource - - lambda:DeleteFunction - - lambda:GetFunctionConfiguration - - lambda:GetFunction - - lambda:InvokeFunction - - lambda:AddPermission - - lambda:RemovePermission - - lambda:UpdateFunctionConfiguration - - lambda:ListTags - - lambda:UntagResource - Resource: - - !Sub arn:${AWS::Partition}:lambda:${Region}:${AWS::AccountId}:function:parallelcluster-* - - !Sub arn:${AWS::Partition}:lambda:${Region}:${AWS::AccountId}:function:pcluster-* - Effect: Allow - Sid: Lambda - - Action: - - s3:* - Resource: - - !Sub arn:${AWS::Partition}:s3:::parallelcluster-* - - !Sub arn:${AWS::Partition}:s3:::aws-parallelcluster-* - Effect: Allow - Condition: !If - - IsMultiRegion - - !Ref AWS::NoValue - - StringEquals: - aws:RequestedRegion: - - !Ref Region - Sid: S3ResourcesBucket - - Action: - - s3:Get* - - s3:List* - Resource: !Sub arn:${AWS::Partition}:s3:::${Region}-aws-parallelcluster* - Effect: Allow - Condition: !If - - IsMultiRegion - - !Ref AWS::NoValue - - StringEquals: - aws:RequestedRegion: - - !Ref Region - Sid: S3ParallelClusterReadOnly - - Action: - - fsx:* - Resource: - - !Sub arn:${AWS::Partition}:fsx:${Region}:${AWS::AccountId}:* - Effect: Allow - Sid: FSx - - Action: - - elasticfilesystem:* - Resource: - - !Sub arn:${AWS::Partition}:elasticfilesystem:${Region}:${AWS::AccountId}:* - Effect: Allow - Sid: EFS - - Action: - - logs:DeleteLogGroup - - logs:PutRetentionPolicy - - logs:DescribeLogGroups - - logs:CreateLogGroup - - logs:TagResource - - logs:UntagResource - Resource: '*' - Effect: Allow - Condition: !If - - IsMultiRegion - - !Ref AWS::NoValue - - StringEquals: - aws:RequestedRegion: - - !Ref Region - Sid: CloudWatchLogs - - Action: - - resource-groups:ListGroupResources - - resource-groups:GetGroupConfiguration - Resource: '*' - Effect: Allow - Sid: ResourceGroupRead - - - ### IMAGE ACTIONS POLICIES - - ParallelClusterBuildImageManagedPolicy: - Type: AWS::IAM::ManagedPolicy - Condition: CreateIamResources - Properties: - Description: Managed policy to execute pcluster build-image command without IAM permission - PolicyDocument: - Version: '2012-10-17' - Statement: - - Sid: EC2 - Effect: Allow - Action: - - ec2:DescribeImages - - ec2:DescribeInstanceTypeOfferings - - ec2:DescribeInstanceTypes - Resource: '*' - - Sid: IAM - Effect: Allow - Action: - - iam:CreateInstanceProfile - - iam:AddRoleToInstanceProfile - - iam:GetRole - - iam:GetRolePolicy - - iam:GetInstanceProfile - Resource: - - !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:instance-profile/parallelcluster/*' - - !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:instance-profile/ParallelClusterImage*' - - !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:role/parallelcluster/*' - - Sid: IAMPassRole - Effect: Allow - Action: - - iam:PassRole - Resource: - - !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:instance-profile/parallelcluster/*' - - !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:role/parallelcluster/*' - Condition: - StringEquals: - iam:PassedToService: - - lambda.amazonaws.com - - ec2.amazonaws.com - - ec2.amazonaws.com.cn - - Sid: CloudWatch - Effect: Allow - Action: - - logs:CreateLogGroup - - logs:TagResource - - logs:UntagResource - Resource: - - !Sub 'arn:${AWS::Partition}:logs:${Region}:${AWS::AccountId}:log-group:/aws/lambda/ParallelClusterImage-*' - - Sid: CloudFormation - Effect: Allow - Action: - - cloudformation:DescribeStacks - - cloudformation:CreateStack - Resource: - - !Sub 
'arn:${AWS::Partition}:cloudformation:${Region}:${AWS::AccountId}:stack/*' - - Sid: Lambda - Effect: Allow - Action: - - lambda:CreateFunction - - lambda:TagResource - - lambda:GetFunction - - lambda:AddPermission - Resource: - - !Sub 'arn:${AWS::Partition}:lambda:${Region}:${AWS::AccountId}:function:ParallelClusterImage-*' - - Sid: ImageBuilderGet - Effect: Allow - Action: - - imagebuilder:Get* - Resource: '*' - - Sid: ImageBuilder - Effect: Allow - Action: - - imagebuilder:CreateImage - - imagebuilder:TagResource - - imagebuilder:CreateImageRecipe - - imagebuilder:CreateComponent - - imagebuilder:CreateDistributionConfiguration - - imagebuilder:CreateInfrastructureConfiguration - Resource: - - !Sub 'arn:${AWS::Partition}:imagebuilder:${Region}:${AWS::AccountId}:image/parallelclusterimage-*' - - !Sub 'arn:${AWS::Partition}:imagebuilder:${Region}:${AWS::AccountId}:image-recipe/parallelclusterimage-*' - - !Sub 'arn:${AWS::Partition}:imagebuilder:${Region}:${AWS::AccountId}:component/parallelclusterimage-*' - - !Sub 'arn:${AWS::Partition}:imagebuilder:${Region}:${AWS::AccountId}:distribution-configuration/parallelclusterimage-*' - - !Sub 'arn:${AWS::Partition}:imagebuilder:${Region}:${AWS::AccountId}:infrastructure-configuration/parallelclusterimage-*' - - Sid: S3Bucket - Effect: Allow - Action: - - s3:CreateBucket - - s3:ListBucket - Resource: - - !Sub 'arn:${AWS::Partition}:s3:::parallelcluster-*' - - Sid: SNS - Effect: Allow - Action: - - sns:GetTopicAttributes - - sns:TagResource - - sns:CreateTopic - - sns:Subscribe - - sns:Publish - Resource: - - !Sub 'arn:${AWS::Partition}:sns:${Region}:${AWS::AccountId}:ParallelClusterImage-*' - - Sid: S3Objects - Effect: Allow - Action: - - s3:PutObject - - s3:GetObject - Resource: - - !Sub 'arn:${AWS::Partition}:s3:::parallelcluster-*/*' - - Action: - - iam:CreateServiceLinkedRole - Resource: - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/aws-service-role/imagebuilder.amazonaws.com/AWSServiceRoleForImageBuilder - Effect: Allow - Condition: - StringLike: - iam:AWSServiceName: - - imagebuilder.amazonaws.com - - ParallelClusterDeleteImageManagedPolicy: - Type: AWS::IAM::ManagedPolicy - Condition: CreateIamResources - Properties: - Description: Managed policy to execute pcluster delete-image command without IAM permission - PolicyDocument: - Version: '2012-10-17' - Statement: - - Sid: EC2 - Effect: Allow - Action: - - ec2:DeregisterImage - - ec2:DescribeImages - - ec2:DeleteSnapshot - Resource: '*' - - Sid: IAM - Effect: Allow - Action: - - iam:RemoveRoleFromInstanceProfile - Resource: - - !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:instance-profile/parallelcluster/*' - - !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:role/parallelcluster/*' - - Sid: ImageBuilder - Effect: Allow - Action: - - imagebuilder:DeleteImage - - imagebuilder:GetImage - - imagebuilder:CancelImageCreation - - imagebuilder:DeleteComponent - - imagebuilder:DeleteImageRecipe - - imagebuilder:DeleteInfrastructureConfiguration - - imagebuilder:DeleteDistributionConfiguration - Resource: - - !Sub 'arn:${AWS::Partition}:imagebuilder:${Region}:${AWS::AccountId}:image/parallelclusterimage-*' - - !Sub 'arn:${AWS::Partition}:imagebuilder:${Region}:${AWS::AccountId}:image-recipe/parallelclusterimage-*' - - !Sub 'arn:${AWS::Partition}:imagebuilder:${Region}:${AWS::AccountId}:component/parallelclusterimage-*' - - !Sub 'arn:${AWS::Partition}:imagebuilder:${Region}:${AWS::AccountId}:distribution-configuration/parallelclusterimage-*' - - !Sub 
'arn:${AWS::Partition}:imagebuilder:${Region}:${AWS::AccountId}:infrastructure-configuration/parallelclusterimage-*' - - Sid: CloudFormation - Effect: Allow - Action: - - cloudformation:DescribeStacks - - cloudformation:DeleteStack - Resource: - - !Sub 'arn:${AWS::Partition}:cloudformation:${Region}:${AWS::AccountId}:stack/*' - - Sid: Lambda - Effect: Allow - Action: - - lambda:RemovePermission - - lambda:DeleteFunction - - lambda:AddPermission - Resource: - - !Sub 'arn:${AWS::Partition}:lambda:${Region}:${AWS::AccountId}:function:ParallelClusterImage-*' - - Sid: SNS - Effect: Allow - Action: - - SNS:DeleteTopic - - SNS:Unsubscribe - - SNS:GetTopicAttributes - Resource: - - !Sub 'arn:${AWS::Partition}:sns:${Region}:${AWS::AccountId}:ParallelClusterImage-*' - - Sid: S3Bucket - Effect: Allow - Action: - - s3:ListBucket - - s3:ListBucketVersions - Resource: - - !Sub 'arn:${AWS::Partition}:s3:::parallelcluster-*' - - Sid: S3Objects - Effect: Allow - Action: - - s3:PutObject - - s3:GetObject - - s3:GetObjectVersion - - s3:DeleteObject - - s3:DeleteObjectVersion - Resource: - - !Sub 'arn:${AWS::Partition}:s3:::parallelcluster-*/*' - - Sid: CloudWatch - Effect: Allow - Action: - - logs:DeleteLogGroup - Resource: - - !Sub 'arn:${AWS::Partition}:logs:${Region}:${AWS::AccountId}:log-group:/aws/imagebuilder/ParallelClusterImage-*' - - !Sub 'arn:${AWS::Partition}:logs:${Region}:${AWS::AccountId}:log-group:/aws/lambda/ParallelClusterImage-*' - - ParallelClusterListImagesManagedPolicy: - Type: AWS::IAM::ManagedPolicy - Condition: CreateIamResources - Properties: - Description: Managed policy to execute pcluster list-images command - PolicyDocument: - Version: '2012-10-17' - Statement: - - Sid: EC2 - Effect: Allow - Action: - - ec2:DescribeImages - Resource: '*' - - Sid: CloudFormation - Effect: Allow - Action: - - cloudformation:DescribeStacks - Resource: - - '*' - - ParallelClusterDescribeImageManagedPolicy: - Type: AWS::IAM::ManagedPolicy - Condition: CreateIamResources - Properties: - Description: Managed policy to execute pcluster describe-image command - PolicyDocument: - Version: '2012-10-17' - Statement: - - Sid: EC2 - Effect: Allow - Action: - - ec2:DescribeImages - Resource: '*' - - Sid: CloudFormation - Effect: Allow - Action: - - cloudformation:DescribeStacks - Resource: - - !Sub 'arn:${AWS::Partition}:cloudformation:${Region}:${AWS::AccountId}:stack/*' - - ### LOG COMMANDS - - ParallelClusterLogRetrievalPolicy: - Type: AWS::IAM::ManagedPolicy - Condition: CreateIamResources - Properties: - Description: Policies needed to retrieve cluster and images logs - PolicyDocument: - Version: '2012-10-17' - Statement: - - Action: - - logs:DescribeLogGroups - - logs:FilterLogEvents - - logs:GetLogEvents - - logs:CreateExportTask - - logs:DescribeLogStreams - - logs:DescribeExportTasks - Resource: '*' - Effect: Allow - Condition: !If - - IsMultiRegion - - !Ref AWS::NoValue - - StringEquals: - aws:RequestedRegion: - - !Ref Region - ### ----------- ParallelClusterFunctionLogGroup: @@ -1027,342 +328,6 @@ Resources: LogGroupName: !Sub /aws/lambda/${ParallelClusterFunction} RetentionInDays: 30 - ImageBuilderInstanceRole: - Condition: DoNotUseCustomEcrImageUri - Type: AWS::IAM::Role - Properties: - ManagedPolicyArns: - - !Sub arn:${AWS::Partition}:iam::aws:policy/AmazonSSMManagedInstanceCore - - !Sub arn:${AWS::Partition}:iam::aws:policy/EC2InstanceProfileForImageBuilderECRContainerBuilds - AssumeRolePolicyDocument: - Statement: - - Action: - - sts:AssumeRole - Effect: Allow - Principal: - Service: - - !Sub 
ec2.${AWS::URLSuffix} - Version: '2012-10-17' - Path: /executionServiceEC2Role/ - - ImageBuilderInstanceProfile: - Condition: DoNotUseCustomEcrImageUri - Type: AWS::IAM::InstanceProfile - Properties: - Path: /executionServiceEC2Role/ - Roles: - - !Ref ImageBuilderInstanceRole - - InfrastructureConfigurationSecurityGroup: - Condition: NonDefaultVpc - Type: AWS::EC2::SecurityGroup - Properties: - VpcId: !Ref ImageBuilderVpcId - GroupDescription: ParallelCluster image builder security group - - InfrastructureConfiguration: - Condition: DoNotUseCustomEcrImageUri - Type: AWS::ImageBuilder::InfrastructureConfiguration - Properties: - Name: !Sub - - ParallelClusterImageBuilderInfrastructureConfiguration-${Version}-${StackIdSuffix} - - { Version: !Join ['_', !Split ['.', !FindInMap [ParallelCluster, Constants, Version]]], StackIdSuffix: !Select [2, !Split ['/', !Ref 'AWS::StackId']] } - InstanceProfileName: !Ref ImageBuilderInstanceProfile - TerminateInstanceOnFailure: true - SnsTopicArn: !Ref EcrImageBuilderSNSTopic - SubnetId: - Fn::If: - - NonDefaultVpc - - !Ref ImageBuilderSubnetId - - !Ref AWS::NoValue - SecurityGroupIds: - Fn::If: - - NonDefaultVpc - - [!Ref InfrastructureConfigurationSecurityGroup] - - !Ref AWS::NoValue - InstanceMetadataOptions: - HttpTokens: required - - PrivateEcrRepository: - Condition: DoNotUseCustomEcrImageUri - Type: AWS::ECR::Repository - Properties: - RepositoryName: !Sub - - 'aws-parallelcluster-${StackIdSuffix}' - - { StackIdSuffix: !Select [2, !Split ['/', !Ref 'AWS::StackId']] } - Tags: - - Key: 'parallelcluster:version' - Value: !FindInMap [ParallelCluster, Constants, Version] - - EcrImageRecipe: - Condition: DoNotUseCustomEcrImageUri - Type: AWS::ImageBuilder::ContainerRecipe - Properties: - Components: - - ComponentArn: !Sub arn:${AWS::Partition}:imagebuilder:${AWS::Region}:aws:component/update-linux/x.x.x - ContainerType: DOCKER - Name: !Sub - - 'ImportPublicEcrImage-${Version}-${StackIdSuffix}' - - { Version: !Join ['_', !Split ['.', !FindInMap [ParallelCluster, Constants, Version]]], StackIdSuffix: !Select [2, !Split ['/', !Ref 'AWS::StackId']] } - Version: !FindInMap [ParallelCluster, Constants, ShortVersion] - ParentImage: !Ref PublicEcrImageUri - PlatformOverride: Linux - TargetRepository: - Service: ECR - RepositoryName: !Ref PrivateEcrRepository - DockerfileTemplateData: 'FROM {{{ imagebuilder:parentImage }}}' - WorkingDirectory: '/tmp' - - EcrImage: - Condition: DoNotUseCustomEcrImageUri - Type: AWS::ImageBuilder::Image - Properties: - ContainerRecipeArn: !Ref EcrImageRecipe - EnhancedImageMetadataEnabled: true - InfrastructureConfigurationArn: !Ref InfrastructureConfiguration - ImageTestsConfiguration: - ImageTestsEnabled: false - - EcrImagePipeline: - Condition: DoNotUseCustomEcrImageUri - Type: AWS::ImageBuilder::ImagePipeline - Properties: - Name: !Sub - - 'EcrImagePipeline-${Version}-${StackIdSuffix}' - - { Version: !Join ['_', !Split ['.', !FindInMap [ParallelCluster, Constants, Version]]], StackIdSuffix: !Select [2, !Split ['/', !Ref 'AWS::StackId']] } - Status: ENABLED - ContainerRecipeArn: !Ref EcrImageRecipe - InfrastructureConfigurationArn: !Ref InfrastructureConfiguration - ImageTestsConfiguration: - ImageTestsEnabled: false - - EcrImageBuilderSNSTopic: - Condition: DoNotUseCustomEcrImageUri - Type: AWS::SNS::Topic - Properties: - DisplayName: "ParallelCluster ECR Image Builder SNS topic" - KmsMasterKeyId: alias/aws/sns - - UpdateParallelClusterLambda: - Condition: DoNotUseCustomEcrImageUri - Type: AWS::Serverless::Function - Properties: 
- FunctionName: !Sub - - UpdateParallelClusterLambda-${StackIdSuffix} - - { StackIdSuffix: !Select [2, !Split ['/', !Ref 'AWS::StackId']] } - MemorySize: 128 - InlineCode: | - import boto3 - import json - import os - - client = boto3.client('lambda') - - def handler(event, context): - for record in event['Records']: - print(event) - print('boto version {}'.format(boto3.__version__)) - - event_message = record['Sns']['Message'] - message_json = json.loads(event_message) - image_state = message_json['state']['status'] - - if image_state == 'AVAILABLE': - uri = message_json['outputResources']['containers'][0]['imageUris'][0] - function_to_update = os.environ['LambdaFunctionToUpdate'] - client.update_function_code(FunctionName=function_to_update, ImageUri=uri, Publish=True) - - Handler: index.handler - Runtime: python3.9 - Role: !GetAtt UpdateParallelClusterLambdaRole.Arn - Environment: - Variables: - LambdaFunctionToUpdate: !GetAtt ParallelClusterFunction.Arn - Events: - SNSTopicEvent: - Type: SNS - Properties: - Topic: !Ref EcrImageBuilderSNSTopic - - UpdateParallelClusterLambdaLogGroup: - Condition: DoNotUseCustomEcrImageUri - Type: AWS::Logs::LogGroup - Properties: - LogGroupName: !Sub - - /aws/lambda/UpdateParallelClusterLambda-${StackIdSuffix} - - { StackIdSuffix: !Select [2, !Split ['/', !Ref 'AWS::StackId']] } - - UpdateParallelClusterLambdaRole: - Condition: DoNotUseCustomEcrImageUri - Type: AWS::IAM::Role - Properties: - AssumeRolePolicyDocument: - Version: 2012-10-17 - Statement: - - Effect: Allow - Principal: - Service: lambda.amazonaws.com - Action: - - 'sts:AssumeRole' - Policies: - - PolicyName: LoggingPolicy - PolicyDocument: - Version: 2012-10-17 - Statement: - - Effect: Allow - Action: - - logs:CreateLogStream - - logs:PutLogEvents - Resource: !Sub - - arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/UpdateParallelClusterLambda-${StackIdSuffix}:* - - { StackIdSuffix: !Select [2, !Split ['/', !Ref 'AWS::StackId']] } - - PolicyName: UpdateParallelClusterLambdaPolicy - PolicyDocument: - Version: 2012-10-17 - Statement: - - Effect: Allow - Action: - - lambda:UpdateFunctionCode - Resource: !GetAtt ParallelClusterFunction.Arn - - EcrImageDeletionLambda: - Condition: DoNotUseCustomEcrImageUri - Type: AWS::Lambda::Function - Properties: - MemorySize: 128 - Code: - ZipFile: | - import cfnresponse - import boto3 - import random - import string - - ecr = boto3.client('ecr') - imagebuilder = boto3.client('imagebuilder') - - def get_image_ids(repository_name, version): - image_digests = set() - paginator = ecr.get_paginator('list_images') - response_iterator = paginator.paginate(repositoryName=repository_name, filter={'tagStatus': 'TAGGED'}) - for response in response_iterator: - image_digests.update([image_id['imageDigest'] for image_id in response['imageIds'] if f"{version}-" in image_id['imageTag']]) - return list({'imageDigest': image_digest} for image_digest in image_digests) - - def get_imagebuilder_images(ecr_image_pipeline_arn): - response = imagebuilder.list_image_pipeline_images(imagePipelineArn=ecr_image_pipeline_arn) - images = [image['arn'] for image in response['imageSummaryList']] - while 'nextToken' in response: - response = imagebuilder.list_image_pipeline_images(imagePipelineArn=ecr_image_pipeline_arn, nextToken=response['nextToken']) - images.extend([image['arn'] for image in response['imageSummaryList']]) - return images - - def create_physical_resource_id(): - alnum = string.ascii_uppercase + string.ascii_lowercase + string.digits - 
return ''.join(random.choice(alnum) for _ in range(16)) - - def handler(event, context): - print(event) - print('boto version {}'.format(boto3.__version__)) - - response_data = {} - reason = None - response_status = cfnresponse.SUCCESS - - if event['RequestType'] == 'Create': - response_data['Message'] = 'Resource creation successful!' - physical_resource_id = create_physical_resource_id() - else: - physical_resource_id = event['PhysicalResourceId'] - - if event['RequestType'] == 'Update' or event['RequestType'] == 'Delete': - try: - resource_key = 'OldResourceProperties' if 'OldResourceProperties' in event else 'ResourceProperties' - ecr_repository_name = event[resource_key]['EcrRepositoryName'] - ecr_image_pipeline_arn = event[resource_key]['EcrImagePipelineArn'] - version = event[resource_key]['Version'] - - image_ids = get_image_ids(ecr_repository_name, version) - if image_ids: - ecr.batch_delete_image(repositoryName=ecr_repository_name, imageIds=image_ids) - reason = 'Image deletion successful!' - else: - reason = 'No image found, considering image deletion successful' - - for imagebuilder_image in get_imagebuilder_images(ecr_image_pipeline_arn): - imagebuilder.delete_image(imageBuildVersionArn=imagebuilder_image) - - except ecr.exceptions.RepositoryNotFoundException: - reason = 'Repository was not found, considering image deletion successfull' - except Exception as exception: - response_status = cfnresponse.FAILED - reason = 'Failed image deletion with error: {}'.format(exception) - - cfnresponse.send(event, context, response_status, response_data, physical_resource_id, reason) - - Handler: index.handler - Runtime: python3.9 - Role: !GetAtt EcrImageDeletionLambdaRole.Arn - - EcrImageDeletionLambdaLogGroup: - Condition: DoNotUseCustomEcrImageUri - Type: AWS::Logs::LogGroup - Properties: - LogGroupName: !Sub /aws/lambda/${EcrImageDeletionLambda} - - EcrImageDeletionLambdaRole: - Condition: DoNotUseCustomEcrImageUri - Type: AWS::IAM::Role - Properties: - AssumeRolePolicyDocument: - Version: 2012-10-17 - Statement: - - Effect: Allow - Principal: - Service: lambda.amazonaws.com - Action: - - 'sts:AssumeRole' - Policies: - - PolicyName: LoggingPolicy - PolicyDocument: - Version: 2012-10-17 - Statement: - - Effect: Allow - Action: - - logs:CreateLogStream - - logs:PutLogEvents - Resource: !Sub arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/ParallelClusterApi-EcrImageDeletionLambda-* - - PolicyName: BatchDeletePolicy - PolicyDocument: - Version: 2012-10-17 - Statement: - - Effect: Allow - Action: - - ecr:BatchDeleteImage - - ecr:ListImages - Resource: !GetAtt PrivateEcrRepository.Arn - - Effect: Allow - Action: - - imagebuilder:ListImagePipelineImages - Resource: !Sub - - arn:${AWS::Partition}:imagebuilder:${Region}:${AWS::AccountId}:image-pipeline/ecrimagepipeline-*${StackIdSuffix}* - - { StackIdSuffix: !Select [2, !Split ['/', !Ref 'AWS::StackId']] } - - Effect: Allow - Action: - - imagebuilder:DeleteImage - Resource: !Sub - - arn:${AWS::Partition}:imagebuilder:${Region}:${AWS::AccountId}:image/*${StackIdSuffix}* - - { StackIdSuffix: !Select [2, !Split ['/', !Ref 'AWS::StackId']] } - - EcrImagesRemover: - Condition: DoNotUseCustomEcrImageUri - Type: Custom::EcrImagesRemover - Properties: - ServiceToken: !GetAtt EcrImageDeletionLambda.Arn - EcrRepositoryName: !Ref PrivateEcrRepository - Version: !FindInMap [ParallelCluster, Constants, ShortVersion] - EcrImagePipelineArn: !GetAtt EcrImagePipeline.Arn - - Outputs: ParallelClusterLambdaArn: Description: 'ARN 
of the ParallelCluster Lambda function' @@ -1387,16 +352,6 @@ Outputs: - !Ref ApiGatewayApiWithoutCustomDomain StageName: !FindInMap [ParallelCluster, Constants, Stage] - UriOfCopyOfPublicEcrImage: - Condition: DoNotUseCustomEcrImageUri - Description: 'Uri of the copy of the Public ParallelCluster API Lambda Container image' - Value: !Sub - - ${AWS::AccountId}.dkr.ecr.${AWS::Region}.${AWS::URLSuffix}/${Repository}:${Version} - - Repository: !Ref PrivateEcrRepository - Version: !Join - - '-' - - [!Select [2, !Split ['/', !Ref EcrImage]], !Select [3, !Split ['/', !Ref EcrImage]]] - ParallelClusterApiUserRole: Condition: CreateApiUserRoleCondition Export: @@ -1404,11 +359,6 @@ Outputs: Description: 'IAM Role with permissions to invoke the ParallelCluster API' Value: !GetAtt ParallelClusterApiUserRole.Arn - ParallelClusterDockerUpdateImagePipeline: - Condition: DoNotUseCustomEcrImageUri - Description: 'Image Builder pipeline that can be triggered to pull latest API Docker image for the deployed ParallelCluster version' - Value: !Ref EcrImagePipeline - ParallelClusterLambdaLogGroup: Value: !Ref ParallelClusterFunctionLogGroup Description: 'LogGroup for the Lambda function implementing ParallelCluster Api' diff --git a/api/spec/build-model.sh b/api/spec/build-model.sh index 1b3f27926e..0d8872e3f2 100755 --- a/api/spec/build-model.sh +++ b/api/spec/build-model.sh @@ -8,6 +8,7 @@ then exit 1 fi pushd smithy && ../../gradlew build && popd -GENERATED_YAML_PATH="smithy/build/smithyprojections/smithy/source/openapi/ParallelCluster.openapi.json" -./spec_overrides.sh "$GENERATED_YAML_PATH" -yq eval -P $GENERATED_YAML_PATH > openapi/ParallelCluster.openapi.yaml +GENERATED_JSON_PATH="smithy/build/smithyprojections/smithy/source/openapi/ParallelCluster.openapi.json" +./spec_overrides.sh "$GENERATED_JSON_PATH" +# Convert json into yaml +yq eval -P $GENERATED_JSON_PATH -o yaml > openapi/ParallelCluster.openapi.yaml diff --git a/awsbatch-cli/tox.ini b/awsbatch-cli/tox.ini index e534fdd62d..36e087b2d5 100644 --- a/awsbatch-cli/tox.ini +++ b/awsbatch-cli/tox.ini @@ -15,13 +15,10 @@ usedevelop = deps = -rtests/requirements.txt pytest-travis-fold - cov: codecov commands = nocov: pytest -n auto -l -v --basetemp={envtmpdir} --html=report.html --ignore=src tests/ cov: python setup.py clean --all build_ext --force --inplace cov: pytest -n auto -l -v --basetemp={envtmpdir} --html=report.html --cov=src --cov-report=xml --cov-append tests/ - # Disabling coverage report for awsbatch-cli because it's currently conflicting with pcluster cli report - # cov: codecov -e TOXENV # Section used to define common variables used by multiple testenvs. 
[vars] diff --git a/cli/src/pcluster/aws/aws_api.py b/cli/src/pcluster/aws/aws_api.py index 73c1d762dd..b49a384326 100644 --- a/cli/src/pcluster/aws/aws_api.py +++ b/cli/src/pcluster/aws/aws_api.py @@ -25,6 +25,7 @@ from pcluster.aws.s3 import S3Client from pcluster.aws.s3_resource import S3Resource from pcluster.aws.secretsmanager import SecretsManagerClient +from pcluster.aws.ssm import SsmClient from pcluster.aws.sts import StsClient @@ -58,6 +59,7 @@ def __init__(self): self._logs = None self._route53 = None self._secretsmanager = None + self._ssm = None self._resource_groups = None @property @@ -165,6 +167,13 @@ def secretsmanager(self): self._secretsmanager = SecretsManagerClient() return self._secretsmanager + @property + def ssm(self): + """SSM client.""" + if not self._ssm: + self._ssm = SsmClient() + return self._ssm + @property def resource_groups(self): """Resource Groups client.""" diff --git a/cli/src/pcluster/aws/cfn.py b/cli/src/pcluster/aws/cfn.py index 2ef8f411e2..2a1368346e 100644 --- a/cli/src/pcluster/aws/cfn.py +++ b/cli/src/pcluster/aws/cfn.py @@ -16,6 +16,7 @@ from pcluster.aws.aws_resources import StackInfo from pcluster.aws.common import AWSClientError, AWSExceptionHandler, Boto3Client, StackNotFoundError from pcluster.constants import PCLUSTER_IMAGE_ID_TAG, PCLUSTER_VERSION_TAG +from pcluster.utils import remove_none_values LOGGER = logging.getLogger(__name__) @@ -27,14 +28,21 @@ def __init__(self): super().__init__("cloudformation") @AWSExceptionHandler.handle_client_exception - def create_stack(self, stack_name: str, disable_rollback: bool, tags: list, template_body: str): + def create_stack( + self, stack_name: str, disable_rollback: bool, tags: list, template_body: str, parameters: list = None + ): """Create CFN stack by using the given template.""" + optional_args = { + "Tags": tags, + "Parameters": parameters, + } + optional_args_with_value = remove_none_values(optional_args) return self._client.create_stack( StackName=stack_name, TemplateBody=template_body, Capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM"], DisableRollback=disable_rollback, - Tags=tags, + **optional_args_with_value, ) @AWSExceptionHandler.handle_client_exception @@ -45,14 +53,21 @@ def create_stack_from_url( tags: list, template_url: str, capabilities: str = "CAPABILITY_IAM", + parameters: list = None, ): """Create CFN stack by using the given template url.""" + optional_args = { + "Tags": tags, + "Parameters": parameters, + } + optional_args_with_value = remove_none_values(optional_args) + return self._client.create_stack( StackName=stack_name, TemplateURL=template_url, Capabilities=[capabilities, "CAPABILITY_NAMED_IAM"], DisableRollback=disable_rollback, - Tags=tags, + **optional_args_with_value, ) @AWSExceptionHandler.handle_client_exception @@ -71,19 +86,19 @@ def update_stack(self, stack_name: str, updated_template: str, params: list): ) @AWSExceptionHandler.handle_client_exception - def update_stack_from_url(self, stack_name: str, template_url: str, tags: list = None): + def update_stack_from_url(self, stack_name: str, template_url: str, tags: list = None, parameters: list = None): """Update CFN stack by using the given template url.""" - if tags is None: - return self._client.update_stack( - StackName=stack_name, - TemplateURL=template_url, - Capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM"], - ) + optional_args = { + "Tags": tags, + "Parameters": parameters, + } + optional_args_with_value = remove_none_values(optional_args) + return self._client.update_stack( 
StackName=stack_name, TemplateURL=template_url, Capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM"], - Tags=tags, + **optional_args_with_value, ) @AWSExceptionHandler.handle_client_exception diff --git a/cli/src/pcluster/aws/ssm.py b/cli/src/pcluster/aws/ssm.py new file mode 100644 index 0000000000..b06af69ddb --- /dev/null +++ b/cli/src/pcluster/aws/ssm.py @@ -0,0 +1,28 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +from pcluster.aws.common import AWSExceptionHandler, Boto3Client + + +class SsmClient(Boto3Client): + """SSM Boto3 client.""" + + def __init__(self): + super().__init__("ssm") + + @AWSExceptionHandler.handle_client_exception + def get_parameter(self, name: str): + """ + Retrieve a Parameter. + + :param name: Parameter name. + :return: Parameter info + """ + return self._client.get_parameter(Name=name) diff --git a/cli/src/pcluster/cli/commands/configure/easyconfig.py b/cli/src/pcluster/cli/commands/configure/easyconfig.py index d2da397e7f..524161afd5 100644 --- a/cli/src/pcluster/cli/commands/configure/easyconfig.py +++ b/cli/src/pcluster/cli/commands/configure/easyconfig.py @@ -39,7 +39,8 @@ from pcluster.constants import ( DEFAULT_MAX_COUNT, DEFAULT_MIN_COUNT, - MAX_NUMBER_OF_COMPUTE_RESOURCES, + MAX_COMPUTE_RESOURCES_PER_QUEUE, + MAX_NUMBER_OF_COMPUTE_RESOURCES_PER_CLUSTER, MAX_NUMBER_OF_QUEUES, SUPPORTED_SCHEDULERS, ) @@ -99,12 +100,13 @@ def _get_vpcs_and_subnets(): def _get_subnets(conn, vpc_id): subnet_options = [] - subnet_list = conn.describe_subnets( - Filters=[ - {"Name": "vpcId", "Values": [vpc_id]}, - {"Name": "ipv6-native", "Values": ["false"]}, - ] - ).get("Subnets") + subnet_filters = [{"Name": "vpcId", "Values": [vpc_id]}] + # US isolated regions do not support IPv6. + # Subnets in these regions do not have the field "Ipv6Native", so + # applying the filter ipv6-native=false would make the DescribeSubnets call to always return an empty set. 
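A note on the create_stack/create_stack_from_url/update_stack_from_url changes earlier in this diff: they rely on the existing pcluster.utils.remove_none_values helper so that Tags and Parameters are forwarded to boto3 only when the caller actually supplied them. A minimal sketch of that pattern, not part of the patch (the helper body is assumed from its name and its usage here):

def remove_none_values(dictionary: dict) -> dict:
    """Drop keys whose value is None so they are never passed as boto3 keyword arguments."""
    return {key: value for key, value in dictionary.items() if value is not None}

# Hypothetical values: only Tags survives, Parameters is dropped instead of being sent as None.
optional_args = {"Tags": [{"Key": "team", "Value": "hpc"}], "Parameters": None}
print(remove_none_values(optional_args))  # {'Tags': [{'Key': 'team', 'Value': 'hpc'}]}

This avoids calls such as update_stack(..., Tags=None), which botocore parameter validation would reject.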
+ if not conn.meta.region_name.startswith("us-iso"): + subnet_filters.append({"Name": "ipv6-native", "Values": ["false"]}) + subnet_list = conn.describe_subnets(Filters=subnet_filters).get("Subnets") for subnet in subnet_list: subnet_options.append( OrderedDict( @@ -167,7 +169,7 @@ def configure(args): # noqa: C901 number_of_queues = int( prompt( "Number of queues", - lambda x: str(x).isdigit() and int(x) >= 1 and int(x) <= MAX_NUMBER_OF_QUEUES, + lambda x: str(x).isdigit() and 1 <= int(x) <= MAX_NUMBER_OF_QUEUES, default_value=1, ) ) @@ -194,10 +196,13 @@ def configure(args): # noqa: C901 if scheduler == "awsbatch": number_of_compute_resources = 1 else: + queue_limit = min( + (MAX_NUMBER_OF_COMPUTE_RESOURCES_PER_CLUSTER / number_of_queues), MAX_COMPUTE_RESOURCES_PER_QUEUE + ) number_of_compute_resources = int( prompt( f"Number of compute resources for {queue_name}", - validator=lambda x: str(x).isdigit() and int(x) >= 1 and int(x) <= MAX_NUMBER_OF_COMPUTE_RESOURCES, + validator=lambda x, q=queue_limit: str(x).isdigit() and 1 <= int(x) <= q, default_value=1, ) ) @@ -360,7 +365,17 @@ def _create_vpc_parameters(scheduler, head_node_instance_type, compute_instance_ if ( not qualified_head_node_subnets or not qualified_compute_subnets - or (prompt("Automate Subnet creation? (y/n)", lambda x: x in ("y", "n"), default_value="y") == "y") + or ( + prompt( + "The creation of a public and private subnet combination will result in\n" + "charges for NAT gateway creation that are not covered under the free tier.\n" + "Please refer to https://aws.amazon.com/vpc/pricing/ for more details.\n" + "Automate Subnet creation? (y/n)", + lambda x: x in ("y", "n"), + default_value="y", + ) + == "y" + ) ): # Start auto subnets creation in the absence of qualified subnets. # Otherwise, user selects between manual and automate subnets creation diff --git a/cli/src/pcluster/cli/commands/dcv_connect.py b/cli/src/pcluster/cli/commands/dcv_connect.py index 9c3b452b94..af5cc521e4 100644 --- a/cli/src/pcluster/cli/commands/dcv_connect.py +++ b/cli/src/pcluster/cli/commands/dcv_connect.py @@ -121,7 +121,7 @@ def _retrieve_dcv_session_url(ssh_cmd, cluster_name, head_node_ip): ) -def _retry(func, func_args, attempts=1, wait=0): +def _retry(func, func_args, attempts=1, wait=0): # pylint: disable=R1710 """ Call function and re-execute it if it raises an Exception. 
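A worked example of the per-queue limit the configure wizard computes above, not part of the patch (the constants match the values introduced later in this diff in cli/src/pcluster/constants.py):

MAX_NUMBER_OF_COMPUTE_RESOURCES_PER_CLUSTER = 150
MAX_COMPUTE_RESOURCES_PER_QUEUE = 40

for number_of_queues in (1, 5, 100):
    queue_limit = min(
        MAX_NUMBER_OF_COMPUTE_RESOURCES_PER_CLUSTER / number_of_queues,
        MAX_COMPUTE_RESOURCES_PER_QUEUE,
    )
    print(number_of_queues, queue_limit)  # 1 -> 40, 5 -> 30.0, 100 -> 1.5

With a single queue the prompt accepts up to 40 compute resources, while with 100 queues the validator (1 <= int(x) <= queue_limit) effectively allows only 1 per queue.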
diff --git a/cli/src/pcluster/cli/commands/dcv_util.py b/cli/src/pcluster/cli/commands/dcv_util.py index 8023a10dcf..d017743d03 100644 --- a/cli/src/pcluster/cli/commands/dcv_util.py +++ b/cli/src/pcluster/cli/commands/dcv_util.py @@ -16,6 +16,6 @@ def get_supported_dcv_os(architecture): """Return a list of all the operating system supported by DCV.""" architectures_dict = { "x86_64": SUPPORTED_OSES, - "arm64": ["ubuntu1804", "alinux2", "centos7"], + "arm64": ["ubuntu1804", "alinux2", "centos7", "rhel8"], } return architectures_dict.get(architecture, []) diff --git a/cli/src/pcluster/config/cluster_config.py b/cli/src/pcluster/config/cluster_config.py index 248b9d47d7..bc20c4f5d2 100644 --- a/cli/src/pcluster/config/cluster_config.py +++ b/cli/src/pcluster/config/cluster_config.py @@ -36,20 +36,25 @@ DEFAULT_MAX_COUNT, DEFAULT_MIN_COUNT, DELETE_POLICY, + DETAILED_MONITORING_ENABLED_DEFAULT, EBS_VOLUME_SIZE_DEFAULT, EBS_VOLUME_TYPE_DEFAULT, + EBS_VOLUME_TYPE_DEFAULT_US_ISO, EBS_VOLUME_TYPE_IOPS_DEFAULT, LUSTRE, + MAX_COMPUTE_RESOURCES_PER_QUEUE, MAX_EBS_COUNT, MAX_EXISTING_STORAGE_COUNT, MAX_NEW_STORAGE_COUNT, - MAX_NUMBER_OF_COMPUTE_RESOURCES, + MAX_NUMBER_OF_COMPUTE_RESOURCES_PER_CLUSTER, MAX_NUMBER_OF_QUEUES, NODE_BOOTSTRAP_TIMEOUT, ONTAP, OPENZFS, SCHEDULER_PLUGIN_INTERFACE_VERSION, SCHEDULER_PLUGIN_INTERFACE_VERSION_LOW_RANGE, + SCHEDULER_PLUGIN_MAX_NUMBER_OF_COMPUTE_RESOURCES, + SCHEDULER_PLUGIN_MAX_NUMBER_OF_QUEUES, SUPPORTED_OSES, Feature, ) @@ -94,10 +99,12 @@ ManagedFsxMultiAzValidator, MaxCountValidator, MixedSecurityGroupOverwriteValidator, + MultiNetworkInterfacesInstancesValidator, NameValidator, NumberOfStorageValidator, OverlappingMountDirValidator, RegionValidator, + RootVolumeEncryptionConsistencyValidator, RootVolumeSizeValidator, SchedulableMemoryValidator, SchedulerOsValidator, @@ -106,7 +113,7 @@ SharedStorageNameValidator, UnmanagedFsxMultiAzValidator, ) -from pcluster.validators.common import ValidatorContext +from pcluster.validators.common import ValidatorContext, get_async_timed_validator_type_for from pcluster.validators.database_validators import DatabaseUriValidator from pcluster.validators.directory_service_validators import ( AdditionalSssdConfigsValidator, @@ -165,7 +172,7 @@ InstancesNetworkingValidator, ) from pcluster.validators.kms_validators import KmsKeyIdEncryptedValidator, KmsKeyValidator -from pcluster.validators.monitoring_validators import LogRotationValidator +from pcluster.validators.monitoring_validators import DetailedMonitoringValidator, LogRotationValidator from pcluster.validators.networking_validators import ( ElasticIpValidator, MultiAzPlacementGroupValidator, @@ -189,6 +196,14 @@ SupportedVersionsValidator, UserNameValidator, ) +from pcluster.validators.slurm_settings_validator import ( + SLURM_SETTINGS_DENY_LIST, + CustomSlurmNodeNamesValidator, + CustomSlurmSettingLevel, + CustomSlurmSettingsIncludeFileOnlyValidator, + CustomSlurmSettingsValidator, +) +from pcluster.validators.tags_validators import ComputeResourceTagsValidator LOGGER = logging.getLogger(__name__) @@ -210,7 +225,10 @@ def __init__( ): super().__init__(**kwargs) self.encrypted = Resource.init_param(encrypted, default=True) - self.volume_type = Resource.init_param(volume_type, default=EBS_VOLUME_TYPE_DEFAULT) + self.volume_type = Resource.init_param( + volume_type, + default=EBS_VOLUME_TYPE_DEFAULT_US_ISO if get_region().startswith("us-iso") else EBS_VOLUME_TYPE_DEFAULT, + ) self.iops = Resource.init_param(iops, default=EBS_VOLUME_TYPE_IOPS_DEFAULT.get(self.volume_type)) 
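The volume_type default above is now region dependent. A minimal sketch of how it resolves, not part of the patch (get_region() is the existing pcluster helper; the regions below are only examples, and the "us-iso" prefix also matches us-isob regions):

EBS_VOLUME_TYPE_DEFAULT = "gp3"
EBS_VOLUME_TYPE_DEFAULT_US_ISO = "gp2"

def default_volume_type(region: str) -> str:
    # Mirror the conditional used in the Ebs resource: gp2 in the US isolated partitions, gp3 elsewhere.
    return EBS_VOLUME_TYPE_DEFAULT_US_ISO if region.startswith("us-iso") else EBS_VOLUME_TYPE_DEFAULT

assert default_volume_type("us-iso-east-1") == "gp2"
assert default_volume_type("us-isob-east-1") == "gp2"
assert default_volume_type("eu-west-1") == "gp3"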
self.throughput = Resource.init_param(throughput, default=125 if self.volume_type == "gp3" else None) @@ -231,7 +249,14 @@ class RootVolume(Ebs): def __init__(self, size: int = None, delete_on_termination: bool = None, **kwargs): super().__init__(**kwargs) - self.size = Resource.init_param(size) + # When the RootVolume size is None, EC2 implicitly sets it as the AMI size. + # In US Isolated regions, the root volume size cannot be left unspecified, + # so we consider it as the default EBS volume size. + # In theory, the default value should be maximum between the default EBS volume size (35GB) and the AMI size, + # but in US Isolated region this is fine because the only supported AMI as of 2023 Feb + # is the official ParallelCluster AMI for Amazon Linux 2, which has size equal to + # the default EBS volume size (35GB). + self.size = Resource.init_param(size, EBS_VOLUME_SIZE_DEFAULT if get_region().startswith("us-iso") else None) # The default delete_on_termination takes effect both on head and compute nodes. # If the default of the head node is to be changed, please separate this class for different defaults. self.delete_on_termination = Resource.init_param(delete_on_termination, default=True) @@ -744,6 +769,25 @@ def __init__(self, enabled: bool = None, gdr_support: bool = None, **kwargs): self.gdr_support = Resource.init_param(gdr_support, default=False) +# ---------------------- Health Checks ---------------------- # + + +class GpuHealthCheck(Resource): + """Represent the configuration for the GPU Health Check.""" + + def __init__(self, enabled: bool = None, **kwargs): + super().__init__(**kwargs) + self.enabled = enabled + + +class HealthChecks(Resource): + """Represent the health checks configuration.""" + + def __init__(self, gpu: GpuHealthCheck = None, **kwargs): + super().__init__(**kwargs) + self.gpu = gpu or GpuHealthCheck(implied=True) + + # ---------------------- Monitoring ---------------------- # @@ -798,10 +842,13 @@ class Monitoring(Resource): def __init__(self, detailed_monitoring: bool = None, logs: Logs = None, dashboards: Dashboards = None, **kwargs): super().__init__(**kwargs) - self.detailed_monitoring = Resource.init_param(detailed_monitoring, default=False) + self.detailed_monitoring = Resource.init_param(detailed_monitoring, default=DETAILED_MONITORING_ENABLED_DEFAULT) self.logs = logs or Logs(implied=True) self.dashboards = dashboards or Dashboards(implied=True) + def _register_validators(self, context: ValidatorContext = None): # noqa: D102 #pylint: disable=unused-argument + self._register_validator(DetailedMonitoringValidator, is_detailed_monitoring_enabled=self.detailed_monitoring) + # ---------------------- Others ---------------------- # @@ -945,7 +992,11 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: D102 DomainAddrValidator, domain_addr=self.domain_addr, additional_sssd_configs=self.additional_sssd_configs ) if self.password_secret_arn: - self._register_validator(PasswordSecretArnValidator, password_secret_arn=self.password_secret_arn) + self._register_validator( + PasswordSecretArnValidator, + password_secret_arn=self.password_secret_arn, + region=get_region(), + ) if self.ldap_tls_req_cert: self._register_validator(LdapTlsReqCertValidator, ldap_tls_reqcert=self.ldap_tls_req_cert) if self.additional_sssd_configs: @@ -1084,7 +1135,7 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: D102 class CustomAction(Resource): - """Represent a custom action resource.""" + """Represent a custom action script 
resource.""" def __init__(self, script: str, args: List[str] = None): super().__init__() @@ -1092,7 +1143,7 @@ def __init__(self, script: str, args: List[str] = None): self.args = Resource.init_param(args) def _register_validators(self, context: ValidatorContext = None): # noqa: D102 #pylint: disable=unused-argument - self._register_validator(UrlValidator, url=self.script) + self._register_validator(get_async_timed_validator_type_for(UrlValidator), url=self.script, timeout=5) class CustomActions(Resource): @@ -1100,9 +1151,9 @@ class CustomActions(Resource): def __init__( self, - on_node_start: CustomAction = None, - on_node_configured: CustomAction = None, - on_node_updated: CustomAction = None, + on_node_start=None, + on_node_configured=None, + on_node_updated=None, ): super().__init__() self.on_node_start = Resource.init_param(on_node_start) @@ -1309,7 +1360,7 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: D102 head_node=self.head_node, os=self.image.os, ami_id=self.head_node_ami, - tags=self.get_cluster_tags(), + tags=self.get_tags(), imds_support=self.imds.imds_support, ) if self.head_node.dcv: @@ -1651,6 +1702,11 @@ def is_cw_dashboard_enabled(self): else False ) + @property + def is_detailed_monitoring_enabled(self): + """Return True if Detailed Monitoring is enabled.""" + return self.monitoring.detailed_monitoring + @property def is_dcv_enabled(self): """Return True if DCV is enabled.""" @@ -1733,7 +1789,7 @@ def lambda_functions_vpc_config(self): """Return the vpc config of the PCluster Lambda Functions or None.""" return self.deployment_settings.lambda_functions_vpc_config if self.deployment_settings else None - def get_cluster_tags(self): + def get_tags(self): """Return tags configured in the cluster configuration.""" return self.tags @@ -1856,6 +1912,9 @@ def __init__( schedulable_memory: int = None, capacity_reservation_target: CapacityReservationTarget = None, networking: SlurmComputeResourceNetworking = None, + health_checks: HealthChecks = None, + custom_slurm_settings: Dict = None, + tags: List[Tag] = None, **kwargs, ): super().__init__(**kwargs) @@ -1871,6 +1930,9 @@ def __init__( self._instance_types_with_instance_storage = [] self._instance_type_info_map = {} self.networking = networking or SlurmComputeResourceNetworking(implied=True) + self.health_checks = health_checks or HealthChecks(implied=True) + self.custom_slurm_settings = Resource.init_param(custom_slurm_settings, default={}) + self.tags = tags @staticmethod def fetch_instance_type_info(instance_type) -> InstanceTypeInfo: @@ -1921,6 +1983,10 @@ def is_ebs_optimized(self) -> bool: def instance_types(self) -> List[str]: pass + def get_tags(self): + """Return tags configured in the slurm compute resource configuration.""" + return self.tags + class FlexibleInstanceType(Resource): """Represent an instance type listed in the Instances of a ComputeResources.""" @@ -1933,7 +1999,11 @@ def __init__(self, instance_type: str, **kwargs): class SlurmFlexibleComputeResource(_BaseSlurmComputeResource): """Represents a Slurm Compute Resource with Multiple Instance Types.""" - def __init__(self, instances: List[FlexibleInstanceType], **kwargs): + def __init__( + self, + instances: List[FlexibleInstanceType], + **kwargs, + ): super().__init__(**kwargs) self.instances = Resource.init_param(instances) @@ -1970,7 +2040,11 @@ def max_network_interface_count(self) -> int: class SlurmComputeResource(_BaseSlurmComputeResource): """Represents a Slurm Compute Resource with a Single Instance Type.""" - def 
__init__(self, instance_type, **kwargs): + def __init__( + self, + instance_type, + **kwargs, + ): super().__init__(**kwargs) self.instance_type = Resource.init_param(instance_type) self.__instance_type_info = None @@ -2081,6 +2155,10 @@ def multi_az_enabled(self): """Return true if more than one AZ are defined in the queue Networking section.""" return len(self.networking.az_list) > 1 + def get_tags(self): + """Return tags configured in the queue configuration.""" + return None + def get_managed_placement_group_keys(self) -> List[str]: managed_placement_group_keys = [] for compute_resource in self.compute_resources: @@ -2159,9 +2237,15 @@ class SlurmQueue(_CommonQueue): def __init__( self, allocation_strategy: str = None, + custom_slurm_settings: Dict = None, + health_checks: HealthChecks = None, + tags: List[Tag] = None, **kwargs, ): super().__init__(**kwargs) + self.health_checks = health_checks or HealthChecks(implied=True) + self.custom_slurm_settings = Resource.init_param(custom_slurm_settings, default={}) + self.tags = tags if any( isinstance(compute_resource, SlurmFlexibleComputeResource) for compute_resource in self.compute_resources ): @@ -2187,6 +2271,10 @@ def instance_types_with_instance_storage(self): result.update(compute_resource.instance_types_with_instance_storage) return result + def get_tags(self): + """Return tags configured in the slurm queue configuration.""" + return self.tags + def _register_validators(self, context: ValidatorContext = None): super()._register_validators(context) self._register_validator( @@ -2197,8 +2285,8 @@ def _register_validators(self, context: ValidatorContext = None): self._register_validator( MaxCountValidator, resources_length=len(self.compute_resources), - max_length=MAX_NUMBER_OF_COMPUTE_RESOURCES, - resource_name="ComputeResources", + max_length=MAX_COMPUTE_RESOURCES_PER_QUEUE, + resource_name="ComputeResources per Queue", ) self._register_validator( QueueSubnetsValidator, @@ -2212,6 +2300,13 @@ def _register_validators(self, context: ValidatorContext = None): queue_name=self.name, subnet_ids=self.networking.subnet_ids, ) + if self.custom_slurm_settings: + self._register_validator( + CustomSlurmSettingsValidator, + custom_settings=[self.custom_slurm_settings], + deny_list=SLURM_SETTINGS_DENY_LIST["Queue"]["Global"], + settings_level=CustomSlurmSettingLevel.QUEUE, + ) for compute_resource in self.compute_resources: self._register_validator( EfaSecurityGroupValidator, @@ -2229,6 +2324,13 @@ def _register_validators(self, context: ValidatorContext = None): is False, multi_az_enabled=self.multi_az_enabled, ) + if compute_resource.custom_slurm_settings: + self._register_validator( + CustomSlurmSettingsValidator, + custom_settings=[compute_resource.custom_slurm_settings], + deny_list=SLURM_SETTINGS_DENY_LIST["ComputeResource"]["Global"], + settings_level=CustomSlurmSettingLevel.COMPUTE_RESOURCE, + ) for instance_type in compute_resource.instance_types: self._register_validator( CapacityTypeValidator, @@ -2265,10 +2367,16 @@ def __init__( self.password_secret_arn = Resource.init_param(password_secret_arn) def _register_validators(self, context: ValidatorContext = None): # noqa: D102 #pylint: disable=unused-argument + region = get_region() + self._register_validator(FeatureRegionValidator, feature=Feature.SLURM_DATABASE, region=region) if self.uri: self._register_validator(DatabaseUriValidator, uri=self.uri) if self.password_secret_arn: - self._register_validator(PasswordSecretArnValidator, password_secret_arn=self.password_secret_arn) + 
self._register_validator( + PasswordSecretArnValidator, + password_secret_arn=self.password_secret_arn, + region=region, + ) class SlurmSettings(Resource): @@ -2281,6 +2389,8 @@ def __init__( queue_update_strategy: str = None, enable_memory_based_scheduling: bool = None, database: Database = None, + custom_slurm_settings: List[Dict] = None, + custom_slurm_settings_include_file: str = None, **kwargs, ): super().__init__() @@ -2291,6 +2401,33 @@ def __init__( ) self.enable_memory_based_scheduling = Resource.init_param(enable_memory_based_scheduling, default=False) self.database = database + self.custom_slurm_settings = Resource.init_param(custom_slurm_settings) + self.custom_slurm_settings_include_file = Resource.init_param(custom_slurm_settings_include_file) + + def _register_validators(self, context: ValidatorContext = None): + super()._register_validators(context) + if self.custom_slurm_settings: # if not empty register validator + self._register_validator( + CustomSlurmSettingsValidator, + custom_settings=self.custom_slurm_settings, + deny_list=SLURM_SETTINGS_DENY_LIST["SlurmConf"]["Global"], + settings_level=CustomSlurmSettingLevel.SLURM_CONF, + ) + self._register_validator(CustomSlurmNodeNamesValidator, custom_settings=self.custom_slurm_settings) + if self.database: + self._register_validator( + CustomSlurmSettingsValidator, + custom_settings=self.custom_slurm_settings, + deny_list=SLURM_SETTINGS_DENY_LIST["SlurmConf"]["Accounting"], + settings_level=CustomSlurmSettingLevel.SLURM_CONF, + ) + if self.custom_slurm_settings_include_file: + self._register_validator(UrlValidator, url=self.custom_slurm_settings_include_file) + self._register_validator( + CustomSlurmSettingsIncludeFileOnlyValidator, + custom_settings=self.custom_slurm_settings, + include_file_url=self.custom_slurm_settings_include_file, + ) class QueueUpdateStrategy(Enum): @@ -2314,12 +2451,24 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: D102 self._register_validator( DuplicateNameValidator, name_list=[queue.name for queue in self.queues], resource_name="Queue" ) + self._register_validator( + RootVolumeEncryptionConsistencyValidator, + encryption_settings=[ + (queue.name, queue.compute_settings.local_storage.root_volume.encrypted) for queue in self.queues + ], + ) self._register_validator( MaxCountValidator, resources_length=len(self.queues), max_length=MAX_NUMBER_OF_QUEUES, resource_name="SlurmQueues", ) + self._register_validator( + MaxCountValidator, + resources_length=sum(len(queue.compute_resources) for queue in self.queues), + max_length=MAX_NUMBER_OF_COMPUTE_RESOURCES_PER_CLUSTER, + resource_name="ComputeResources per Cluster", + ) class SchedulerPluginQueue(_CommonQueue): @@ -2340,6 +2489,12 @@ def _register_validators(self, context: ValidatorContext = None): name_list=[compute_resource.name for compute_resource in self.compute_resources], resource_name="Compute resource", ) + self._register_validator( + MaxCountValidator, + resources_length=len(self.compute_resources), + max_length=SCHEDULER_PLUGIN_MAX_NUMBER_OF_COMPUTE_RESOURCES, + resource_name="ComputeResources per Queue", + ) self._register_validator( QueueSubnetsValidator, queue_name=self.name, @@ -2402,7 +2557,7 @@ class SchedulerPluginQueueConstraints(Resource): def __init__(self, max_count: int = None, **kwargs): super().__init__(**kwargs) - self.max_count = Resource.init_param(max_count, default=MAX_NUMBER_OF_QUEUES) + self.max_count = Resource.init_param(max_count, default=SCHEDULER_PLUGIN_MAX_NUMBER_OF_QUEUES) class 
SchedulerPluginComputeResourceConstraints(Resource): @@ -2410,7 +2565,7 @@ class SchedulerPluginComputeResourceConstraints(Resource): def __init__(self, max_count: int = None, **kwargs): super().__init__(**kwargs) - self.max_count = Resource.init_param(max_count, default=MAX_NUMBER_OF_COMPUTE_RESOURCES) + self.max_count = Resource.init_param(max_count, default=SCHEDULER_PLUGIN_MAX_NUMBER_OF_COMPUTE_RESOURCES) class SchedulerPluginRequirements(Resource): @@ -2681,7 +2836,7 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: D102 max_length=get_attr( self.settings.scheduler_definition, "requirements.queue_constraints.max_count", - default=MAX_NUMBER_OF_QUEUES, + default=SCHEDULER_PLUGIN_MAX_NUMBER_OF_QUEUES, ), resource_name="SchedulerQueues", ) @@ -2692,7 +2847,7 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: D102 max_length=get_attr( self.settings.scheduler_definition, "requirements.compute_resource_constraints.max_count", - default=MAX_NUMBER_OF_COMPUTE_RESOURCES, + default=SCHEDULER_PLUGIN_MAX_NUMBER_OF_COMPUTE_RESOURCES, ), resource_name="ComputeResources", ) @@ -2704,16 +2859,18 @@ class CommonSchedulerClusterConfig(BaseClusterConfig): def _register_validators(self, context: ValidatorContext = None): super()._register_validators(context) checked_images = [] - for queue in self.scheduling.queues: + for index, queue in enumerate(self.scheduling.queues): queue_image = self.image_dict[queue.name] - self._register_validator( - ComputeResourceLaunchTemplateValidator, - queue=queue, - ami_id=queue_image, - os=self.image.os, - tags=self.get_cluster_tags(), - imds_support=self.imds.imds_support, - ) + if index == 0: + # Execute LaunchTemplateValidator only for the first queue + self._register_validator( + ComputeResourceLaunchTemplateValidator, + queue=queue, + ami_id=queue_image, + os=self.image.os, + tags=self.get_tags(), + imds_support=self.imds.imds_support, + ) ami_volume_size = AWSApi.instance().ec2.describe_image(queue_image).volume_size root_volume = queue.compute_settings.local_storage.root_volume root_volume_size = root_volume.size @@ -2836,6 +2993,14 @@ def all_relevant_capacity_reservation_ids(self): ) return list(capacity_reservation_ids) + @property + def has_custom_actions_in_queue(self): + """Return True if any queues have custom scripts.""" + for queue in self.scheduling.queues: + if queue.custom_actions: + return True + return False + class SchedulerPluginClusterConfig(CommonSchedulerClusterConfig): """Represent the full Scheduler Plugin Cluster configuration.""" @@ -2864,7 +3029,7 @@ def get_instance_types_data(self): result[instance_type_info.instance_type()] = instance_type_info.instance_type_data return result - def get_cluster_tags(self): + def get_tags(self): """Return tags configured in the root of the cluster config and under scheduler definition.""" return (self.tags if self.tags else []) + get_attr( self.scheduling, "settings.scheduler_definition.tags", default=[] @@ -2945,6 +3110,7 @@ def _register_validators(self, context: ValidatorContext = None): ) instance_types_data = self.get_instance_types_data() + self._register_validator(MultiNetworkInterfacesInstancesValidator, queues=self.scheduling.queues) for queue in self.scheduling.queues: for compute_resource in queue.compute_resources: if self.scheduling.settings.enable_memory_based_scheduling: @@ -2989,6 +3155,14 @@ def _register_validators(self, context: ValidatorContext = None): ] for validator in flexible_instance_types_validators: 
self._register_validator(validator, **validator_args) + self._register_validator( + ComputeResourceTagsValidator, + queue_name=queue.name, + compute_resource_name=compute_resource.name, + cluster_tags=self.get_tags(), + queue_tags=queue.get_tags(), + compute_resource_tags=compute_resource.get_tags(), + ) @property def image_dict(self): diff --git a/cli/src/pcluster/config/common.py b/cli/src/pcluster/config/common.py index 2241720064..399b0ad61c 100644 --- a/cli/src/pcluster/config/common.py +++ b/cli/src/pcluster/config/common.py @@ -12,12 +12,14 @@ # This module contains all the classes representing the Resources objects. # These objects are obtained from the configuration file through a conversion based on the Schema classes. # +import asyncio +import itertools import json import logging from abc import ABC, abstractmethod from typing import List, Set -from pcluster.validators.common import FailureLevel, ValidationResult, Validator, ValidatorContext +from pcluster.validators.common import AsyncValidator, FailureLevel, ValidationResult, Validator, ValidatorContext from pcluster.validators.iam_validators import AdditionalIamPolicyValidator from pcluster.validators.networking_validators import LambdaFunctionsVpcConfigValidator from pcluster.validators.s3_validators import UrlValidator @@ -122,6 +124,7 @@ def __repr__(self): def __init__(self, implied: bool = False): # Parameters registry self.__params = {} + self._validation_futures = [] self._validation_failures: List[ValidationResult] = [] self._validators: List = [] self.implied = implied @@ -167,17 +170,40 @@ def init_param(value, default=None, update_policy=None): """Create a resource attribute backed by a Configuration Parameter.""" return Resource.Param(value, default=default, update_policy=update_policy) - def _validator_execute(self, validator_class, validator_args, suppressors): + @staticmethod + def _validator_execute(validator_class, validator_args, suppressors, validation_executor): validator = validator_class() + if any(suppressor.suppress_validator(validator) for suppressor in (suppressors or [])): LOGGER.debug("Suppressing validator %s", validator_class.__name__) return [] + LOGGER.debug("Executing validator %s", validator_class.__name__) + return validation_executor(validator_args, validator) + + @staticmethod + def _validator_execute_sync(validator_args, validator): try: return validator.execute(**validator_args) except Exception as e: + LOGGER.debug("Validator %s unexpected failure: %s", validator.type, e) return [ValidationResult(str(e), FailureLevel.ERROR, validator.type)] + @staticmethod + def _validator_execute_async(validator_args, validator): + return validator.execute_async(**validator_args) + + def _await_async_validators(self): + # here could be a good spot to add a cascading timeout for the async validators + # if they are taking too long to execute for a resource and its children since the use of + # get_async_timed_validator_type_for allows to decorate only on single validator at a time and + # does not cascade to child resources + return list( + itertools.chain.from_iterable( + asyncio.get_event_loop().run_until_complete(asyncio.gather(*self._validation_futures)) + ) + ) + def _nested_resources(self): nested_resources = [] for _, value in self.__dict__.items(): @@ -188,34 +214,72 @@ def _nested_resources(self): return nested_resources def validate( - self, suppressors: List[ValidatorSuppressor] = None, context: ValidatorContext = None - ) -> List[ValidationResult]: - """Execute registered validators.""" - # 
Cleanup failures and validators + self, suppressors: List[ValidatorSuppressor] = None, context: ValidatorContext = None, nested: bool = False + ): + """ + Execute registered validators. + + The "nested" parameter is used only for internal recursive calls to distinguish those from the top level + one where the async validators results should be awaited for. + """ + # this validation logic is a responsibility that could be completely separated from the resource tree + # also until we need to support both sync and async validation this logic will be unnecessarily complex + # embracing async validation completely is possible and will greatly simplify this + self._validation_futures.clear() self._validation_failures.clear() + try: + self._validate_nested_resources(context, suppressors) + self._validate_self(context, suppressors) + finally: + if nested: + result = self._validation_failures, self._validation_futures.copy() + else: + self._validation_failures.extend(self._await_async_validators()) + result = self._validation_failures + self._validation_futures.clear() + + return result + + def _validate_nested_resources(self, context, suppressors): # Call validators for nested resources for nested_resource in self._nested_resources(): - self._validation_failures.extend(nested_resource.validate(suppressors, context)) + failures, futures = nested_resource.validate(suppressors, context, nested=True) + self._validation_futures.extend(futures) + self._validation_failures.extend(failures) - # Update validators to be executed according to current status of the model and order by priority + def _validate_self(self, context, suppressors): self._validators.clear() self._register_validators(context) for validator in self._validators: - self._validation_failures.extend(self._validator_execute(*validator, suppressors)) - - return self._validation_failures + if issubclass(validator[0], AsyncValidator): + self._validation_futures.extend( + [self._validator_execute(*validator, suppressors, self._validator_execute_async)] + ) + else: + self._validation_failures.extend( + self._validator_execute(*validator, suppressors, self._validator_execute_sync) + ) def _register_validators(self, context: ValidatorContext = None): """ - Execute validators. + Register all the validators that contribute to ensure that the resource parameters are valid. - Method to be implemented in Resources. + Method to be implemented in Resource subclasses that need to register validators by invoking the internal + _register_validator method. + :param context: + :return: """ pass def _register_validator(self, validator_class, **validator_args): - """Execute the validator.""" + """Add a validator with the specified arguments. + + The validator will be executed according to the current status of the model and ordered by priority. 
+ :param validator_class: Validator class to be executed + :param validator_args: Arguments to be passed to the validator + :return: + """ self._validators.append((validator_class, validator_args)) def __repr__(self): diff --git a/cli/src/pcluster/config/update_policy.py b/cli/src/pcluster/config/update_policy.py index 3b8cc3f3a5..f29ca294fa 100644 --- a/cli/src/pcluster/config/update_policy.py +++ b/cli/src/pcluster/config/update_policy.py @@ -420,8 +420,8 @@ def condition_checker_shared_storage_update_policy(change, patch): fail_reason=lambda change, patch: "Shrinking a queue requires the compute fleet to be stopped first", action_needed=UpdatePolicy.ACTIONS_NEEDED["pcluster_stop"], condition_checker=lambda change, patch: not patch.cluster.has_running_capacity() - or (change.new_value if change.new_value is not None else DEFAULT_MAX_COUNT) - >= (change.old_value if change.old_value is not None else DEFAULT_MAX_COUNT), + or (int(change.new_value) if change.new_value is not None else DEFAULT_MAX_COUNT) + >= (int(change.old_value) if change.old_value is not None else DEFAULT_MAX_COUNT), ) # Update supported only with all compute nodes down or with replacement policy set different from COMPUTE_FLEET_STOP diff --git a/cli/src/pcluster/constants.py b/cli/src/pcluster/constants.py index fd89f1d2e0..8043525e60 100644 --- a/cli/src/pcluster/constants.py +++ b/cli/src/pcluster/constants.py @@ -92,11 +92,16 @@ } EBS_VOLUME_SIZE_DEFAULT = 35 EBS_VOLUME_TYPE_DEFAULT = "gp3" +EBS_VOLUME_TYPE_DEFAULT_US_ISO = "gp2" DEFAULT_MAX_COUNT = 10 DEFAULT_MIN_COUNT = 0 -MAX_NUMBER_OF_QUEUES = 10 -MAX_NUMBER_OF_COMPUTE_RESOURCES = 5 +MAX_NUMBER_OF_QUEUES = 100 +SCHEDULER_PLUGIN_MAX_NUMBER_OF_QUEUES = 10 +SCHEDULER_PLUGIN_MAX_NUMBER_OF_COMPUTE_RESOURCES = 5 +MAX_NUMBER_OF_COMPUTE_RESOURCES_PER_CLUSTER = 150 # Based on API timeout limitations +MAX_COMPUTE_RESOURCES_PER_DEPLOYMENT_WAVE = 150 # Maximum compute resources that can be deployed at the same time +MAX_COMPUTE_RESOURCES_PER_QUEUE = 40 # Ensures that each queue will share the same stack as its compute resources MAX_EBS_COUNT = 5 MAX_NEW_STORAGE_COUNT = {"efs": 1, "fsx": 1, "raid": 1} @@ -120,6 +125,7 @@ CW_ALARM_PERCENT_THRESHOLD_DEFAULT = 90 CW_ALARM_EVALUATION_PERIODS_DEFAULT = 1 CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT = 1 +DETAILED_MONITORING_ENABLED_DEFAULT = False STACK_EVENTS_LOG_STREAM_NAME_FORMAT = "{}-cfn-events" @@ -191,7 +197,6 @@ "us-east-1", "us-east-2", "us-iso-east-1", - "us-iso-west-1", "us-isob-east-1", "us-gov-east-1", "us-gov-west-1", @@ -238,6 +243,7 @@ class Feature(Enum): FSX_LUSTRE = "FSx Lustre" FSX_ONTAP = "FSx ONTAP" FSX_OPENZFS = "FSx OpenZfs" + SLURM_DATABASE = "SLURM Database" UNSUPPORTED_FEATURES_MAP = { @@ -246,6 +252,7 @@ class Feature(Enum): Feature.FSX_LUSTRE: ["us-iso"], Feature.FSX_ONTAP: ["us-iso"], Feature.FSX_OPENZFS: ["us-iso"], + Feature.SLURM_DATABASE: ["us-iso"], } @@ -299,3 +306,5 @@ class Operation(Enum): Operation.GET_IMAGE_STACK_EVENTS: ["us-iso"], Operation.LIST_IMAGE_LOG_STREAMS: ["us-iso"], } + +MAX_TAGS_COUNT = 40 # Tags are limited to 50, reserve some tags for parallelcluster specified tags diff --git a/cli/src/pcluster/models/cluster.py b/cli/src/pcluster/models/cluster.py index 6422311c37..2b44608169 100644 --- a/cli/src/pcluster/models/cluster.py +++ b/cli/src/pcluster/models/cluster.py @@ -369,8 +369,9 @@ def create( LOGGER.info("Generation and upload completed successfully") # Create template if not provided by the user + assets_metadata = None if not (self.config.dev_settings and 
self.config.dev_settings.cluster_template): - self.template_body = CDKTemplateBuilder().build_cluster_template( + self.template_body, assets_metadata = CDKTemplateBuilder().build_cluster_template( cluster_config=self.config, bucket=self.bucket, stack_name=self.stack_name ) @@ -380,6 +381,7 @@ def create( LOGGER.info("Upload of cluster artifacts completed successfully") LOGGER.info("Creating stack named: %s", self.stack_name) + asset_parameters = self._generate_asset_parameters(assets_metadata) creation_result = AWSApi.instance().cfn.create_stack_from_url( stack_name=self.stack_name, template_url=self.bucket.get_cfn_template_url( @@ -387,6 +389,12 @@ def create( ), disable_rollback=disable_rollback, tags=self._get_cfn_tags(), + parameters=[ + parameter_key_value + for asset_parameter in asset_parameters + for parameter_key_value in asset_parameter + if asset_parameters + ], ) return creation_result.get("StackId"), suppressed_validation_failures @@ -399,6 +407,42 @@ def create( self.bucket.delete_s3_artifacts() raise _cluster_error_mapper(e, str(e)) + @staticmethod + def _generate_asset_parameters(assets_metadata): + """ + Create cloud formation template parameters for assets. + + CDK adds 3 parameters to the root stack after running a synthesis. + 1. ArtifactHash (256-Hash of the asset - useful when running `cdk deploy` only, not relevant otherwise) + 2. S3Bucket (S3 Bucket where asset is stored) + 3. S3VersionKey (S3 object key and version of the asset in the form "key||version") + """ + return ( + [ + ( + { + "ParameterKey": asset_metadata["hash_parameter"]["key"], + "ParameterValue": asset_metadata["hash_parameter"]["value"], + }, + { + "ParameterKey": asset_metadata["s3_bucket_parameter"]["key"], + "ParameterValue": asset_metadata["s3_bucket_parameter"]["value"], + }, + # CDK builds the TemplateURL parameter of the NestedStack in the Root Template by concatenating + # the AssetParameter S3 Bucket, Object and Version + # The S3 Object and Version are passed as a string "ObjectKey||Version" + # In this case we don't need to specify a version, hence we pass "ObjectKey||" + { + "ParameterKey": asset_metadata["s3_object_key_parameter"]["key"], + "ParameterValue": f"{asset_metadata['s3_object_key_parameter']['value']}||", + }, + ) + for asset_metadata in assets_metadata + ] + if assets_metadata + else [] + ) + def _load_config(self, cluster_config: dict) -> BaseClusterConfig: """Load the config and catch / translate any errors that occur during loading.""" try: @@ -912,8 +956,9 @@ def update( self._upload_change_set(changes) # Create template if not provided by the user + assets_metadata = None if not (self.config.dev_settings and self.config.dev_settings.cluster_template): - self.template_body = CDKTemplateBuilder().build_cluster_template( + self.template_body, assets_metadata = CDKTemplateBuilder().build_cluster_template( cluster_config=self.config, bucket=self.bucket, stack_name=self.stack_name, @@ -923,6 +968,8 @@ def update( # upload cluster artifacts and generated template self._upload_artifacts() + asset_parameters = self._generate_asset_parameters(assets_metadata) + LOGGER.info("Updating stack named: %s", self.stack_name) AWSApi.instance().cfn.update_stack_from_url( stack_name=self.stack_name, @@ -930,6 +977,12 @@ def update( template_name=PCLUSTER_S3_ARTIFACTS_DICT.get("template_name") ), tags=self._get_cfn_tags(), + parameters=[ + parameter_key_value + for asset_parameter in asset_parameters + for parameter_key_value in asset_parameter + if asset_parameters + ], ) self.__stack = 
ClusterStack(AWSApi.instance().cfn.describe_stack(self.stack_name)) diff --git a/cli/src/pcluster/models/s3_bucket.py b/cli/src/pcluster/models/s3_bucket.py index 783e538982..b3543c279b 100644 --- a/cli/src/pcluster/models/s3_bucket.py +++ b/cli/src/pcluster/models/s3_bucket.py @@ -34,12 +34,14 @@ class S3FileFormat(Enum): YAML = "yaml" JSON = "json" + MINIFIED_JSON = "min.json" TEXT = "text" class S3FileType(Enum): """Define S3 file types.""" + ASSETS = "assets" CONFIGS = "configs" TEMPLATES = "templates" CUSTOM_RESOURCES = "custom_resources" @@ -198,14 +200,20 @@ def check_bucket_is_bootstrapped(self): def upload_config(self, config, config_name, format=S3FileFormat.YAML): """Upload config file to S3 bucket.""" - return self._upload_file(file_type=S3FileType.CONFIGS, content=config, file_name=config_name, format=format) + return self.upload_file(file_type=S3FileType.CONFIGS, content=config, file_name=config_name, format=format) def upload_cfn_template(self, template_body, template_name, format=S3FileFormat.YAML): """Upload cloudformation template to S3 bucket.""" - return self._upload_file( + return self.upload_file( file_type=S3FileType.TEMPLATES, content=template_body, file_name=template_name, format=format ) + def upload_cfn_asset(self, asset_file_content, asset_name: str, format=S3FileFormat.YAML): + """Upload cloudformation assets to S3 bucket.""" + return self.upload_file( + file_type=S3FileType.ASSETS, content=asset_file_content, file_name=asset_name, format=format + ) + def upload_resources(self, resource_dir, custom_artifacts_name): """ Upload custom resources to S3 bucket. @@ -262,27 +270,13 @@ def get_resource_url(self, resource_name): # --------------------------------------- S3 private functions --------------------------------------- # - def _upload_file(self, content, file_name, file_type, format=S3FileFormat.YAML): + def upload_file(self, content, file_name, file_type, format=S3FileFormat.YAML): """Upload file to S3 bucket.""" - if format == S3FileFormat.YAML: - result = AWSApi.instance().s3.put_object( - bucket_name=self.name, - body=yaml.dump(content), - key=self.get_object_key(file_type, file_name), - ) - elif format == S3FileFormat.JSON: - result = AWSApi.instance().s3.put_object( - bucket_name=self.name, - body=json.dumps(content), - key=self.get_object_key(file_type, file_name), - ) - else: - result = AWSApi.instance().s3.put_object( - bucket_name=self.name, - body=content, - key=self.get_object_key(file_type, file_name), - ) - return result + return AWSApi.instance().s3.put_object( + bucket_name=self.name, + body=format_content(content, format), + key=self.get_object_key(file_type, file_name), + ) def _get_file(self, file_name, file_type, version_id=None, format=S3FileFormat.YAML): """Get file from S3 bucket.""" @@ -418,3 +412,22 @@ def create_s3_presigned_url(s3_uri, expiration=3600): return AWSApi.instance().s3.create_presigned_url( s3_uri_info["bucket_name"], s3_uri_info["object_key"], expiration=expiration ) + + +def format_content(content, s3_file_format: S3FileFormat): + """ + Return content formatted by the given S3 File Format. 
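To make the new MINIFIED_JSON format concrete, the compact separators are the only difference from the plain JSON branch of format_content. A standalone sketch, not part of the patch (the payload is hypothetical):

import json

content = {"cluster": {"name": "demo"}}
print(json.dumps(content))                          # {"cluster": {"name": "demo"}}
print(json.dumps(content, separators=(",", ":")))   # {"cluster":{"name":"demo"}}

The minified form drops the spaces after ',' and ':', which mainly helps keep large generated templates and assets uploaded to S3 small.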
+ + If format is not in the S3FileFormat Enum, it returns the content without any formatting + :param content: Object representing the content to be formatted + :param s3_file_format: S3FileFormat to use for the output + :return: + """ + if s3_file_format == S3FileFormat.YAML: + return yaml.dump(content) + elif s3_file_format == S3FileFormat.JSON: + return json.dumps(content) + elif s3_file_format == S3FileFormat.MINIFIED_JSON: + return json.dumps(content, separators=(",", ":")) + else: + return content diff --git a/cli/src/pcluster/resources/compute_node/user_data.sh b/cli/src/pcluster/resources/compute_node/user_data.sh index 27feb5f684..d33d0c6f74 100644 --- a/cli/src/pcluster/resources/compute_node/user_data.sh +++ b/cli/src/pcluster/resources/compute_node/user_data.sh @@ -35,6 +35,13 @@ export NO_PROXY="localhost,127.0.0.1,169.254.169.254" PROXY fi +# Configure Amazon Linux 2 instance running in US isolated region. +. /etc/os-release +if [[ "${!ID}${!VERSION_ID}" == "amzn2" && "${AWS::Region}" == us-iso* ]]; then + configuration_script="/opt/parallelcluster/scripts/patch-iso-instance.sh" + [ -f ${!configuration_script} ] && bash ${!configuration_script} +fi + --==BOUNDARY== Content-Type: text/cloud-config; charset=us-ascii MIME-Version: 1.0 @@ -64,16 +71,13 @@ write_files: content: | { "cluster": { + "cluster_name": "${ClusterName}", "stack_name": "${AWS::StackName}", "stack_arn": "${AWS::StackId}", "enable_efa": "${EnableEfa}", "raid_shared_dir": "${RAIDSharedDir}", "raid_type": "${RAIDType}", "base_os": "${BaseOS}", - "preinstall": "${PreInstallScript}", - "preinstall_args": "${PreInstallArgs}", - "postinstall": "${PostInstallScript}", - "postinstall_args": "${PostInstallArgs}", "region": "${AWS::Region}", "efs_fs_ids": "${EFSIds}", "efs_shared_dirs": "${EFSSharedDirs}", @@ -148,6 +152,10 @@ write_files: export HOME="${!HOME_BAK}" } [ -f /etc/profile.d/proxy.sh ] && . /etc/profile.d/proxy.sh + + # Configure AWS CLI using the expected overrides, if any. + [ -f /etc/profile.d/aws-cli-default-config.sh ] && . /etc/profile.d/aws-cli-default-config.sh + custom_cookbook=${CustomChefCookbook} export _region=${AWS::Region} @@ -159,7 +167,7 @@ write_files: # 60s, the call will timeout the connect attempt at 8m. Setting it to 15s, causes # each attempt to take 240s, so 2m * 3 attempts will result in a failure after 6 # minutes. - S3API_RESULT=$(AWS_RETRY_MODE=standard aws s3api get-bucket-location --cli-connect-timeout 15 --bucket ${!BASH_REMATCH[1]} 2>&1) || error_exit "${!S3API_RESULT}" + S3API_RESULT=$(AWS_RETRY_MODE=standard aws s3api get-bucket-location --cli-connect-timeout 15 --bucket ${!BASH_REMATCH[1]} --region ${AWS::Region} 2>&1) || error_exit "${!S3API_RESULT}" bucket_region=$(echo "${!S3API_RESULT}" | jq -r '.LocationConstraint') if [[ "${!bucket_region}" == null ]]; then bucket_region="us-east-1" diff --git a/cli/src/pcluster/resources/head_node/user_data.sh b/cli/src/pcluster/resources/head_node/user_data.sh index 2a3eed4d0e..af4e885b92 100644 --- a/cli/src/pcluster/resources/head_node/user_data.sh +++ b/cli/src/pcluster/resources/head_node/user_data.sh @@ -35,6 +35,13 @@ export NO_PROXY="localhost,127.0.0.1,169.254.169.254" PROXY fi +# Configure Amazon Linux 2 instance running in US isolated region. +. 
/etc/os-release +if [[ "${!ID}${!VERSION_ID}" == "amzn2" && "${AWS::Region}" == us-iso* ]]; then + configuration_script="/opt/parallelcluster/scripts/patch-iso-instance.sh" + [ -f ${!configuration_script} ] && bash ${!configuration_script} +fi + --==BOUNDARY== Content-Type: text/cloud-config; charset=us-ascii MIME-Version: 1.0 @@ -70,7 +77,7 @@ function error_exit # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/cloudformation-limits.html cutoff=$(expr 4096 - $(stat --printf="%s" /tmp/wait_condition_handle.txt)) reason=$(head --bytes=${!cutoff} /var/log/parallelcluster/bootstrap_error_msg 2>/dev/null) || reason="$1" - cfn-signal --exit-code=1 --reason="${!reason}" "${!wait_condition_handle_presigned_url}" + cfn-signal --exit-code=1 --reason="${!reason}" "${!wait_condition_handle_presigned_url}" --region ${AWS::Region} --url ${CloudFormationUrl} exit 1 } function vendor_cookbook @@ -88,13 +95,16 @@ function vendor_cookbook } [ -f /etc/profile.d/proxy.sh ] && . /etc/profile.d/proxy.sh +# Configure AWS CLI using the expected overrides, if any. +[ -f /etc/profile.d/aws-cli-default-config.sh ] && . /etc/profile.d/aws-cli-default-config.sh + # deploy config files export PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin # Load ParallelCluster environment variables [ -f /etc/profile.d/pcluster.sh ] && . /etc/profile.d/pcluster.sh cd /tmp -cfn-init -s ${AWS::StackName} -v -c deployFiles -r HeadNodeLaunchTemplate --region ${AWS::Region} +cfn-init -s ${AWS::StackName} -v -c deployFiles -r HeadNodeLaunchTemplate --region ${AWS::Region} --url ${CloudFormationUrl} wait_condition_handle_presigned_url=$(cat /tmp/wait_condition_handle.txt) custom_cookbook=${CustomChefCookbook} @@ -102,7 +112,7 @@ export _region=${AWS::Region} s3_url=${AWS::URLSuffix} if [ "${!custom_cookbook}" != "NONE" ]; then if [[ "${!custom_cookbook}" =~ ^s3://([^/]*)(.*) ]]; then - bucket_region=$(aws s3api get-bucket-location --bucket ${!BASH_REMATCH[1]} | jq -r '.LocationConstraint') + bucket_region=$(aws s3api get-bucket-location --bucket ${!BASH_REMATCH[1]} --region ${AWS::Region} | jq -r '.LocationConstraint') if [[ "${!bucket_region}" == null ]]; then bucket_region="us-east-1" fi @@ -131,7 +141,7 @@ if [ "${!custom_cookbook}" != "NONE" ]; then fi # Call CloudFormation -cfn-init -s ${AWS::StackName} -v -c default -r HeadNodeLaunchTemplate --region ${AWS::Region} || error_exit 'Failed to bootstrap the head node. Please check /var/log/cfn-init.log or /var/log/chef-client.log in the head node, or check the cfn-init.log or chef-client.log in CloudWatch logs. Please refer to https://docs.aws.amazon.com/parallelcluster/latest/ug/troubleshooting-v3.html#troubleshooting-v3-get-logs for more details on ParallelCluster logs.' -cfn-signal --exit-code=0 --reason="HeadNode setup complete" "${!wait_condition_handle_presigned_url}" +cfn-init -s ${AWS::StackName} -v -c default -r HeadNodeLaunchTemplate --region ${AWS::Region} --url ${CloudFormationUrl} || error_exit 'Failed to bootstrap the head node. Please check /var/log/cfn-init.log or /var/log/chef-client.log in the head node, or check the cfn-init.log or chef-client.log in CloudWatch logs. Please refer to https://docs.aws.amazon.com/parallelcluster/latest/ug/troubleshooting-v3.html#troubleshooting-v3-get-logs for more details on ParallelCluster logs.' 
+cfn-signal --exit-code=0 --reason="HeadNode setup complete" "${!wait_condition_handle_presigned_url}" --region ${AWS::Region} --url ${CloudFormationUrl} # End of file --==BOUNDARY== diff --git a/cli/src/pcluster/schemas/cluster_schema.py b/cli/src/pcluster/schemas/cluster_schema.py index a6b09ec567..5d150a6c17 100644 --- a/cli/src/pcluster/schemas/cluster_schema.py +++ b/cli/src/pcluster/schemas/cluster_schema.py @@ -55,9 +55,11 @@ ExistingFsxOntap, ExistingFsxOpenZfs, FlexibleInstanceType, + GpuHealthCheck, HeadNode, HeadNodeImage, HeadNodeNetworking, + HealthChecks, Iam, Image, Imds, @@ -111,6 +113,7 @@ SudoerConfiguration, Timeouts, ) +from pcluster.config.common import BaseTag from pcluster.config.update_policy import UpdatePolicy from pcluster.constants import ( DELETION_POLICIES, @@ -133,7 +136,12 @@ DeploymentSettingsSchema, ) from pcluster.schemas.common_schema import ImdsSchema as TopLevelImdsSchema -from pcluster.schemas.common_schema import TagSchema, get_field_validator, validate_no_reserved_tag +from pcluster.schemas.common_schema import ( + TagSchema, + get_field_validator, + validate_no_duplicate_tag, + validate_no_reserved_tag, +) from pcluster.utils import yaml_load from pcluster.validators.cluster_validators import EFS_MESSAGES, FSX_MESSAGES @@ -807,7 +815,7 @@ def make_resource(self, data, **kwargs): class MonitoringSchema(BaseSchema): """Represent the schema of the Monitoring section.""" - detailed_monitoring = fields.Bool(metadata={"update_policy": UpdatePolicy.UNSUPPORTED}) + detailed_monitoring = fields.Bool(metadata={"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP}) logs = fields.Nested(LogsSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED}) dashboards = fields.Nested(DashboardsSchema, metadata={"update_policy": UpdatePolicy.SUPPORTED}) @@ -1019,6 +1027,31 @@ def make_resource(self, data, **kwargs): return ClusterDevSettings(**data) +# ---------------------- Health Checks ---------------------- # + + +class GpuHealthCheckSchema(BaseSchema): + """Represent the schema of gpu health check.""" + + enabled = fields.Bool(metadata={"update_policy": UpdatePolicy.SUPPORTED}) + + @post_load + def make_resource(self, data, **kwargs): + """Generate resource.""" + return GpuHealthCheck(**data) + + +class HealthChecksSchema(BaseSchema): + """Represent the HealthChecks schema.""" + + gpu = fields.Nested(GpuHealthCheckSchema, metadata={"update_policy": UpdatePolicy.SUPPORTED}) + + @post_load + def make_resource(self, data, **kwargs): + """Generate resource.""" + return HealthChecks(**data) + + # ---------------------- Node and Cluster Schema ---------------------- # @@ -1067,23 +1100,63 @@ def make_resource(self, data, **kwargs): return QueueImage(**data) -class HeadNodeCustomActionSchema(BaseSchema): - """Represent the schema of the custom action.""" - - script = fields.Str(required=True, metadata={"update_policy": UpdatePolicy.UNSUPPORTED}) - args = fields.List(fields.Str(), metadata={"update_policy": UpdatePolicy.UNSUPPORTED}) - - @post_load - def make_resource(self, data, **kwargs): - """Generate resource.""" - return CustomAction(**data) +class OneOrManyCustomActionField(fields.Nested): + """Custom Marshmallow filed to handle backward compatible single script custom actions.""" + def __init__(self, **kwargs): + schema = self._build_dynamic_schema_class( + kwargs.get("metadata", {}).get("update_policy", UpdatePolicy.UNSUPPORTED) + ) + super().__init__(schema, **kwargs) + + @staticmethod + def _build_dynamic_schema_class(update_policy): + class_name = 
f"CustomActionScriptSchema{update_policy.name}" + if class_name not in globals(): + schema_class_type = type( + class_name, + (CustomActionScriptSchemaBase,), + { + "script": fields.Str(required=True, metadata={"update_policy": update_policy}), + "args": fields.List(fields.Str(), metadata={"update_policy": update_policy}), + }, + ) + globals()[class_name] = schema_class_type + else: + schema_class_type = globals()[class_name] + return schema_class_type + + def _deserialize(self, value, attr, data, **kwargs): + if "Script" in value and "Sequence" in value: + raise ValidationError("Both Script and Sequence fields are provided. Only one is allowed.") + + if "Script" in value: + return super()._deserialize(value, attr, data, **kwargs) + + if "Sequence" in value: + sequence = value["Sequence"] + if not isinstance(sequence, list): + raise ValidationError("Invalid input type for Sequence, expected list.") + res = [] + for item in sequence: + res.append(super()._deserialize(item, attr, data, **kwargs)) + return res + + raise ValidationError("Either Script or Sequence field must be provided.") + + def _serialize(self, nested_obj, attr, obj, **kwargs): + if isinstance(nested_obj, list): + nested_serialized = [] + for item in nested_obj: + nested_serialized.append(super()._serialize(item, attr, obj, **kwargs)) + res = {"Sequence": nested_serialized} + else: + res = super()._serialize(nested_obj, attr, obj, **kwargs) + return res -class HeadNodeUpdatableCustomActionSchema(BaseSchema): - """Represent the schema of an updatable custom action.""" - script = fields.Str(required=True, metadata={"update_policy": UpdatePolicy.SUPPORTED}) - args = fields.List(fields.Str(), metadata={"update_policy": UpdatePolicy.SUPPORTED}) +class CustomActionScriptSchemaBase(BaseSchema): + """Represent the schema of the custom action script that cannot be updated.""" @post_load def make_resource(self, data, **kwargs): @@ -1091,14 +1164,11 @@ def make_resource(self, data, **kwargs): return CustomAction(**data) -class HeadNodeCustomActionsSchema(BaseSchema): - """Represent the schema for all available custom actions.""" +class QueueCustomActionsSchema(BaseSchema): + """Represent the schema for all available custom actions in the queues.""" - on_node_start = fields.Nested(HeadNodeCustomActionSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED}) - on_node_configured = fields.Nested(HeadNodeCustomActionSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED}) - on_node_updated = fields.Nested( - HeadNodeUpdatableCustomActionSchema, metadata={"update_policy": UpdatePolicy.SUPPORTED} - ) + on_node_start = OneOrManyCustomActionField(metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}) + on_node_configured = OneOrManyCustomActionField(metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}) @post_load def make_resource(self, data, **kwargs): @@ -1106,27 +1176,12 @@ def make_resource(self, data, **kwargs): return CustomActions(**data) -class QueueCustomActionSchema(BaseSchema): - """Represent the schema of the custom action.""" - - script = fields.Str(required=True, metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}) - args = fields.List(fields.Str(), metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}) - - @post_load - def make_resource(self, data, **kwargs): - """Generate resource.""" - return CustomAction(**data) - - -class QueueCustomActionsSchema(BaseSchema): - """Represent the schema for all available custom actions.""" +class HeadNodeCustomActionsSchema(BaseSchema): + 
"""Represent the schema for all available custom actions in the head node.""" - on_node_start = fields.Nested( - QueueCustomActionSchema, metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY} - ) - on_node_configured = fields.Nested( - QueueCustomActionSchema, metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY} - ) + on_node_start = OneOrManyCustomActionField(metadata={"update_policy": UpdatePolicy.UNSUPPORTED}) + on_node_configured = OneOrManyCustomActionField(metadata={"update_policy": UpdatePolicy.UNSUPPORTED}) + on_node_updated = OneOrManyCustomActionField(metadata={"update_policy": UpdatePolicy.SUPPORTED}) @post_load def make_resource(self, data, **kwargs): @@ -1186,6 +1241,26 @@ def make_resource(self, data, **kwargs): return SlurmComputeResourceNetworking(**data) +class QueueTagSchema(BaseSchema): + """Represent the schema of Tag section.""" + + key = fields.Str( + required=True, + validate=validate.Length(max=128), + metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}, + ) + value = fields.Str( + required=True, + validate=validate.Length(max=256), + metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}, + ) + + @post_load + def make_resource(self, data, **kwargs): + """Generate resource.""" + return BaseTag(**data) + + class SlurmComputeResourceSchema(_ComputeResourceSchema): """Represent the schema of the Slurm ComputeResource.""" @@ -1209,6 +1284,11 @@ class SlurmComputeResourceSchema(_ComputeResourceSchema): networking = fields.Nested( SlurmComputeResourceNetworkingSchema, metadata={"update_policy": UpdatePolicy.MANAGED_PLACEMENT_GROUP} ) + health_checks = fields.Nested(HealthChecksSchema, metadata={"update_policy": UpdatePolicy.SUPPORTED}) + custom_slurm_settings = fields.Dict(metadata={"update_policy": UpdatePolicy.SUPPORTED}) + tags = fields.Nested( + QueueTagSchema, many=True, metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY, "update_key": "Key"} + ) @validates_schema def no_coexist_instance_type_flexibility(self, data, **kwargs): @@ -1234,6 +1314,12 @@ def no_duplicate_instance_types(self, flexible_instance_types: List[FlexibleInst ) instance_types.add(instance_type_name) + @validates("tags") + def validate_tags(self, tags): + """Validate tags.""" + validate_no_reserved_tag(tags) + validate_no_duplicate_tag(tags) + @post_load def make_resource(self, data, **kwargs): """Generate resource.""" @@ -1332,12 +1418,23 @@ class SlurmQueueSchema(_CommonQueueSchema): networking = fields.Nested( SlurmQueueNetworkingSchema, required=True, metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY} ) + health_checks = fields.Nested(HealthChecksSchema, metadata={"update_policy": UpdatePolicy.SUPPORTED}) + custom_slurm_settings = fields.Dict(metadata={"update_policy": UpdatePolicy.SUPPORTED}) + tags = fields.Nested( + QueueTagSchema, many=True, metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY, "update_key": "Key"} + ) @post_load def make_resource(self, data, **kwargs): """Generate resource.""" return SlurmQueue(**data) + @validates("tags") + def validate_tags(self, tags): + """Validate tags.""" + validate_no_reserved_tag(tags) + validate_no_duplicate_tag(tags) + class AwsBatchQueueSchema(BaseQueueSchema): """Represent the schema of a Batch Queue.""" @@ -1418,6 +1515,8 @@ class SlurmSettingsSchema(BaseSchema): ) enable_memory_based_scheduling = fields.Bool(metadata={"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP}) database = fields.Nested(DatabaseSchema, metadata={"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP}) + 
custom_slurm_settings = fields.List(fields.Dict, metadata={"update_policy": UpdatePolicy.SUPPORTED})
+    custom_slurm_settings_include_file = fields.Str(metadata={"update_policy": UpdatePolicy.SUPPORTED})
 
     @post_load
     def make_resource(self, data, **kwargs):
@@ -1958,7 +2057,7 @@ class DirectoryServiceSchema(BaseSchema):
     domain_addr = fields.Str(required=True, metadata={"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP})
     password_secret_arn = fields.Str(
         required=True,
-        validate=validate.Regexp(r"^arn:.*:secret"),
+        validate=validate.Regexp(r"^arn:.*:(secretsmanager:.*:.*:secret:|ssm:.*:.*:parameter\/).*$"),
         metadata={"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP},
     )
     domain_read_only_user = fields.Str(required=True, metadata={"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP})
@@ -1992,7 +2091,7 @@ class ClusterSchema(BaseSchema):
         },
     )
-    monitoring = fields.Nested(MonitoringSchema, metadata={"update_policy": UpdatePolicy.SUPPORTED})
+    monitoring = fields.Nested(MonitoringSchema, metadata={"update_policy": UpdatePolicy.IGNORED})
     additional_packages = fields.Nested(AdditionalPackagesSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
     tags = fields.Nested(
         TagSchema, many=True, metadata={"update_policy": UpdatePolicy.UNSUPPORTED, "update_key": "Key"}
     )
@@ -2016,6 +2115,7 @@ def __init__(self, cluster_name: str):
     def validate_tags(self, tags):
         """Validate tags."""
         validate_no_reserved_tag(tags)
+        validate_no_duplicate_tag(tags)
 
     @validates_schema
     def no_settings_for_batch(self, data, **kwargs):
diff --git a/cli/src/pcluster/schemas/common_schema.py b/cli/src/pcluster/schemas/common_schema.py
index 7b0041388e..b087d8ca7e 100644
--- a/cli/src/pcluster/schemas/common_schema.py
+++ b/cli/src/pcluster/schemas/common_schema.py
@@ -65,6 +65,18 @@ def validate_no_reserved_tag(tags):
                 raise ValidationError(message=f"The tag key prefix '{PCLUSTER_PREFIX}' is reserved and cannot be used.")
 
 
+def validate_no_duplicate_tag(tags):
+    """Validate that there are no duplicate tag keys in the same tag section."""
+    all_tags = set()
+    for tag in tags:
+        tag_key = tag.key
+        if tag_key in all_tags:
+            raise ValidationError(
+                f"Duplicate tag key ({tag_key}) detected. Tag keys should be unique within the Tags section."
+            )
+        all_tags.add(tag_key)
+
+
 def get_field_validator(field_name):
     allowed_values = ALLOWED_VALUES[field_name]
     return validate.OneOf(allowed_values) if isinstance(allowed_values, list) else validate.Regexp(allowed_values)
diff --git a/cli/src/pcluster/templates/awsbatch_builder.py b/cli/src/pcluster/templates/awsbatch_builder.py
index 1409aad99c..bc9d7cb73a 100644
--- a/cli/src/pcluster/templates/awsbatch_builder.py
+++ b/cli/src/pcluster/templates/awsbatch_builder.py
@@ -567,9 +567,9 @@ def _add_code_build_docker_image_builder_project(self):
                         value=self._docker_build_wait_condition_handle.ref,
                     ),
                 ],
-                image="aws/codebuild/amazonlinux2-aarch64-standard:1.0"
+                image="aws/codebuild/amazonlinux2-aarch64-standard:2.0"
                 if self._condition_use_arm_code_build_image()
-                else "aws/codebuild/amazonlinux2-x86_64-standard:3.0",
+                else "aws/codebuild/amazonlinux2-x86_64-standard:4.0",
                 type="ARM_CONTAINER" if self._condition_use_arm_code_build_image() else "LINUX_CONTAINER",
                 privileged_mode=True,
             ),
diff --git a/cli/src/pcluster/templates/cdk_artifacts_manager.py b/cli/src/pcluster/templates/cdk_artifacts_manager.py
new file mode 100644
index 0000000000..24b5e72426
--- /dev/null
+++ b/cli/src/pcluster/templates/cdk_artifacts_manager.py
@@ -0,0 +1,162 @@
+# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+# with the License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This module contains all the classes representing the Resources objects.
+# These objects are obtained from the configuration file through a conversion based on the Schema classes.
+#
+import os
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import List
+
+from aws_cdk.cx_api import CloudAssembly, CloudFormationStackArtifact
+
+from pcluster.models.s3_bucket import S3Bucket, S3FileFormat, S3FileType
+from pcluster.utils import LOGGER, load_json_dict
+
+
+@dataclass
+class ClusterAssetFile:
+    """Class for asset files generated from a CDK Synthesis."""
+
+    id: str
+    path: str
+    artifact_hash_parameter: str
+    s3_bucket_parameter: str
+    s3_key_parameter: str
+
+
+class ClusterCloudAssembly(ABC):
+    """Wrapper for cloud assembly of a cluster after running `app.synth()`."""
+
+    def __init__(self, cloud_assembly):
+        self.cloud_assembly = cloud_assembly
+
+    @abstractmethod
+    def _initialize_cloud_artifact(self, cloud_assembly: CloudAssembly) -> CloudFormationStackArtifact:
+        pass
+
+    @abstractmethod
+    def get_assets(self) -> List[ClusterAssetFile]:
+        """List of asset files info."""
+        pass
+
+    @abstractmethod
+    def get_cloud_assembly_directory(self) -> str:
+        """Directory of the cloud assembly files."""
+        pass
+
+    @abstractmethod
+    def get_template_body(self):
+        """Return the template content."""
+        pass
+
+
+class CDKV1ClusterCloudAssembly(ClusterCloudAssembly):
+    """Implementation with CDK V1 Cloud Assembly properties."""
+
+    def __init__(self, cloud_assembly):
+        super().__init__(cloud_assembly)
+        self.cloud_artifact = self._initialize_cloud_artifact(cloud_assembly)
+
+    def get_template_body(self):
+        """Return the template content."""
+        return self.cloud_artifact.template
+
+    def _initialize_cloud_artifact(self, cloud_assembly: CloudAssembly) -> CloudFormationStackArtifact:
+        return next(
+            artifact for artifact in cloud_assembly.artifacts if isinstance(artifact, self._get_artifacts_class())
+        )
+
+    def get_assets(self) -> List[ClusterAssetFile]:
+        """List of asset files info."""
+        assets = self.cloud_artifact.assets
+        cluster_assets_files = [
+            ClusterAssetFile(
+                id=asset.id,
+                path=asset.path,
+                s3_bucket_parameter=asset.s3_bucket_parameter,
+                s3_key_parameter=asset.s3_key_parameter,
+                artifact_hash_parameter=asset.artifact_hash_parameter,
+            )
+            for asset in assets
+        ]
+        return cluster_assets_files
+
+    def get_cloud_assembly_directory(self) -> str:
+        """Directory of the cloud assembly files."""
+        return self.cloud_assembly.directory
+
+    @staticmethod
+    def _get_artifacts_class():
+        return CloudFormationStackArtifact
+
+
+class CDKArtifactsManager:
+    """Manage the discovery and upload of CDK Assets to the cluster S3 bucket."""
+
+    def __init__(self, cloud_assembly: CloudAssembly):
+        self.cluster_cdk_assembly = CDKV1ClusterCloudAssembly(cloud_assembly)
+
+    def get_template_body(self):
+        """Return the template content."""
+        return self.cluster_cdk_assembly.get_template_body()
+
+    def upload_assets(self, bucket: S3Bucket):
+        """
+        Upload the assets in
the cloud assembly directory to the cluster artifacts S3 Bucket. + + Returns a mapping of the Asset Logical ID and associated parameters to be passed to the root template. + Output: + ``` + [ + { + 'hash_parameter': { + 'key': 'AssetParametersArtifactHash', 'value': '' + }, + 's3_bucket_parameter': { + 'key': 'AssetParametersS3Bucket', 'value': '' + }, + 's3_object_key_parameter': { + 'key': 'AssetParametersS3VersionKey', 'value': '' + } + }, + ... + ] + ``` + """ + cdk_assets = self.cluster_cdk_assembly.get_assets() + assets_metadata = [] + + for cdk_asset in cdk_assets: + asset_file_path = os.path.join(self.cluster_cdk_assembly.get_cloud_assembly_directory(), cdk_asset.path) + asset_file_content = load_json_dict(asset_file_path) + asset_id = cdk_asset.id + assets_metadata.append( + { + # `artifactHashParameter` only needed when using `cdk deploy` to check the integrity of files + # uploaded to S3 + "hash_parameter": {"key": cdk_asset.artifact_hash_parameter, "value": ""}, + "s3_bucket_parameter": {"key": cdk_asset.s3_bucket_parameter, "value": bucket.name}, + "s3_object_key_parameter": { + "key": cdk_asset.s3_key_parameter, + "value": bucket.get_object_key(S3FileType.ASSETS, asset_id), + }, + "content": asset_file_content, + } + ) + LOGGER.info(f"Uploading asset {asset_id} to S3") + + bucket.upload_cfn_asset( + asset_file_content=asset_file_content, asset_name=asset_id, format=S3FileFormat.MINIFIED_JSON + ) + + return assets_metadata diff --git a/cli/src/pcluster/templates/cdk_builder.py b/cli/src/pcluster/templates/cdk_builder.py index 614080a161..8d26b63a7d 100644 --- a/cli/src/pcluster/templates/cdk_builder.py +++ b/cli/src/pcluster/templates/cdk_builder.py @@ -11,7 +11,6 @@ # # This module contains all the classes required to convert a Cluster into a CFN template by using CDK. 
# - import logging import os import tempfile @@ -19,6 +18,7 @@ from pcluster.config.cluster_config import BaseClusterConfig from pcluster.config.imagebuilder_config import ImageBuilderConfig from pcluster.models.s3_bucket import S3Bucket +from pcluster.templates.cdk_artifacts_manager import CDKArtifactsManager from pcluster.utils import load_yaml_dict LOGGER = logging.getLogger(__name__) @@ -39,15 +39,19 @@ def build_cluster_template( LOGGER.info("CDK import completed successfully") LOGGER.info("Starting CDK template generation...") - with tempfile.TemporaryDirectory() as tempdir: + with tempfile.TemporaryDirectory() as cloud_assembly_dir: output_file = str(stack_name) - app = App(outdir=str(tempdir)) + app = App(outdir=str(cloud_assembly_dir)) ClusterCdkStack(app, output_file, stack_name, cluster_config, bucket, log_group_name) - app.synth() - generated_template = load_yaml_dict(os.path.join(tempdir, f"{output_file}.template.json")) - LOGGER.info("CDK template generation completed successfully") - return generated_template + cloud_assembly = app.synth() + LOGGER.info("CDK template generation completed successfully") + + cdk_artifacts_manager = CDKArtifactsManager(cloud_assembly) + assets_metadata = cdk_artifacts_manager.upload_assets(bucket=bucket) + generated_template = cdk_artifacts_manager.get_template_body() + + return generated_template, assets_metadata @staticmethod def build_imagebuilder_template(image_config: ImageBuilderConfig, image_id: str, bucket: S3Bucket): diff --git a/cli/src/pcluster/templates/cdk_builder_utils.py b/cli/src/pcluster/templates/cdk_builder_utils.py index 1788be9c36..41191f5789 100644 --- a/cli/src/pcluster/templates/cdk_builder_utils.py +++ b/cli/src/pcluster/templates/cdk_builder_utils.py @@ -19,7 +19,7 @@ from aws_cdk import aws_lambda as awslambda from aws_cdk import aws_logs as logs from aws_cdk.aws_iam import ManagedPolicy, PermissionsBoundary -from aws_cdk.core import CfnDeletionPolicy, CfnTag, Construct, Fn, Stack +from aws_cdk.core import Arn, ArnFormat, CfnDeletionPolicy, CfnTag, Construct, Fn, Stack from pcluster.config.cluster_config import ( BaseClusterConfig, @@ -28,6 +28,7 @@ HeadNode, SharedStorageType, SlurmClusterConfig, + SlurmComputeResource, SlurmQueue, ) from pcluster.constants import ( @@ -145,10 +146,10 @@ def get_cluster_tags(stack_name: str, raw_dict: bool = False): return tags if raw_dict else dict_to_cfn_tags(tags) -def get_custom_tags(config: BaseClusterConfig, raw_dict: bool = False): +def get_custom_tags(config: Union[BaseClusterConfig, SlurmQueue, SlurmComputeResource], raw_dict: bool = False): """Return a list of tags set by the user.""" - cluster_tags = config.get_cluster_tags() - tags = {tag.key: tag.value for tag in cluster_tags} if cluster_tags else {} + custom_tags = config.get_tags() + tags = {tag.key: tag.value for tag in custom_tags} if custom_tags else {} return tags if raw_dict else dict_to_cfn_tags(tags) @@ -756,10 +757,19 @@ def _build_policy(self) -> List[iam.PolicyStatement]: ) if self._config.directory_service: + password_secret_arn = Arn.split( + self._config.directory_service.password_secret_arn, ArnFormat.COLON_RESOURCE_NAME + ) policy.append( iam.PolicyStatement( sid="AllowGettingDirectorySecretValue", - actions=["secretsmanager:GetSecretValue"], + actions=[ + "secretsmanager:GetSecretValue" + if password_secret_arn.service == "secretsmanager" + else "ssm:GetParameter" + if password_secret_arn.service == "ssm" + else None + ], effect=iam.Effect.ALLOW, 
resources=[self._config.directory_service.password_secret_arn], ) diff --git a/cli/src/pcluster/templates/cluster_stack.py b/cli/src/pcluster/templates/cluster_stack.py index 0518b59db1..8d60725a4f 100644 --- a/cli/src/pcluster/templates/cluster_stack.py +++ b/cli/src/pcluster/templates/cluster_stack.py @@ -17,7 +17,7 @@ import json from collections import defaultdict, namedtuple from datetime import datetime -from typing import Dict, List, Union +from typing import Union from aws_cdk import aws_cloudformation as cfn from aws_cdk import aws_cloudwatch as cloudwatch @@ -26,14 +26,11 @@ from aws_cdk import aws_efs as efs from aws_cdk import aws_fsx as fsx from aws_cdk import aws_iam as iam -from aws_cdk import aws_lambda as awslambda from aws_cdk import aws_logs as logs from aws_cdk.core import ( - CfnCustomResource, CfnDeletionPolicy, CfnOutput, CfnParameter, - CfnResource, CfnStack, CfnTag, Construct, @@ -50,14 +47,11 @@ BaseSharedFsx, ExistingFsxOntap, ExistingFsxOpenZfs, - SchedulerPluginQueue, SharedEbs, SharedEfs, SharedFsxLustre, SharedStorageType, SlurmClusterConfig, - SlurmComputeResource, - SlurmQueue, ) from pcluster.constants import ( CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT, @@ -70,17 +64,13 @@ LUSTRE, NODE_BOOTSTRAP_TIMEOUT, OS_MAPPING, - PCLUSTER_CLUSTER_NAME_TAG, - PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, PCLUSTER_DYNAMODB_PREFIX, - PCLUSTER_QUEUE_NAME_TAG, PCLUSTER_S3_ARTIFACTS_DICT, ) from pcluster.models.s3_bucket import S3Bucket from pcluster.templates.awsbatch_builder import AwsBatchConstruct from pcluster.templates.cdk_builder_utils import ( CdkLaunchTemplateBuilder, - ComputeNodeIamResources, HeadNodeIamResources, PclusterLambdaConstruct, add_lambda_cfn_role, @@ -97,16 +87,15 @@ get_directory_service_dna_json_for_head_node, get_lambda_log_group_prefix, get_log_group_deletion_policy, - get_queue_security_groups_full, get_shared_storage_ids_by_type, get_slurm_specific_dna_json_for_head_node, get_user_data_content, - scheduler_is_slurm, to_comma_separated_string, ) +from pcluster.templates.compute_fleet_stack import ComputeFleetConstruct from pcluster.templates.cw_dashboard_builder import CWDashboardConstruct from pcluster.templates.slurm_builder import SlurmConstruct -from pcluster.utils import get_attr, get_http_tokens_setting, join_shell_args +from pcluster.utils import get_attr, get_http_tokens_setting, get_service_endpoint StorageInfo = namedtuple("StorageInfo", ["id", "config"]) @@ -304,6 +293,7 @@ def _add_resources(self): head_node_instance=self.head_node_instance, shared_storage_infos=self.shared_storage_infos, cw_log_group_name=self.log_group.log_group_name if self.config.is_cw_logging_enabled else None, + cw_log_group=self.log_group, ) self._add_alarms() @@ -943,6 +933,8 @@ def _add_head_node(self): ) ) + cloudformation_url = get_service_endpoint("cloudformation", self.config.region) + # Head node Launch Template head_node_launch_template = ec2.CfnLaunchTemplate( self.stack, @@ -970,6 +962,7 @@ def _add_head_node(self): "DisableMultiThreadingManually": "true" if head_node.disable_simultaneous_multithreading_manually else "false", + "CloudFormationUrl": cloudformation_url, }, **get_common_user_data_env(head_node, self.config), }, @@ -987,11 +980,6 @@ def _add_head_node(self): # Metadata head_node_launch_template.add_metadata("Comment", "AWS ParallelCluster Head Node") # CloudFormation::Init metadata - pre_install_action, post_install_action, post_update_action = (None, None, None) - if head_node.custom_actions: - pre_install_action = 
head_node.custom_actions.on_node_start - post_install_action = head_node.custom_actions.on_node_configured - post_update_action = head_node.custom_actions.on_node_updated dna_json = json.dumps( { @@ -1009,18 +997,6 @@ def _add_head_node(self): self.shared_storage_attributes[SharedStorageType.RAID]["Type"] ), "base_os": self.config.image.os, - "preinstall": pre_install_action.script if pre_install_action else "NONE", - "preinstall_args": join_shell_args(pre_install_action.args) - if pre_install_action and pre_install_action.args - else "NONE", - "postinstall": post_install_action.script if post_install_action else "NONE", - "postinstall_args": join_shell_args(post_install_action.args) - if post_install_action and post_install_action.args - else "NONE", - "postupdate": post_update_action.script if post_update_action else "NONE", - "postupdate_args": join_shell_args(post_update_action.args) - if post_update_action and post_update_action.args - else "NONE", "region": self.stack.region, "efs_fs_ids": get_shared_storage_ids_by_type(self.shared_storage_infos, SharedStorageType.EFS), "efs_shared_dirs": to_comma_separated_string(self.shared_storage_mount_dirs[SharedStorageType.EFS]), @@ -1162,10 +1138,16 @@ def _add_head_node(self): "action=PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin; " ". /etc/profile.d/pcluster.sh; " "cfn-init -v --stack ${StackName} " - "--resource HeadNodeLaunchTemplate --configsets update --region ${Region}\n" + "--resource HeadNodeLaunchTemplate --configsets update " + "--region ${Region} " + "--url ${CloudFormationUrl}\n" "runas=root\n" ), - {"StackName": self._stack_name, "Region": self.stack.region}, + { + "StackName": self._stack_name, + "Region": self.stack.region, + "CloudFormationUrl": cloudformation_url, + }, ), "mode": "000400", "owner": "root", @@ -1173,8 +1155,16 @@ def _add_head_node(self): }, "/etc/cfn/cfn-hup.conf": { "content": Fn.sub( - "[main]\nstack=${StackId}\nregion=${Region}\ninterval=2", - {"StackId": self.stack.stack_id, "Region": self.stack.region}, + "[main]\n" + "stack=${StackId}\n" + "region=${Region}\n" + "url=${CloudFormationUrl}\n" + "interval=2\n", + { + "StackId": self.stack.stack_id, + "Region": self.stack.region, + "CloudFormationUrl": cloudformation_url, + }, ), "mode": "000400", "owner": "root", @@ -1243,9 +1233,11 @@ def _add_head_node(self): " --chef-zero-port 8889 --json-attributes /etc/chef/dna.json" " --override-runlist aws-parallelcluster::update &&" " /opt/parallelcluster/scripts/fetch_and_run -postupdate &&" - " cfn-signal --exit-code=0 --reason='Update complete'" + f" cfn-signal --exit-code=0 --reason='Update complete'" + f" --region {self.stack.region} --url {cloudformation_url}" f" '{self.wait_condition_handle.ref}' ||" - " cfn-signal --exit-code=1 --reason='Update failed'" + f" cfn-signal --exit-code=1 --reason='Update failed'" + f" --region {self.stack.region} --url {cloudformation_url}" f" '{self.wait_condition_handle.ref}'" ), "cwd": "/etc/chef", @@ -1288,9 +1280,9 @@ def _get_launch_templates_config(self): return None lt_config = {"Queues": {}} - for queue, compute_resouces in self.compute_fleet_resources.compute_launch_templates.items(): + for queue, compute_resources in self.compute_fleet_resources.launch_templates.items(): lt_config["Queues"][queue] = {"ComputeResources": {}} - for compute_resource, launch_template in compute_resouces.items(): + for compute_resource, launch_template in compute_resources.items(): lt_config["Queues"][queue]["ComputeResources"][compute_resource] = { "LaunchTemplate": {"Id": 
launch_template.ref, "Version": launch_template.attr_latest_version_number} } @@ -1382,355 +1374,3 @@ def _add_outputs(self): description="Private DNS name of the head node", value=self.head_node_instance.attr_private_dns_name, ) - - -class ComputeFleetConstruct(Construct): - """Construct defining compute fleet specific resources.""" - - def __init__( - self, - scope: Construct, - id: str, - cluster_config: SlurmClusterConfig, - log_group: logs.CfnLogGroup, - cleanup_lambda: awslambda.CfnFunction, - cleanup_lambda_role: iam.CfnRole, - compute_security_group: ec2.CfnSecurityGroup, - shared_storage_infos: Dict, - shared_storage_mount_dirs: Dict, - shared_storage_attributes: Dict, - cluster_hosted_zone, - dynamodb_table, - head_eni, - slurm_construct: SlurmConstruct, - ): - super().__init__(scope, id) - self._cleanup_lambda = cleanup_lambda - self._cleanup_lambda_role = cleanup_lambda_role - self._compute_security_group = compute_security_group - self._config = cluster_config - self._shared_storage_infos = shared_storage_infos - self._shared_storage_mount_dirs = shared_storage_mount_dirs - self._shared_storage_attributes = shared_storage_attributes - self._log_group = log_group - self._cluster_hosted_zone = cluster_hosted_zone - self._dynamodb_table = dynamodb_table - self._head_eni = head_eni - self._launch_template_builder = CdkLaunchTemplateBuilder() - self._slurm_construct = slurm_construct - self._add_resources() - - # -- Utility methods --------------------------------------------------------------------------------------------- # - - @property - def stack_name(self): - """Name of the CFN stack.""" - return Stack.of(self).stack_name - - # -- Resources --------------------------------------------------------------------------------------------------- # - - def _add_compute_iam_resources(self): - iam_resources = {} - for queue in self._config.scheduling.queues: - iam_resources[queue.name] = ComputeNodeIamResources( - self, - f"ComputeNodeIamResources{queue.name}", - self._config, - queue, - self._shared_storage_infos, - queue.name, - ) - self._compute_instance_profiles = {k: v.instance_profile for k, v in iam_resources.items()} - self._managed_compute_instance_roles = {k: v.instance_role for k, v in iam_resources.items()} - if scheduler_is_slurm(self._config): - self._slurm_construct.register_policies_with_role( - scope=Stack.of(self), - managed_compute_instance_roles=self._managed_compute_instance_roles, - ) - - @property - def managed_compute_instance_roles(self) -> Dict[str, iam.Role]: - """Mapping of each queue and the IAM role associated with its compute resources.""" - return self._managed_compute_instance_roles - - def _add_resources(self): - self._add_compute_iam_resources() - managed_placement_groups = self._add_placement_groups() - self.compute_launch_templates = self._add_launch_templates( - managed_placement_groups, self._compute_instance_profiles - ) - custom_resource_deps = list(managed_placement_groups.values()) - if self._compute_security_group: - custom_resource_deps.append(self._compute_security_group) - self._add_cleanup_custom_resource(dependencies=custom_resource_deps) - - def _add_cleanup_custom_resource(self, dependencies: List[CfnResource]): - terminate_compute_fleet_custom_resource = CfnCustomResource( - self, - "TerminateComputeFleetCustomResource", - service_token=self._cleanup_lambda.attr_arn, - ) - terminate_compute_fleet_custom_resource.add_property_override("StackName", self.stack_name) - 
terminate_compute_fleet_custom_resource.add_property_override("Action", "TERMINATE_EC2_INSTANCES") - for dep in dependencies: - terminate_compute_fleet_custom_resource.add_depends_on(dep) - - if self._cleanup_lambda_role: - self._add_policies_to_cleanup_resources_lambda_role() - - def _add_policies_to_cleanup_resources_lambda_role(self): - self._cleanup_lambda_role.policies[0].policy_document.add_statements( - iam.PolicyStatement( - actions=["ec2:DescribeInstances"], - resources=["*"], - effect=iam.Effect.ALLOW, - sid="DescribeInstances", - ), - iam.PolicyStatement( - actions=["ec2:TerminateInstances"], - resources=["*"], - effect=iam.Effect.ALLOW, - conditions={"StringEquals": {f"ec2:ResourceTag/{PCLUSTER_CLUSTER_NAME_TAG}": self.stack_name}}, - sid="FleetTerminatePolicy", - ), - ) - - def _add_placement_groups(self) -> Dict[str, ec2.CfnPlacementGroup]: - managed_placement_groups = {} - for queue in self._config.scheduling.queues: - for key in queue.get_managed_placement_group_keys(): - managed_placement_groups[key] = ec2.CfnPlacementGroup( - self, - f"PlacementGroup{create_hash_suffix(key)}", - strategy="cluster", - ) - return managed_placement_groups - - @staticmethod - def _get_placement_group_for_compute_resource(queue, managed_placement_groups, compute_resource) -> str: - placement_group_settings = queue.get_placement_group_settings_for_compute_resource(compute_resource) - placement_group_key = placement_group_settings.get("key") - managed = placement_group_settings.get("is_managed") - return managed_placement_groups[placement_group_key].ref if managed else placement_group_key - - def _add_launch_templates(self, managed_placement_groups, instance_profiles): - compute_launch_templates = {} - for queue in self._config.scheduling.queues: - compute_launch_templates[queue.name] = {} - queue_lt_security_groups = get_queue_security_groups_full(self._compute_security_group, queue) - queue_pre_install_action, queue_post_install_action = (None, None) - if queue.custom_actions: - queue_pre_install_action = queue.custom_actions.on_node_start - queue_post_install_action = queue.custom_actions.on_node_configured - - for resource in queue.compute_resources: - compute_launch_templates[queue.name][resource.name] = self._add_compute_resource_launch_template( - queue, - resource, - queue_pre_install_action, - queue_post_install_action, - queue_lt_security_groups, - self._get_placement_group_for_compute_resource(queue, managed_placement_groups, resource), - instance_profiles, - ) - return compute_launch_templates - - def _add_compute_resource_launch_template( - self, - queue, - compute_resource, - queue_pre_install_action, - queue_post_install_action, - queue_lt_security_groups, - placement_group, - instance_profiles, - ): - # LT network interfaces - compute_lt_nw_interfaces = [ - ec2.CfnLaunchTemplate.NetworkInterfaceProperty( - device_index=0, - associate_public_ip_address=queue.networking.assign_public_ip - if compute_resource.max_network_interface_count == 1 - else None, # parameter not supported for instance types with multiple network interfaces - interface_type="efa" if compute_resource.efa and compute_resource.efa.enabled else None, - groups=queue_lt_security_groups, - subnet_id=queue.networking.subnet_ids[0] - if isinstance(compute_resource, SlurmComputeResource) - else None, - ) - ] - - for network_interface_index in range(1, compute_resource.max_network_interface_count): - compute_lt_nw_interfaces.append( - ec2.CfnLaunchTemplate.NetworkInterfaceProperty( - device_index=0, - 
network_card_index=network_interface_index, - interface_type="efa" if compute_resource.efa and compute_resource.efa.enabled else None, - groups=queue_lt_security_groups, - subnet_id=queue.networking.subnet_ids[0] - if isinstance(compute_resource, SlurmComputeResource) - else None, - ) - ) - - conditional_template_properties = {} - if compute_resource.is_ebs_optimized: - conditional_template_properties.update({"ebs_optimized": True}) - if isinstance(compute_resource, SlurmComputeResource): - conditional_template_properties.update({"instance_type": compute_resource.instance_type}) - - return ec2.CfnLaunchTemplate( - self, - f"LaunchTemplate{create_hash_suffix(queue.name + compute_resource.name)}", - launch_template_name=f"{self.stack_name}-{queue.name}-{compute_resource.name}", - launch_template_data=ec2.CfnLaunchTemplate.LaunchTemplateDataProperty( - block_device_mappings=self._launch_template_builder.get_block_device_mappings( - queue.compute_settings.local_storage.root_volume, self._config.image.os - ), - # key_name=, - network_interfaces=compute_lt_nw_interfaces, - placement=ec2.CfnLaunchTemplate.PlacementProperty(group_name=placement_group), - image_id=self._config.image_dict[queue.name], - iam_instance_profile=ec2.CfnLaunchTemplate.IamInstanceProfileProperty( - name=instance_profiles[queue.name] - ), - instance_market_options=self._launch_template_builder.get_instance_market_options( - queue, compute_resource - ), - instance_initiated_shutdown_behavior="terminate", - capacity_reservation_specification=self._launch_template_builder.get_capacity_reservation( - queue, - compute_resource, - ), - metadata_options=ec2.CfnLaunchTemplate.MetadataOptionsProperty( - http_tokens=get_http_tokens_setting(self._config.imds.imds_support) - ), - user_data=Fn.base64( - Fn.sub( - get_user_data_content("../resources/compute_node/user_data.sh"), - { - **{ - "EnableEfa": "efa" if compute_resource.efa and compute_resource.efa.enabled else "NONE", - "RAIDSharedDir": to_comma_separated_string( - self._shared_storage_mount_dirs[SharedStorageType.RAID] - ), - "RAIDType": to_comma_separated_string( - self._shared_storage_attributes[SharedStorageType.RAID]["Type"] - ), - "DisableMultiThreadingManually": "true" - if compute_resource.disable_simultaneous_multithreading_manually - else "false", - "BaseOS": self._config.image.os, - "PreInstallScript": queue_pre_install_action.script - if queue_pre_install_action - else "NONE", - "PreInstallArgs": join_shell_args(queue_pre_install_action.args) - if queue_pre_install_action and queue_pre_install_action.args - else "NONE", - "PostInstallScript": queue_post_install_action.script - if queue_post_install_action - else "NONE", - "PostInstallArgs": join_shell_args(queue_post_install_action.args) - if queue_post_install_action and queue_post_install_action.args - else "NONE", - "EFSIds": get_shared_storage_ids_by_type( - self._shared_storage_infos, SharedStorageType.EFS - ), - "EFSSharedDirs": to_comma_separated_string( - self._shared_storage_mount_dirs[SharedStorageType.EFS] - ), - "EFSEncryptionInTransits": to_comma_separated_string( - self._shared_storage_attributes[SharedStorageType.EFS]["EncryptionInTransits"], - use_lower_case=True, - ), - "EFSIamAuthorizations": to_comma_separated_string( - self._shared_storage_attributes[SharedStorageType.EFS]["IamAuthorizations"], - use_lower_case=True, - ), - "FSXIds": get_shared_storage_ids_by_type( - self._shared_storage_infos, SharedStorageType.FSX - ), - "FSXMountNames": to_comma_separated_string( - 
self._shared_storage_attributes[SharedStorageType.FSX]["MountNames"] - ), - "FSXDNSNames": to_comma_separated_string( - self._shared_storage_attributes[SharedStorageType.FSX]["DNSNames"] - ), - "FSXVolumeJunctionPaths": to_comma_separated_string( - self._shared_storage_attributes[SharedStorageType.FSX]["VolumeJunctionPaths"] - ), - "FSXFileSystemTypes": to_comma_separated_string( - self._shared_storage_attributes[SharedStorageType.FSX]["FileSystemTypes"] - ), - "FSXSharedDirs": to_comma_separated_string( - self._shared_storage_mount_dirs[SharedStorageType.FSX] - ), - "Scheduler": self._config.scheduling.scheduler, - "EphemeralDir": queue.compute_settings.local_storage.ephemeral_volume.mount_dir - if isinstance(queue, (SlurmQueue, SchedulerPluginQueue)) - and queue.compute_settings.local_storage.ephemeral_volume - else DEFAULT_EPHEMERAL_DIR, - "EbsSharedDirs": to_comma_separated_string( - self._shared_storage_mount_dirs[SharedStorageType.EBS] - ), - "ClusterDNSDomain": str(self._cluster_hosted_zone.name) - if self._cluster_hosted_zone - else "", - "ClusterHostedZone": str(self._cluster_hosted_zone.ref) - if self._cluster_hosted_zone - else "", - "OSUser": OS_MAPPING[self._config.image.os]["user"], - "SlurmDynamoDBTable": self._dynamodb_table.ref if self._dynamodb_table else "NONE", - "LogGroupName": self._log_group.log_group_name - if self._config.monitoring.logs.cloud_watch.enabled - else "NONE", - "IntelHPCPlatform": "true" if self._config.is_intel_hpc_platform_enabled else "false", - "CWLoggingEnabled": "true" if self._config.is_cw_logging_enabled else "false", - "LogRotationEnabled": "true" if self._config.is_log_rotation_enabled else "false", - "QueueName": queue.name, - "ComputeResourceName": compute_resource.name, - "EnableEfaGdr": "compute" - if compute_resource.efa and compute_resource.efa.gdr_support - else "NONE", - "CustomNodePackage": self._config.custom_node_package or "", - "CustomAwsBatchCliPackage": self._config.custom_aws_batch_cli_package or "", - "ExtraJson": self._config.extra_chef_attributes, - "UsePrivateHostname": str( - get_attr(self._config, "scheduling.settings.dns.use_ec2_hostnames", default=False) - ).lower(), - "HeadNodePrivateIp": self._head_eni.attr_primary_private_ip_address, - "DirectoryServiceEnabled": str(self._config.directory_service is not None).lower(), - "Timeout": str( - get_attr( - self._config, - "dev_settings.timeouts.compute_node_bootstrap_timeout", - NODE_BOOTSTRAP_TIMEOUT, - ) - ), - }, - **get_common_user_data_env(queue, self._config), - }, - ) - ), - monitoring=ec2.CfnLaunchTemplate.MonitoringProperty(enabled=False), - tag_specifications=[ - ec2.CfnLaunchTemplate.TagSpecificationProperty( - resource_type="instance", - tags=get_default_instance_tags( - self.stack_name, self._config, compute_resource, "Compute", self._shared_storage_infos - ) - + [CfnTag(key=PCLUSTER_QUEUE_NAME_TAG, value=queue.name)] - + [CfnTag(key=PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, value=compute_resource.name)] - + get_custom_tags(self._config), - ), - ec2.CfnLaunchTemplate.TagSpecificationProperty( - resource_type="volume", - tags=get_default_volume_tags(self.stack_name, "Compute") - + [CfnTag(key=PCLUSTER_QUEUE_NAME_TAG, value=queue.name)] - + [CfnTag(key=PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, value=compute_resource.name)] - + get_custom_tags(self._config), - ), - ], - **conditional_template_properties, - ), - ) diff --git a/cli/src/pcluster/templates/compute_fleet_stack.py b/cli/src/pcluster/templates/compute_fleet_stack.py new file mode 100644 index 0000000000..e9ac652960 
--- /dev/null +++ b/cli/src/pcluster/templates/compute_fleet_stack.py @@ -0,0 +1,233 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=too-many-lines + +# +# This module contains all the classes required to convert a Cluster into a CFN template by using CDK. +# +from typing import Dict, List + +from aws_cdk import aws_ec2 as ec2 +from aws_cdk import aws_iam as iam +from aws_cdk import aws_lambda as awslambda +from aws_cdk import aws_logs as logs +from aws_cdk.core import CfnCustomResource, CfnResource, Construct, Stack + +from pcluster.config.cluster_config import SlurmClusterConfig +from pcluster.constants import ( + MAX_COMPUTE_RESOURCES_PER_DEPLOYMENT_WAVE, + MAX_COMPUTE_RESOURCES_PER_QUEUE, + PCLUSTER_CLUSTER_NAME_TAG, +) +from pcluster.templates.queue_group_stack import QueueGroupStack +from pcluster.templates.slurm_builder import SlurmConstruct +from pcluster.utils import LOGGER, batch_by_property_callback + + +class QueueBatchConstruct(Construct): + """ + CDK Construct for the batch of Groups of Queue Stacks. + + This prevents exceeding the AWS API Request Limit. + """ + + def __init__( + self, + scope: Construct, + id: str, + queue_cohort, + cluster_config: SlurmClusterConfig, + log_group: logs.CfnLogGroup, + shared_storage_infos: Dict, + shared_storage_mount_dirs: Dict, + shared_storage_attributes: Dict, + cluster_hosted_zone, + dynamodb_table, + head_eni, + slurm_construct: SlurmConstruct, + compute_security_group, + ): + super().__init__(scope, id) + self._config = cluster_config + self._shared_storage_infos = shared_storage_infos + self._shared_storage_mount_dirs = shared_storage_mount_dirs + self._shared_storage_attributes = shared_storage_attributes + self._log_group = log_group + self._cluster_hosted_zone = cluster_hosted_zone + self._dynamodb_table = dynamodb_table + self._head_eni = head_eni + self._slurm_construct = slurm_construct + self._compute_security_group = compute_security_group + + self.compute_fleet_launch_templates = {} + self.managed_compute_fleet_instance_roles = {} + self.managed_compute_fleet_placement_groups = {} + self.queue_cohort = queue_cohort + self.queue_group_stacks = [] + self._add_resources() + + def _add_resources(self): + queue_groups = batch_by_property_callback( + self._config.scheduling.queues, + lambda q: len(q.compute_resources), + MAX_COMPUTE_RESOURCES_PER_QUEUE, + ) + for group_index, queue_group in enumerate(queue_groups): + LOGGER.info(f"QueueGroup{group_index}: {[queue.name for queue in queue_group]}") + queue_group_stack = QueueGroupStack( + scope=self, + id=f"QueueGroup{group_index}", + queues=queue_group, + cluster_config=self._config, + log_group=self._log_group, + shared_storage_infos=self._shared_storage_infos, + shared_storage_mount_dirs=self._shared_storage_mount_dirs, + shared_storage_attributes=self._shared_storage_attributes, + cluster_hosted_zone=self._cluster_hosted_zone, + dynamodb_table=self._dynamodb_table, + head_eni=self._head_eni, + slurm_construct=self._slurm_construct, + 
compute_security_group=self._compute_security_group, + ) + self.managed_compute_fleet_instance_roles.update(queue_group_stack.managed_compute_instance_roles) + self.compute_fleet_launch_templates.update(queue_group_stack.compute_launch_templates) + self.managed_compute_fleet_placement_groups.update(queue_group_stack.managed_placement_groups) + self.queue_group_stacks.append(queue_group_stack) + + +class ComputeFleetConstruct(Construct): + """Construct defining compute fleet specific resources.""" + + def __init__( + self, + scope: Construct, + id: str, + cluster_config: SlurmClusterConfig, + log_group: logs.CfnLogGroup, + cleanup_lambda: awslambda.CfnFunction, + cleanup_lambda_role: iam.CfnRole, + compute_security_group: ec2.CfnSecurityGroup, + shared_storage_infos: Dict, + shared_storage_mount_dirs: Dict, + shared_storage_attributes: Dict, + cluster_hosted_zone, + dynamodb_table, + head_eni, + slurm_construct: SlurmConstruct, + ): + super().__init__(scope, id) + self._cleanup_lambda = cleanup_lambda + self._cleanup_lambda_role = cleanup_lambda_role + self._compute_security_group = compute_security_group + self._config = cluster_config + self._shared_storage_infos = shared_storage_infos + self._shared_storage_mount_dirs = shared_storage_mount_dirs + self._shared_storage_attributes = shared_storage_attributes + self._log_group = log_group + self._cluster_hosted_zone = cluster_hosted_zone + self._dynamodb_table = dynamodb_table + self._head_eni = head_eni + self._slurm_construct = slurm_construct + + self.launch_templates = {} + self.managed_compute_fleet_instance_roles = {} + self.managed_compute_fleet_placement_groups = {} + + self._add_resources() + + # -- Utility methods --------------------------------------------------------------------------------------------- # + + @property + def stack_name(self): + """Name of the CFN stack.""" + return Stack.of(self).stack_name + + @property + def managed_compute_instance_roles(self) -> Dict[str, iam.Role]: + """Mapping of each queue and the IAM role associated with its compute resources.""" + return self.managed_compute_fleet_instance_roles + + def _add_resources(self): + queue_batches = batch_by_property_callback( + self._config.scheduling.queues, + lambda q: len(q.compute_resources), + MAX_COMPUTE_RESOURCES_PER_DEPLOYMENT_WAVE, + ) + + queue_deployment_groups = [] + for batch_index, queue_batch in enumerate(queue_batches): + queue_deployment_groups.append( + QueueBatchConstruct( + scope=self, + id=f"QueueBatch{batch_index}", + queue_cohort=queue_batch, + cluster_config=self._config, + log_group=self._log_group, + shared_storage_infos=self._shared_storage_infos, + shared_storage_mount_dirs=self._shared_storage_mount_dirs, + shared_storage_attributes=self._shared_storage_attributes, + cluster_hosted_zone=self._cluster_hosted_zone, + dynamodb_table=self._dynamodb_table, + head_eni=self._head_eni, + slurm_construct=self._slurm_construct, + compute_security_group=self._compute_security_group, + ) + ) + + for group_index, queue_deployment_group in enumerate(queue_deployment_groups): + self.managed_compute_fleet_instance_roles.update( + queue_deployment_group.managed_compute_fleet_instance_roles + ) + self.launch_templates.update(queue_deployment_group.compute_fleet_launch_templates) + self.managed_compute_fleet_placement_groups.update( + queue_deployment_group.managed_compute_fleet_placement_groups + ) + # Make each deployment group dependent on the previous deployment group, this way the stack creation + # of all compute fleet resources will not 
happen concurrently (avoiding throttling) + if group_index < len(queue_deployment_groups) - 1: + queue_deployment_groups[group_index + 1].node.add_dependency(queue_deployment_groups[group_index]) + + custom_resource_deps = list(self.managed_compute_fleet_placement_groups.values()) + if self._compute_security_group: + custom_resource_deps.append(self._compute_security_group) + self._add_cleanup_custom_resource(dependencies=custom_resource_deps) + + def _add_cleanup_custom_resource(self, dependencies: List[CfnResource]): + terminate_compute_fleet_custom_resource = CfnCustomResource( + self, + "TerminateComputeFleetCustomResource", + service_token=self._cleanup_lambda.attr_arn, + ) + terminate_compute_fleet_custom_resource.add_property_override("StackName", self.stack_name) + terminate_compute_fleet_custom_resource.add_property_override("Action", "TERMINATE_EC2_INSTANCES") + for dep in dependencies: + terminate_compute_fleet_custom_resource.add_depends_on(dep) + + if self._cleanup_lambda_role: + self._add_policies_to_cleanup_resources_lambda_role() + + def _add_policies_to_cleanup_resources_lambda_role(self): + self._cleanup_lambda_role.policies[0].policy_document.add_statements( + iam.PolicyStatement( + actions=["ec2:DescribeInstances"], + resources=["*"], + effect=iam.Effect.ALLOW, + sid="DescribeInstances", + ), + iam.PolicyStatement( + actions=["ec2:TerminateInstances"], + resources=["*"], + effect=iam.Effect.ALLOW, + conditions={"StringEquals": {f"ec2:ResourceTag/{PCLUSTER_CLUSTER_NAME_TAG}": self.stack_name}}, + sid="FleetTerminatePolicy", + ), + ) diff --git a/cli/src/pcluster/templates/cw_dashboard_builder.py b/cli/src/pcluster/templates/cw_dashboard_builder.py index 92eac1ef85..3601cae21b 100644 --- a/cli/src/pcluster/templates/cw_dashboard_builder.py +++ b/cli/src/pcluster/templates/cw_dashboard_builder.py @@ -9,10 +9,12 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. 
from collections import defaultdict, namedtuple +from typing import Iterable from aws_cdk import aws_cloudwatch as cloudwatch from aws_cdk import aws_ec2 as ec2 -from aws_cdk.core import Construct, Stack +from aws_cdk import aws_logs as logs +from aws_cdk.core import Construct, Duration, Stack from pcluster.config.cluster_config import BaseClusterConfig, SharedFsxLustre, SharedStorageType @@ -30,11 +32,19 @@ def __init__(self, x_value: int, y_value: int): _PclusterMetric = namedtuple( "_PclusterMetric", ["title", "metrics", "supported_vol_types", "namespace", "additional_dimensions"] ) +_CustomMetricFilter = namedtuple( + "_CustomMetricFilter", + ["metric_name", "filter_pattern", "metric_value", "metric_statistic", "metric_unit"], + defaults=("Sum", "Count"), +) _Filter = namedtuple("new_filter", ["pattern", "param"]) _CWLogWidget = namedtuple( "_CWLogWidget", ["title", "conditions", "fields", "filters", "sort", "limit"], ) +_HealthMetric = namedtuple( + "_ErrorMetric", ["title", "metric_filters", "left_y_axis", "left_annotations"], defaults=(None, None) +) def new_pcluster_metric(title=None, metrics=None, supported_vol_types=None, namespace=None, additional_dimensions=None): @@ -53,6 +63,7 @@ def __init__( head_node_instance: ec2.CfnInstance, shared_storage_infos: dict, cw_log_group_name: str, + cw_log_group: logs.CfnLogGroup, ): super().__init__(scope, id) self.stack_scope = scope @@ -61,6 +72,7 @@ def __init__( self.head_node_instance = head_node_instance self.shared_storage_infos = shared_storage_infos self.cw_log_group_name = cw_log_group_name + self.cw_log_group = cw_log_group self.dashboard_name = self.stack_name + "-" + self._stack_region self.coord = Coord(x_value=0, y_value=0) @@ -133,8 +145,10 @@ def _add_resources(self): if len(self.shared_storage_infos[SharedStorageType.FSX]) > 0: self._add_fsx_metrics_graphs() - # Head Node logs, if CW Logs are enabled + # Head Node logs add custom metrics if cw_log and metrics are enabled if self.config.is_cw_logging_enabled: + if self.config.scheduling.scheduler == "slurm": + self._add_custom_health_metrics() self._add_cw_log() def _update_coord(self, d_x, d_y): @@ -172,7 +186,7 @@ def _add_text_widget(self, markdown): self.cloudwatch_dashboard.add_widgets(text_widget) self._update_coord_after_section(d_y=1) - def _generate_graph_widget(self, title, metric_list): + def _generate_graph_widget(self, title, metric_list, **widget_kwargs): """Generate a graph widget and update the coordinates.""" widget = cloudwatch.GraphWidget( title=title, @@ -180,6 +194,7 @@ def _generate_graph_widget(self, title, metric_list): region=self._stack_region, width=self.graph_width, height=self.graph_height, + **widget_kwargs, ) widget.position(x=self.coord.x_value, y=self.coord.y_value) self._update_coord(self.graph_width, self.graph_height) @@ -222,6 +237,196 @@ def _add_conditional_storage_widgets( widgets_list.append(graph_widget) return widgets_list + def _add_custom_pcluster_metric_filter( + self, metric_name, filter_pattern, custom_namespace, metric_value, metric_unit=None + ): + """Adding custom metric filter from named tuple.""" + metric_filter = logs.CfnMetricFilter( + scope=self.stack_scope, + id=metric_name + " Filter", + filter_pattern=filter_pattern, + log_group_name=self.cw_log_group_name, + metric_transformations=[ + logs.CfnMetricFilter.MetricTransformationProperty( + metric_namespace=custom_namespace, + metric_name=metric_name, + metric_value=metric_value, + unit=metric_unit, + dimensions=[ + logs.CfnMetricFilter.DimensionProperty( + 
key="ClusterName", + value="$.cluster-name", + ), + ], + ) + ], + ) + metric_filter.add_depends_on(self.cw_log_group) + return metric_filter + + def _add_custom_health_metrics(self): + """Create custom health metric filters and outputs to cloudwatch graph.""" + + def _generate_metric_filter_pattern(event_type, failure_type=None): + if failure_type: + return ( + f"{{ $.event-type = {event_type} && $.detail.failure-type = {failure_type} && " + '$.scheduler = "slurm" }' + ) + else: + return f'{{ $.event-type = {event_type} && $.scheduler = "slurm" }}' + + metric_value = "$.detail.count" + launch_failure_event_type = "node-launch-failure-count" + jobs_not_starting_errors = [ + _CustomMetricFilter( + metric_name="IamPolicyErrors", + filter_pattern=_generate_metric_filter_pattern(launch_failure_event_type, "iam-policy-errors"), + metric_value=metric_value, + ), + _CustomMetricFilter( + metric_name="VcpuLimitErrors", + filter_pattern=_generate_metric_filter_pattern(launch_failure_event_type, "vcpu-limit-failures"), + metric_value=metric_value, + ), + _CustomMetricFilter( + metric_name="VolumeLimitErrors", + filter_pattern=_generate_metric_filter_pattern(launch_failure_event_type, "volume-limit-failures"), + metric_value=metric_value, + ), + _CustomMetricFilter( + metric_name="InsufficientCapacityErrors", + filter_pattern=_generate_metric_filter_pattern(launch_failure_event_type, "ice-failures"), + metric_value=metric_value, + ), + _CustomMetricFilter( + metric_name="OtherInstanceLaunchFailures", + filter_pattern=_generate_metric_filter_pattern(launch_failure_event_type, "other-failures"), + metric_value=metric_value, + ), + ] + + compute_node_events = [ + _CustomMetricFilter( + metric_name="InstanceBootstrapTimeoutErrors", + filter_pattern='{ $.event-type = "protected-mode-error-count" && ' + '($.detail.failure-type = "static-replacement-timeout-error" || ' + '$.detail.failure-type = "dynamic-resume-timeout-error" ) && ' + '$.scheduler = "slurm" }', + metric_value=metric_value, + ), + _CustomMetricFilter( + metric_name="EC2HealthCheckErrors", + filter_pattern=_generate_metric_filter_pattern("nodes-failing-health-check-count", "ec2_health_check"), + metric_value=metric_value, + ), + _CustomMetricFilter( + metric_name="ScheduledEventHealthCheckErrors", + filter_pattern=_generate_metric_filter_pattern( + "nodes-failing-health-check-count", "scheduled_event_health_check" + ), + metric_value=metric_value, + ), + _CustomMetricFilter( + metric_name="NoCorrespondingInstanceErrors", + filter_pattern=_generate_metric_filter_pattern("invalid-backing-instance-count"), + metric_value=metric_value, + ), + # Use text matching here because it comes from slurmctld.log + _CustomMetricFilter( + metric_name="SlurmNodeNotRespondingErrors", + filter_pattern=_generate_metric_filter_pattern("node-not-responding-down-count"), + metric_value=metric_value, + ), + ] + + cluster_health_metrics = [ + _HealthMetric( + "Instance Provisioning Errors", + jobs_not_starting_errors, + left_y_axis=cloudwatch.YAxisProps(min=0.0), + ), + _HealthMetric( + "Unhealthy Instance Errors", + compute_node_events, + left_y_axis=cloudwatch.YAxisProps(min=0.0), + ), + ] + if self.config.has_custom_actions_in_queue: + custom_action_errors = [ + _CustomMetricFilter( + metric_name="OnNodeStartDownloadErrors", + filter_pattern='{ $.event-type = "custom-action-error" && $.scheduler = "slurm" && ' + '$.detail.action = "OnNodeStart" && $.detail.stage = "downloading"}', + metric_value="1", + ), + _CustomMetricFilter( + metric_name="OnNodeStartRunErrors", 
+ filter_pattern='{ $.event-type = "custom-action-error" && $.scheduler = "slurm" && ' + '$.detail.action = "OnNodeStart" && $.detail.stage = "executing"}', + metric_value="1", + ), + _CustomMetricFilter( + metric_name="OnNodeConfiguredDownloadErrors", + filter_pattern='{ $.event-type = "custom-action-error" && $.scheduler = "slurm" && ' + '$.detail.action = "OnNodeConfigured" && $.detail.stage = "downloading"}', + metric_value="1", + ), + _CustomMetricFilter( + metric_name="OnNodeConfiguredRunErrors", + filter_pattern='{ $.event-type = "custom-action-error" && $.scheduler = "slurm" && ' + '$.detail.action = "OnNodeConfigured" && $.detail.stage = "executing"}', + metric_value="1", + ), + ] + + cluster_health_metrics.append( + _HealthMetric( + "Custom Action Errors", + custom_action_errors, + left_y_axis=cloudwatch.YAxisProps(min=0.0), + ) + ) + + cluster_health_metrics.append( + _HealthMetric( + "Compute Fleet Idle Time", + [ + _CustomMetricFilter( + metric_name="MaxDynamicNodeIdleTime", + filter_pattern='{ $.event-type = "compute-node-idle-time" && $.scheduler = "slurm" && ' + '$.detail.node-type = "dynamic"}', + metric_value="$.detail.longest-idle-time", + metric_statistic="max", + metric_unit="Seconds", + ), + ], + left_y_axis=cloudwatch.YAxisProps(min=0.0), + left_annotations=[ + cloudwatch.HorizontalAnnotation( + value=self.config.scheduling.settings.scaledown_idletime * 60, + color=cloudwatch.Color.GREEN, + fill=cloudwatch.Shading.BELOW, + visible=True, + ), + cloudwatch.HorizontalAnnotation( + value=self.config.scheduling.settings.scaledown_idletime * 60, + label="Idle Time Scaledown", + color=cloudwatch.Color.BLUE, + fill=cloudwatch.Shading.ABOVE, + visible=True, + ), + ], + ) + ) + + self._add_text_widget("# Cluster Health Metrics") + self._add_health_metrics_graph_widgets(cluster_health_metrics) + self._add_text_widget( + "General [Troubleshooting Resources]" + "(https://docs.aws.amazon.com/parallelcluster/latest/ug/troubleshooting.html)" + ) + def _add_storage_widgets(self, metrics, storages_list, namespace, dimension_name): widgets_list = [] for metrics_param in metrics: @@ -400,7 +605,7 @@ def _add_fsx_metrics_graphs(self): def _add_cw_log(self): # Create a text widget for subtitle "Head Node Logs" - self._add_text_widget("## Head Node Logs") + self._add_text_widget("# Head Node Logs") dcv_enabled = self.config.is_dcv_enabled scheduler = self.config.scheduling.scheduler @@ -556,3 +761,36 @@ def _new_filter(self, pattern=None, param=None): if param is None: param = "@logStream" return _Filter(pattern, param) + + def _add_health_metrics_graph_widgets(self, cluster_health_metrics: Iterable[_HealthMetric]): + """Add cluster health metrics graph widgets.""" + custom_namespace = "ParallelCluster" + widgets_list = [] + for health_metric in cluster_health_metrics: + metric_list = [] + for new_filter in health_metric.metric_filters: + self._add_custom_pcluster_metric_filter( + metric_name=new_filter.metric_name, + filter_pattern=new_filter.filter_pattern, + custom_namespace=custom_namespace, + metric_value=new_filter.metric_value, + metric_unit=new_filter.metric_unit, + ) + cloudwatch_metric = cloudwatch.Metric( + namespace=custom_namespace, + metric_name=new_filter.metric_name, + period=Duration.minutes(1), + statistic=new_filter.metric_statistic, + dimensions_map={"ClusterName": self.config.cluster_name}, + ) + metric_list.append(cloudwatch_metric) + graph_widget = self._generate_graph_widget( + health_metric.title, + metric_list, + left_y_axis=health_metric.left_y_axis, + 
left_annotations=health_metric.left_annotations, + ) + widgets_list.append(graph_widget) + + self.cloudwatch_dashboard.add_widgets(*widgets_list) + self._update_coord_after_section(self.graph_height) diff --git a/cli/src/pcluster/templates/imagebuilder_stack.py b/cli/src/pcluster/templates/imagebuilder_stack.py index 233fb555a3..78d90dd888 100644 --- a/cli/src/pcluster/templates/imagebuilder_stack.py +++ b/cli/src/pcluster/templates/imagebuilder_stack.py @@ -577,6 +577,7 @@ def _add_imagebuilder_infrastructure_configuration( "InfrastructureConfiguration", name=self._build_resource_name(IMAGEBUILDER_RESOURCE_NAME_PREFIX), tags=build_tags, + resource_tags=build_tags, instance_profile_name=instance_profile_name or Fn.ref("InstanceProfile"), terminate_instance_on_failure=self.config.dev_settings.terminate_instance_on_failure if self.config.dev_settings and self.config.dev_settings.terminate_instance_on_failure is not None diff --git a/cli/src/pcluster/templates/queue_group_stack.py b/cli/src/pcluster/templates/queue_group_stack.py new file mode 100644 index 0000000000..19a95694c1 --- /dev/null +++ b/cli/src/pcluster/templates/queue_group_stack.py @@ -0,0 +1,330 @@ +from typing import Dict, List + +from aws_cdk import aws_ec2 as ec2 +from aws_cdk import aws_logs as logs +from aws_cdk.core import CfnTag, Fn, NestedStack, Stack +from constructs import Construct + +from pcluster.config.cluster_config import ( + SchedulerPluginQueue, + SharedStorageType, + SlurmClusterConfig, + SlurmComputeResource, + SlurmQueue, +) +from pcluster.constants import ( + DEFAULT_EPHEMERAL_DIR, + NODE_BOOTSTRAP_TIMEOUT, + OS_MAPPING, + PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, + PCLUSTER_QUEUE_NAME_TAG, +) +from pcluster.templates.cdk_builder_utils import ( + CdkLaunchTemplateBuilder, + ComputeNodeIamResources, + create_hash_suffix, + dict_to_cfn_tags, + get_common_user_data_env, + get_custom_tags, + get_default_instance_tags, + get_default_volume_tags, + get_queue_security_groups_full, + get_shared_storage_ids_by_type, + get_user_data_content, + scheduler_is_slurm, + to_comma_separated_string, +) +from pcluster.templates.slurm_builder import SlurmConstruct +from pcluster.utils import get_attr, get_http_tokens_setting + + +class QueueGroupStack(NestedStack): + """Stack encapsulating a set of queues and the associated resources.""" + + def __init__( + self, + scope: Construct, + id: str, + queues: List[SlurmQueue], + slurm_construct: SlurmConstruct, + cluster_config: SlurmClusterConfig, + log_group: logs.CfnLogGroup, + shared_storage_infos: Dict, + shared_storage_mount_dirs: Dict, + shared_storage_attributes: Dict, + compute_security_group, + cluster_hosted_zone, + dynamodb_table, + head_eni, + ): + super().__init__(scope, id) + self._queues = queues + self._slurm_construct = slurm_construct + self._config = cluster_config + self._shared_storage_infos = shared_storage_infos + self._shared_storage_mount_dirs = shared_storage_mount_dirs + self._shared_storage_attributes = shared_storage_attributes + self._compute_security_group = compute_security_group + self._log_group = log_group + self._cluster_hosted_zone = cluster_hosted_zone + self._dynamodb_table = dynamodb_table + self._head_eni = head_eni + self._launch_template_builder = CdkLaunchTemplateBuilder() + self._add_resources() + + @staticmethod + def _get_placement_group_for_compute_resource(queue, managed_placement_groups, compute_resource) -> str: + placement_group_settings = queue.get_placement_group_settings_for_compute_resource(compute_resource) + placement_group_key = 
placement_group_settings.get("key")
+        managed = placement_group_settings.get("is_managed")
+        return managed_placement_groups[placement_group_key].ref if managed else placement_group_key
+
+    @property
+    def stack_name(self):
+        """Name of the CFN stack."""
+        return Stack.of(self.nested_stack_parent).stack_name
+
+    def _add_resources(self):
+        self._add_compute_iam_resources()
+        self._add_placement_groups()
+        self._add_launch_templates()
+
+    def _add_placement_groups(self):
+        self.managed_placement_groups = {}
+        for queue in self._queues:
+            for key in queue.get_managed_placement_group_keys():
+                self.managed_placement_groups[key] = ec2.CfnPlacementGroup(
+                    self,
+                    f"PlacementGroup{create_hash_suffix(key)}",
+                    strategy="cluster",
+                )
+
+    def _add_compute_iam_resources(self):
+        iam_resources = {}
+        for queue in self._queues:
+            iam_resources[queue.name] = ComputeNodeIamResources(
+                self,
+                f"ComputeNodeIamResources{queue.name}",
+                self._config,
+                queue,
+                self._shared_storage_infos,
+                queue.name,
+            )
+        self._compute_instance_profiles = {k: v.instance_profile for k, v in iam_resources.items()}
+        self.managed_compute_instance_roles = {k: v.instance_role for k, v in iam_resources.items()}
+        if scheduler_is_slurm(self._config):
+            self._slurm_construct.register_policies_with_role(
+                scope=Stack.of(self),
+                managed_compute_instance_roles=self.managed_compute_instance_roles,
+            )
+
+    def _add_launch_templates(self):
+        self.compute_launch_templates = {}
+        for queue in self._queues:
+            self.compute_launch_templates[queue.name] = {}
+            queue_lt_security_groups = get_queue_security_groups_full(self._compute_security_group, queue)
+
+            for resource in queue.compute_resources:
+                self.compute_launch_templates[queue.name][resource.name] = self._add_compute_resource_launch_template(
+                    queue,
+                    resource,
+                    queue_lt_security_groups,
+                    self._get_placement_group_for_compute_resource(queue, self.managed_placement_groups, resource),
+                    self._compute_instance_profiles,
+                    self._config.is_detailed_monitoring_enabled,
+                )
+
+    def _get_custom_compute_resource_tags(self, queue_config, compute_resource_config):
+        """Merge cluster, queue and compute resource tags; the most specific level wins on duplicated keys."""
+        tags = get_custom_tags(self._config, raw_dict=True)
+        queue_tags = get_custom_tags(queue_config, raw_dict=True)
+        compute_resource_tags = get_custom_tags(compute_resource_config, raw_dict=True)
+        return dict_to_cfn_tags({**tags, **queue_tags, **compute_resource_tags})
+
+    def _add_compute_resource_launch_template(
+        self,
+        queue,
+        compute_resource,
+        queue_lt_security_groups,
+        placement_group,
+        instance_profiles,
+        is_detailed_monitoring_enabled,
+    ):
+        # LT network interfaces
+        compute_lt_nw_interfaces = [
+            ec2.CfnLaunchTemplate.NetworkInterfaceProperty(
+                device_index=0,
+                associate_public_ip_address=queue.networking.assign_public_ip
+                if compute_resource.max_network_interface_count == 1
+                else None,  # parameter not supported for instance types with multiple network interfaces
+                interface_type="efa" if compute_resource.efa and compute_resource.efa.enabled else None,
+                groups=queue_lt_security_groups,
+                subnet_id=queue.networking.subnet_ids[0]
+                if isinstance(compute_resource, SlurmComputeResource)
+                else None,
+            )
+        ]
+
+        for network_interface_index in range(1, compute_resource.max_network_interface_count):
+            compute_lt_nw_interfaces.append(
+                ec2.CfnLaunchTemplate.NetworkInterfaceProperty(
+                    device_index=0,
+                    network_card_index=network_interface_index,
+                    interface_type="efa" if compute_resource.efa and
compute_resource.efa.enabled else None, + groups=queue_lt_security_groups, + subnet_id=queue.networking.subnet_ids[0] + if isinstance(compute_resource, SlurmComputeResource) + else None, + ) + ) + + conditional_template_properties = {} + if compute_resource.is_ebs_optimized: + conditional_template_properties.update({"ebs_optimized": True}) + if isinstance(compute_resource, SlurmComputeResource): + conditional_template_properties.update({"instance_type": compute_resource.instance_type}) + + return ec2.CfnLaunchTemplate( + self, + f"LaunchTemplate{create_hash_suffix(queue.name + compute_resource.name)}", + launch_template_name=f"{self.stack_name}-{queue.name}-{compute_resource.name}", + launch_template_data=ec2.CfnLaunchTemplate.LaunchTemplateDataProperty( + block_device_mappings=self._launch_template_builder.get_block_device_mappings( + queue.compute_settings.local_storage.root_volume, self._config.image.os + ), + # key_name=, + network_interfaces=compute_lt_nw_interfaces, + placement=ec2.CfnLaunchTemplate.PlacementProperty(group_name=placement_group), + image_id=self._config.image_dict[queue.name], + iam_instance_profile=ec2.CfnLaunchTemplate.IamInstanceProfileProperty( + name=instance_profiles[queue.name] + ), + instance_market_options=self._launch_template_builder.get_instance_market_options( + queue, compute_resource + ), + instance_initiated_shutdown_behavior="terminate", + capacity_reservation_specification=self._launch_template_builder.get_capacity_reservation( + queue, + compute_resource, + ), + metadata_options=ec2.CfnLaunchTemplate.MetadataOptionsProperty( + http_tokens=get_http_tokens_setting(self._config.imds.imds_support) + ), + user_data=Fn.base64( + Fn.sub( + get_user_data_content("../resources/compute_node/user_data.sh"), + { + **{ + "EnableEfa": "efa" if compute_resource.efa and compute_resource.efa.enabled else "NONE", + "RAIDSharedDir": to_comma_separated_string( + self._shared_storage_mount_dirs[SharedStorageType.RAID] + ), + "RAIDType": to_comma_separated_string( + self._shared_storage_attributes[SharedStorageType.RAID]["Type"] + ), + "DisableMultiThreadingManually": "true" + if compute_resource.disable_simultaneous_multithreading_manually + else "false", + "BaseOS": self._config.image.os, + "EFSIds": get_shared_storage_ids_by_type( + self._shared_storage_infos, SharedStorageType.EFS + ), + "EFSSharedDirs": to_comma_separated_string( + self._shared_storage_mount_dirs[SharedStorageType.EFS] + ), + "EFSEncryptionInTransits": to_comma_separated_string( + self._shared_storage_attributes[SharedStorageType.EFS]["EncryptionInTransits"], + use_lower_case=True, + ), + "EFSIamAuthorizations": to_comma_separated_string( + self._shared_storage_attributes[SharedStorageType.EFS]["IamAuthorizations"], + use_lower_case=True, + ), + "FSXIds": get_shared_storage_ids_by_type( + self._shared_storage_infos, SharedStorageType.FSX + ), + "FSXMountNames": to_comma_separated_string( + self._shared_storage_attributes[SharedStorageType.FSX]["MountNames"] + ), + "FSXDNSNames": to_comma_separated_string( + self._shared_storage_attributes[SharedStorageType.FSX]["DNSNames"] + ), + "FSXVolumeJunctionPaths": to_comma_separated_string( + self._shared_storage_attributes[SharedStorageType.FSX]["VolumeJunctionPaths"] + ), + "FSXFileSystemTypes": to_comma_separated_string( + self._shared_storage_attributes[SharedStorageType.FSX]["FileSystemTypes"] + ), + "FSXSharedDirs": to_comma_separated_string( + self._shared_storage_mount_dirs[SharedStorageType.FSX] + ), + "Scheduler": self._config.scheduling.scheduler, 
+ "EphemeralDir": queue.compute_settings.local_storage.ephemeral_volume.mount_dir + if isinstance(queue, (SlurmQueue, SchedulerPluginQueue)) + and queue.compute_settings.local_storage.ephemeral_volume + else DEFAULT_EPHEMERAL_DIR, + "EbsSharedDirs": to_comma_separated_string( + self._shared_storage_mount_dirs[SharedStorageType.EBS] + ), + "ClusterDNSDomain": str(self._cluster_hosted_zone.name) + if self._cluster_hosted_zone + else "", + "ClusterHostedZone": str(self._cluster_hosted_zone.ref) + if self._cluster_hosted_zone + else "", + "OSUser": OS_MAPPING[self._config.image.os]["user"], + "ClusterName": self.stack_name, + "SlurmDynamoDBTable": self._dynamodb_table.ref if self._dynamodb_table else "NONE", + "LogGroupName": self._log_group.log_group_name + if self._config.monitoring.logs.cloud_watch.enabled + else "NONE", + "IntelHPCPlatform": "true" if self._config.is_intel_hpc_platform_enabled else "false", + "CWLoggingEnabled": "true" if self._config.is_cw_logging_enabled else "false", + "LogRotationEnabled": "true" if self._config.is_log_rotation_enabled else "false", + "QueueName": queue.name, + "ComputeResourceName": compute_resource.name, + "EnableEfaGdr": "compute" + if compute_resource.efa and compute_resource.efa.gdr_support + else "NONE", + "CustomNodePackage": self._config.custom_node_package or "", + "CustomAwsBatchCliPackage": self._config.custom_aws_batch_cli_package or "", + "ExtraJson": self._config.extra_chef_attributes, + "UsePrivateHostname": str( + get_attr(self._config, "scheduling.settings.dns.use_ec2_hostnames", default=False) + ).lower(), + "HeadNodePrivateIp": self._head_eni.attr_primary_private_ip_address, + "DirectoryServiceEnabled": str(self._config.directory_service is not None).lower(), + "Timeout": str( + get_attr( + self._config, + "dev_settings.timeouts.compute_node_bootstrap_timeout", + NODE_BOOTSTRAP_TIMEOUT, + ) + ), + }, + **get_common_user_data_env(queue, self._config), + }, + ) + ), + monitoring=ec2.CfnLaunchTemplate.MonitoringProperty(enabled=is_detailed_monitoring_enabled), + tag_specifications=[ + ec2.CfnLaunchTemplate.TagSpecificationProperty( + resource_type="instance", + tags=get_default_instance_tags( + self.stack_name, self._config, compute_resource, "Compute", self._shared_storage_infos + ) + + [CfnTag(key=PCLUSTER_QUEUE_NAME_TAG, value=queue.name)] + + [CfnTag(key=PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, value=compute_resource.name)] + + self._get_custom_compute_resource_tags(queue, compute_resource), + ), + ec2.CfnLaunchTemplate.TagSpecificationProperty( + resource_type="volume", + tags=get_default_volume_tags(self.stack_name, "Compute") + + [CfnTag(key=PCLUSTER_QUEUE_NAME_TAG, value=queue.name)] + + [CfnTag(key=PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, value=compute_resource.name)] + + self._get_custom_compute_resource_tags(queue, compute_resource), + ), + ], + **conditional_template_properties, + ), + ) diff --git a/cli/src/pcluster/utils.py b/cli/src/pcluster/utils.py index 01d2d40ef9..c59e2e6298 100644 --- a/cli/src/pcluster/utils.py +++ b/cli/src/pcluster/utils.py @@ -8,7 +8,9 @@ # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. 
+import asyncio
 import datetime
+import functools
 import itertools
 import json
 import logging
@@ -19,9 +21,10 @@
 import sys
 import time
 import zipfile
+from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
 from shlex import quote
-from typing import NoReturn
+from typing import Callable, NoReturn
 from urllib.parse import urlparse
 
 import dateutil.parser
@@ -80,6 +83,12 @@ def get_docs_base_url(partition: str = None):
     return DOCS_URL_MAP.get(_partition, DEFAULT_DOCS_URL)
 
 
+def get_service_endpoint(service: str, region: str):
+    partition = get_partition(region)
+    domain = get_url_domain_suffix(partition)
+    return f"https://{service}.{region}.{domain}"
+
+
 def replace_url_parameters(url):
     """Replace ${Region} and ${URLSuffix} in url."""
     return url.replace("${Region}", get_region()).replace("${URLSuffix}", get_url_domain_suffix())
@@ -402,3 +411,128 @@ def get_http_tokens_setting(imds_support):
 def remove_none_values(original_dictionary):
     """Return a dictionary without entries with None value."""
     return {key: value for key, value in original_dictionary.items() if value is not None}
+
+
+def batch_by_property_callback(items, property_callback: Callable[..., int], batch_size):
+    """
+    Group a list of items into batches based on a property of each item and the specified `batch_size`.
+
+    The property of each item is obtained using the property_callback function. This way the caller of
+    `batch_by_property_callback` defines which property to use for each item.
+    Example: (With batch_size of 2 and property_callback=lambda item: len(item.property))
+    [
+        Item(property=["test-1", "test-2"]),
+        Item(property=["test-3"]),
+        Item(property=["test-4"]),
+    ] --> [ [Item(property=["test-1", "test-2"])], [Item(property=["test-3"]), Item(property=["test-4"])] ]
+    :param items: list of items to organize into batches
+    :param property_callback: a callback function that returns the property (size) to use for batching
+    :param batch_size: maximum size of each batch
+    :return: batches of items as a list
+    """
+    # Avoid batching if the total property count is already less than the batch_size
+    if sum(property_callback(item) for item in items) < batch_size:
+        yield items
+        return
+
+    batch_total_property_value, current_batch = 0, []
+    for item_index, item in enumerate(items):
+        property_value = property_callback(item)
+        if property_callback(item) > batch_size:
+            raise ValueError(
+                f"{item.__class__} property callback value of {property_value} is larger than "
+                f"the batch size ({batch_size})"
+            )
+
+        if batch_total_property_value + property_value > batch_size:
+            yield current_batch
+            batch_total_property_value, current_batch = property_value, [item]
+        else:
+            batch_total_property_value += property_value
+            current_batch.append(item)
+        if item_index == len(items) - 1:
+            # If on the last item, yield the current batch
+            yield current_batch
+
+
+class AsyncUtils:
+    """Utility class for async functions."""
+
+    @staticmethod
+    def async_retry(stop_max_attempt_number=5, wait_fixed=1, retry_on_exception=None):
+        """
+        Decorate an async coroutine function to retry its execution when an exception is raised.
+
+        :param stop_max_attempt_number: Max number of retries.
+        :param wait_fixed: Wait time (in seconds) between retries.
+        :param retry_on_exception: Exception to retry on.
+ :return: + """ + if retry_on_exception is None: + retry_on_exception = (Exception,) + + def decorator(func): + @functools.wraps(func) + async def wrapper(*args, **kwargs): + attempt = 0 + result = None + while attempt <= stop_max_attempt_number: + try: + result = await func(*args, **kwargs) + break + except retry_on_exception as err: + if attempt < stop_max_attempt_number: + attempt += 1 + await asyncio.sleep(wait_fixed) + else: + raise err + return result + + return wrapper + + return decorator + + @staticmethod + def async_timeout_cache(timeout=10): + """ + Decorate a function to cache the result of a call for a given time period. + + :param timeout: Timeout in seconds. + :return: + """ + + def decorator(func): + _cache = {} + + @functools.wraps(func) + async def wrapper(*args, **kwargs): + cache_key = (func.__name__, args[1:], frozenset(kwargs.items())) + current_time = time.time() + + if cache_key not in _cache or (_cache[cache_key][1] < current_time): + _cache[cache_key] = (asyncio.ensure_future(func(*args, **kwargs)), current_time + timeout) + + return await _cache[cache_key][0] + + return wrapper + + return decorator + + _thread_pool_executor: ThreadPoolExecutor = ThreadPoolExecutor() + + @staticmethod + def async_from_sync(func): + """ + Convert a synchronous function to an async one. + + :param func: + :return: + """ + + @functools.wraps(func) + async def wrapper(self, *args, **kwargs): + return await asyncio.get_event_loop().run_in_executor( + AsyncUtils._thread_pool_executor, lambda: func(self, *args, **kwargs) + ) + + return wrapper diff --git a/cli/src/pcluster/validators/cluster_validators.py b/cli/src/pcluster/validators/cluster_validators.py index 5c32b07907..51e92a60d4 100644 --- a/cli/src/pcluster/validators/cluster_validators.py +++ b/cli/src/pcluster/validators/cluster_validators.py @@ -1408,3 +1408,59 @@ def _capacity_reservation(self, cr_target): } ) } + + +class RootVolumeEncryptionConsistencyValidator(Validator): + """Verify consistency on the Encryption parameter of all the specified RootVolumes of the queues.""" + + def _validate(self, encryption_settings: list): + reference_queue_name, reference_root_volume_encryption = encryption_settings.pop(0) + for queue in encryption_settings: + queue_name, root_volume_encryption = queue + if reference_root_volume_encryption != root_volume_encryption: + self._add_failure( + f"The Encryption parameter of the root volume of the queue {queue_name} is not consistent " + f"with the value set for the queue {reference_queue_name}, and may cause a problem in case " + f"of Service Control Policies (SCPs) enforcing encryption.", + FailureLevel.WARNING, + ) + + +class MultiNetworkInterfacesInstancesValidator(Validator): + """Verify that queues with multi nic compute resources don't auto-assign public IPs or contain subnets that do.""" + + def _validate(self, queues): + multi_nic_queues = [ + queue + for queue in queues + for compute_resource in queue.compute_resources + if compute_resource.max_network_interface_count > 1 + ] + + all_subnets_with_public_ips = { + subnet.get("SubnetId") + for subnet in AWSApi.instance().ec2.describe_subnets( + {subnet_id for queue in multi_nic_queues for subnet_id in queue.networking.subnet_ids} + ) + if subnet.get("MapPublicIpOnLaunch") + } + + for queue in multi_nic_queues: + if queue.networking.assign_public_ip: + self._add_failure( + f"The queue {queue.name} contains an instance type with multiple network interfaces however the " + f"AssignPublicIp value is set to true. 
AWS public IPs can only be assigned to instances launched " + f"with a single network interface.", + FailureLevel.ERROR, + ) + + queue_subnets_with_public_ips = sorted( + [subnet_id for subnet_id in queue.networking.subnet_ids if subnet_id in all_subnets_with_public_ips] + ) + if queue_subnets_with_public_ips: + self._add_failure( + f"The queue {queue.name} contains an instance type with multiple network interfaces however the " + f"subnets {queue_subnets_with_public_ips} is configured to automatically assign public IPs. AWS " + f"public IPs can only be assigned to instances launched with a single network interface.", + FailureLevel.ERROR, + ) diff --git a/cli/src/pcluster/validators/common.py b/cli/src/pcluster/validators/common.py index 374d2d1714..ab9c37332e 100644 --- a/cli/src/pcluster/validators/common.py +++ b/cli/src/pcluster/validators/common.py @@ -12,9 +12,13 @@ # This module contains all the classes representing the Resources objects. # These objects are obtained from the configuration file through a conversion based on the Schema classes. # - +import asyncio +import functools from abc import ABC, abstractmethod from enum import Enum +from typing import List + +ASYNC_TIMED_VALIDATORS_DEFAULT_TIMEOUT_SEC = 10 class FailureLevel(Enum): @@ -41,7 +45,7 @@ def __repr__(self): class Validator(ABC): - """Abstract validator. The children must implement the validate method.""" + """Abstract validator. The children must implement the _validate method.""" def __init__(self): self._failures = [] @@ -55,17 +59,84 @@ def type(self): """Identify the type of validator.""" return self.__class__.__name__ - def execute(self, *arg, **kwargs): + def execute(self, *arg, **kwargs) -> List[ValidationResult]: """Entry point of all validators to verify all input params are valid.""" self._validate(*arg, **kwargs) return self._failures @abstractmethod def _validate(self, *args, **kwargs): - """Must be implemented with specific validation logic.""" + """ + Must be implemented with specific validation logic. + + Use _add_failure to add failures to the list of failures returned by execute. + """ + pass + + +class AsyncValidator(Validator): + """Abstract validator that supports *also* async execution. Children must implement the _validate_async method.""" + + def __init__(self): + super().__init__() + + def _validate(self, *arg, **kwargs): + asyncio.get_event_loop().run_until_complete(self._validate_async(*arg, **kwargs)) + return self._failures + + async def execute_async(self, *arg, **kwargs) -> List[ValidationResult]: + """Entry point of all async validators to verify all input params are valid.""" + await self._validate_async(*arg, **kwargs) + return self._failures + + @abstractmethod + async def _validate_async(self, *args, **kwargs): + """ + Must be implemented with specific validation logic. + + Use _add_failure to add failures to the list of failures returned by execute or execute_async when awaited. + """ pass +def get_async_timed_validator_type_for(validator_type: type) -> AsyncValidator: + """ + Return the type decorating the given validator with timeout support. + + The enriched _validate_async will accept an additional timeout parameter. + If not provided will default to ASYNC_TIMED_VALIDATORS_DEFAULT_TIMEOUT_SEC. + + Since validators async execution is coroutine based with preemptive multitasking, + the effective time to fail the validator for timeout may exceed the requested one. 
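+
+    Illustrative usage, assuming a hypothetical AsyncValidator subclass named SomeAsyncValidator:
+
+        get_async_timed_validator_type_for(SomeAsyncValidator)().execute(some_param="value", timeout=5)
+
+    behaves like SomeAsyncValidator, but adds a WARNING failure if validation does not complete within 5 seconds.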
+ """ + class_name = f"AsyncTimed{validator_type.__name__}" + + if class_name not in globals(): + class_bases = validator_type.__bases__ + class_dict = dict(validator_type.__dict__) + + def _async_timed_validate(original_method): + @functools.wraps(original_method) + async def _validate_async(self: AsyncValidator, *args, **kwargs): + timeout = kwargs.pop("timeout", ASYNC_TIMED_VALIDATORS_DEFAULT_TIMEOUT_SEC) + try: + await asyncio.wait_for(original_method(self, *args, **kwargs), timeout=timeout) + except asyncio.TimeoutError: + self._add_failure( # pylint: disable=protected-access + f"Validation of ({kwargs}) timed out after {timeout} seconds.", FailureLevel.WARNING + ) + + return _validate_async + + class_dict["_validate_async"] = _async_timed_validate(class_dict["_validate_async"]) + + schema_class_type = type(class_name, class_bases, class_dict) + globals()[class_name] = schema_class_type + else: + schema_class_type = globals()[class_name] + return schema_class_type + + class ValidatorContext: """Context containing information about cluster environment meant to be passed to validators.""" diff --git a/cli/src/pcluster/validators/directory_service_validators.py b/cli/src/pcluster/validators/directory_service_validators.py index f4c509be87..24a63afa36 100644 --- a/cli/src/pcluster/validators/directory_service_validators.py +++ b/cli/src/pcluster/validators/directory_service_validators.py @@ -12,6 +12,8 @@ import re from urllib.parse import urlparse +from aws_cdk.core import Arn, ArnFormat + from pcluster.aws.aws_api import AWSApi from pcluster.aws.common import AWSClientError from pcluster.constants import DIRECTORY_SERVICE_RESERVED_SETTINGS @@ -73,13 +75,28 @@ def _validate(self, domain_name): class PasswordSecretArnValidator(Validator): """PasswordSecretArn validator.""" - def _validate(self, password_secret_arn): - """Validate that PasswordSecretArn contains an ARN of a readable secret in AWS Secrets Manager.""" + def _validate(self, password_secret_arn: str, region: str): + """Validate that PasswordSecretArn contains a valid ARN for the given region. + + In particular, the ARN should be one of the following resources: + 1. a readable secret in AWS Secrets Manager, which is supported in all regions but us-isob-east-1. + 2. a readable parameter in SSM Parameter Store, which is supported only in us-isob-east-1. + """ try: # We only require the secret to exist; we do not validate its content. 
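+            # For illustration: a Secrets Manager ARN such as
+            # arn:aws:secretsmanager:eu-west-1:123456789012:secret:MySecret-AbCdEf has service "secretsmanager"
+            # and resource type "secret", while an SSM parameter ARN such as
+            # arn:aws-iso-b:ssm:us-isob-east-1:123456789012:parameter/MyParameter has service "ssm"
+            # and resource type "parameter" (account ID and names above are placeholders).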
- AWSApi.instance().secretsmanager.describe_secret(password_secret_arn) + arn_components = Arn.split(password_secret_arn, ArnFormat.COLON_RESOURCE_NAME) + service, resource = arn_components.service, arn_components.resource + if service == "secretsmanager" and resource == "secret" and region != "us-isob-east-1": + AWSApi.instance().secretsmanager.describe_secret(password_secret_arn) + elif service == "ssm" and resource == "parameter" and region == "us-isob-east-1": + parameter_name = arn_components.resource_name + AWSApi.instance().ssm.get_parameter(parameter_name) + else: + self._add_failure( + f"The secret {password_secret_arn} is not supported in region {region}.", FailureLevel.ERROR + ) except AWSClientError as e: - if e.error_code == "ResourceNotFoundExceptionSecrets": + if e.error_code in ("ResourceNotFoundExceptionSecrets", "ParameterNotFound"): self._add_failure(f"The secret {password_secret_arn} does not exist.", FailureLevel.ERROR) elif e.error_code == "AccessDeniedException": self._add_failure( diff --git a/cli/src/pcluster/validators/monitoring_validators.py b/cli/src/pcluster/validators/monitoring_validators.py index 1db908b6ea..9bfed76395 100644 --- a/cli/src/pcluster/validators/monitoring_validators.py +++ b/cli/src/pcluster/validators/monitoring_validators.py @@ -12,7 +12,7 @@ class LogRotationValidator(Validator): - """Security groups validator.""" + """Log Rotation validator.""" def _validate(self, log): if not log.cloud_watch.enabled and log.rotation.enabled: @@ -22,3 +22,17 @@ def _validate(self, log): "set `Monitoring / Logs / Rotation / Enabled` to false.", FailureLevel.WARNING, ) + + +class DetailedMonitoringValidator(Validator): + """Detailed Monitoring validator.""" + + def _validate(self, is_detailed_monitoring_enabled): + if is_detailed_monitoring_enabled: + self._add_failure( + "Detailed Monitoring is enabled for EC2 instances in your compute fleet. The Amazon EC2 console will " + "display monitoring graphs with a 1-minute period for these instances. Note that this will increase " + "the cost. If you want to avoid this and use basic monitoring instead, please set " + "`Monitoring / DetailedMonitoring` to false.", + FailureLevel.WARNING, + ) diff --git a/cli/src/pcluster/validators/s3_validators.py b/cli/src/pcluster/validators/s3_validators.py index dc37511363..f497b269ba 100644 --- a/cli/src/pcluster/validators/s3_validators.py +++ b/cli/src/pcluster/validators/s3_validators.py @@ -1,47 +1,60 @@ import re -import time from urllib.error import HTTPError, URLError from urllib.request import urlopen from pcluster.aws.aws_api import AWSApi from pcluster.aws.common import AWSClientError -from pcluster.utils import get_url_scheme -from pcluster.validators.common import FailureLevel, Validator +from pcluster.utils import AsyncUtils, get_url_scheme +from pcluster.validators.common import AsyncValidator, FailureLevel, Validator from pcluster.validators.utils import get_bucket_name_from_s3_url -class UrlValidator(Validator): +class UrlValidator(AsyncValidator): """ Url Validator. Validate given url with s3 or https prefix. + Validation is cached across instances to avoid repeated calls to the same urls. 
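+
+    Note: because the cache key is derived from the validated arguments rather than the instance, two
+    UrlValidator instances validating the same URL within the cache timeout trigger a single underlying
+    S3/HTTPS check.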
""" - def _validate( + @AsyncUtils.async_timeout_cache(timeout=10) + async def _validate_async( + self, + url, + fail_on_https_error: bool = False, + fail_on_s3_error: bool = False, + expected_bucket_owner: str = None, + ): + try: + await self._validate_async_internal( + url, + fail_on_https_error=fail_on_https_error, + fail_on_s3_error=fail_on_s3_error, + expected_bucket_owner=expected_bucket_owner, + ) + except ConnectionError as err: + self._add_failure(f"The url '{url}' causes ConnectionError: {err}.", FailureLevel.WARNING) + + @AsyncUtils.async_retry(stop_max_attempt_number=3, wait_fixed=1, retry_on_exception=ConnectionError) + async def _validate_async_internal( self, url, - retries=3, fail_on_https_error: bool = False, fail_on_s3_error: bool = False, expected_bucket_owner: str = None, ): scheme = get_url_scheme(url) if scheme in ["https", "s3"]: - try: - if scheme == "s3": - self._validate_s3_uri( - url, fail_on_error=fail_on_s3_error, expected_bucket_owner=expected_bucket_owner - ) - else: - if expected_bucket_owner: - self._add_failure("S3BucketOwner can only be specified with S3 URL", FailureLevel.ERROR) - self._validate_https_uri(url, fail_on_error=fail_on_https_error) - except ConnectionError as e: - if retries > 0: - time.sleep(5) - self._validate(url, retries=retries - 1) - else: - self._add_failure(f"The url '{url}' causes ConnectionError: {e}.", FailureLevel.WARNING) + if scheme == "s3": + _async_validate_s3_uri = AsyncUtils.async_from_sync(self._validate_s3_uri) + await _async_validate_s3_uri( + url, fail_on_error=fail_on_s3_error, expected_bucket_owner=expected_bucket_owner + ) + else: + if expected_bucket_owner: + self._add_failure("S3BucketOwner can only be specified with S3 URL", FailureLevel.ERROR) + _async_validate_https_uri = AsyncUtils.async_from_sync(self._validate_https_uri) + await _async_validate_https_uri(url, fail_on_error=fail_on_https_error) else: self._add_failure( diff --git a/cli/src/pcluster/validators/slurm_settings_validator.py b/cli/src/pcluster/validators/slurm_settings_validator.py new file mode 100644 index 0000000000..080ed4d3b1 --- /dev/null +++ b/cli/src/pcluster/validators/slurm_settings_validator.py @@ -0,0 +1,146 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+ +from enum import Enum +from typing import Dict, List + +from pcluster.validators.common import FailureLevel, Validator + +# SLURM SETTINGS are case-insensitive - keep them lowercase since they are compared with setting.lower() +SLURM_SETTINGS_DENY_LIST = { + "SlurmConf": { + "Global": [ + "communicationparameters", + "epilog", + "grestypes", + "launchparameters", + "prolog", + "reconfigflags", + "resumefailprogram", + "resumeprogram", + "resumetimeout", + "slurmctldhost", + "slurmctldlogfile", + "slurmctldparameters", + "slurmdlogfile", + "slurmuser", + "suspendexcnodes", + "suspendprogram", + "suspendtime", + "taskplugin", + "treewidth", + ], + "Accounting": [ + "accountingstoragetype", + "accountingstoragehost", + "accountingstorageport", + "accountingstorageuser", + "jobacctgathertype", + ], + }, + "Queue": { + "Global": ["nodes", "partitionname", "resumetimeout", "state", "suspendtime"], + }, + "ComputeResource": { + "Global": ["cpus", "features", "gres", "nodeaddr", "nodehostname", "nodename", "state"], + }, +} + + +class CustomSlurmSettingLevel(str, Enum): + """ + Custom Slurm Settings level. + + This enum defines the scope where the custom settings are defined. + """ + + SLURM_CONF = "SlurmConf" + QUEUE = "Queue" + COMPUTE_RESOURCE = "ComputeResource" + + +class CustomSlurmSettingContext(Enum): + """ + Custom Slurm Settings context. + + This enum defines the context where the custom settings are relevant (useful for validation purposes only). + """ + + GLOBAL = "Global" + ACCOUNTING = "Accounting" + + +class CustomSlurmSettingsValidator(Validator): + """ + Custom Slurm Settings validator. + + Validate custom settings in Slurm ComputeResource and Queue. + """ + + def _validate(self, custom_settings: List[Dict], deny_list: List[str], settings_level: CustomSlurmSettingLevel): + denied_settings = set() + + for custom_settings_dict in custom_settings: + for custom_setting in list(custom_settings_dict.keys()): + if custom_setting.lower() in deny_list: + denied_settings.add(custom_setting) + if len(denied_settings) > 0: + settings = ",".join(sorted(denied_settings)) + self._add_failure( + f"Using the following custom Slurm settings at {settings_level} level is not allowed: {settings}", + FailureLevel.ERROR, + ) + + +class CustomSlurmNodeNamesValidator(Validator): + """ + Custom Slurm Nodelists Names validator. + + This validator ensures that any eventual custom node list passed via SlurmSettings/CustomSlurmSettings + does not contain the `-st-` or `-dy-` patterns in the node names, as this would cause the ParallelCluster + daemons to interfere with them. + """ + + def _validate(self, custom_settings: List[Dict]): + bad_nodelists = [] + + for custom_settings_dict in custom_settings: + # Here we validate also the corner case where users provide `NodeName` multiple times with more than + # one combination of cases (e.g. `NodeName` and `nodename`) + nodenames = [custom_settings_dict[key] for key in custom_settings_dict.keys() if key.lower() == "nodename"] + for nodename in nodenames: + if ("-st-" in nodename) or ("-dy-" in nodename): + bad_nodelists.append(nodename) + + if bad_nodelists: + nodelists = ", ".join(sorted(bad_nodelists)) + self._add_failure( + f"Substrings '-st-' and '-dy-' in node names are reserved for nodes managed by ParallelCluster. " + f"Please rename the following custom Slurm nodes: {nodelists}", + FailureLevel.ERROR, + ) + + +class CustomSlurmSettingsIncludeFileOnlyValidator(Validator): + """ + Custom Slurm Settings Include File Only validator. 
+
+    This validator returns an error if the CustomSlurmSettingsIncludeFile configuration parameter
+    is used together with the CustomSlurmSettings under SlurmSettings.
+    """
+
+    def _validate(self, custom_settings: List[Dict], include_file_url: str):
+        if custom_settings and include_file_url:
+            self._add_failure(
+                "CustomSlurmSettings and CustomSlurmSettingsIncludeFile cannot be used together "
+                "under SlurmSettings.",
+                FailureLevel.ERROR,
+            )
diff --git a/cli/src/pcluster/validators/tags_validators.py b/cli/src/pcluster/validators/tags_validators.py
new file mode 100644
index 0000000000..82c246e7b6
--- /dev/null
+++ b/cli/src/pcluster/validators/tags_validators.py
@@ -0,0 +1,72 @@
+# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+# with the License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
+from pcluster.constants import MAX_TAGS_COUNT
+from pcluster.validators.common import FailureLevel, Validator
+
+
+class ComputeResourceTagsValidator(Validator):
+    """Compute resources tags validator."""
+
+    def _validate(self, queue_name, compute_resource_name, cluster_tags, queue_tags, compute_resource_tags):
+        cluster_tag_keys = {tag.key for tag in cluster_tags} if cluster_tags else set()
+        queue_tag_keys = {tag.key for tag in queue_tags} if queue_tags else set()
+        compute_resource_tag_keys = {tag.key for tag in compute_resource_tags} if compute_resource_tags else set()
+
+        overlapping_keys = cluster_tag_keys & queue_tag_keys & compute_resource_tag_keys
+        key_count = len(cluster_tag_keys | queue_tag_keys | compute_resource_tag_keys)
+        overlapping_keys_list = sorted(list(overlapping_keys))
+        queue_cluster_overlapping_keys_list = sorted(list(cluster_tag_keys & queue_tag_keys - overlapping_keys))
+        compute_resource_queue_overlapping_key_list = sorted(
+            list(queue_tag_keys & compute_resource_tag_keys - overlapping_keys)
+        )
+        cluster_compute_resource_overlapping_key_list = sorted(
+            list(cluster_tag_keys & compute_resource_tag_keys - overlapping_keys)
+        )
+
+        if overlapping_keys_list:
+            self._add_failure(
+                "The following Tag keys are defined under `Tags`, `SlurmQueue/Tags` and "
+                f"`SlurmQueue/ComputeResources/Tags`: {overlapping_keys_list}"
+                " and will be overridden by the value set in `SlurmQueue/ComputeResources/Tags` for "
+                f"ComputeResource '{compute_resource_name}' in queue '{queue_name}'.",
+                FailureLevel.WARNING,
+            )
+        if queue_cluster_overlapping_keys_list:
+            self._add_failure(
+                "The following Tag keys are defined both under `Tags` and `SlurmQueue/Tags`: "
+                f"{queue_cluster_overlapping_keys_list} and will be overridden by the value set in `SlurmQueue/Tags` "
+                f"for ComputeResource '{compute_resource_name}' in queue '{queue_name}'.",
+                FailureLevel.WARNING,
+            )
+        if compute_resource_queue_overlapping_key_list:
+            self._add_failure(
+                "The following Tag keys are defined both under `SlurmQueue/Tags` and "
+                f"`SlurmQueue/ComputeResources/Tags`: {compute_resource_queue_overlapping_key_list} and will be "
+                f"overridden by the value set in `SlurmQueue/ComputeResources/Tags` for "
+                f"ComputeResource '{compute_resource_name}' in queue 
'{queue_name}'.", + FailureLevel.WARNING, + ) + + if cluster_compute_resource_overlapping_key_list: + self._add_failure( + "The following Tag keys are defined in both under `Tags` and `SlurmQueue/ComputeResources/Tags`: " + f"{cluster_compute_resource_overlapping_key_list} and will be overridden by the value set in " + f"`SlurmQueue/ComputeResources/Tags` for ComputeResource '{compute_resource_name}' in queue" + f" '{queue_name}'.", + FailureLevel.WARNING, + ) + + if key_count > MAX_TAGS_COUNT: + self._add_failure( + f"The number of tags ({key_count}) associated with ComputeResource '{compute_resource_name}' in queue " + f"'{queue_name}' has exceeded the limit of {MAX_TAGS_COUNT}.", + FailureLevel.ERROR, + ) diff --git a/cli/tests/conftest.py b/cli/tests/conftest.py index c04a5cadd1..3f3de94df6 100644 --- a/cli/tests/conftest.py +++ b/cli/tests/conftest.py @@ -345,3 +345,22 @@ def _mock_image_stack(image_id: str = "image", stack_exists: bool = True): mocker.patch("pcluster.aws.cfn.CfnClient.describe_stack", return_value=stack_data) return _mock_image_stack + + +@pytest.fixture +def mock_cloud_assembly(mocker): + def _mock_cloud_assembly(assets, directory="test_dir", template_content="test_template_content"): + cloud_assembly = mocker.patch("aws_cdk.cx_api.CloudAssembly") + cloud_assembly.directory = directory + cluster_cloud_artifact = mocker.patch("aws_cdk.cx_api.CloudFormationStackArtifact") + mocker.patch( + "pcluster.templates.cdk_artifacts_manager.CDKV1ClusterCloudAssembly._get_artifacts_class", + return_value=type(cluster_cloud_artifact), + ) + cluster_cloud_artifact.template = template_content + cluster_cloud_artifact.assets = assets + cloud_assembly.artifacts = [cluster_cloud_artifact] + + return cloud_assembly + + return _mock_cloud_assembly diff --git a/cli/tests/pcluster/aws/dummy_aws_api.py b/cli/tests/pcluster/aws/dummy_aws_api.py index ae989d3a7b..59567049f5 100644 --- a/cli/tests/pcluster/aws/dummy_aws_api.py +++ b/cli/tests/pcluster/aws/dummy_aws_api.py @@ -10,6 +10,7 @@ # limitations under the License. import ipaddress import os +from datetime import datetime from pcluster.aws.aws_api import AWSApi from pcluster.aws.aws_resources import FsxFileSystemInfo, InstanceTypeInfo @@ -27,6 +28,7 @@ from pcluster.aws.s3 import S3Client from pcluster.aws.s3_resource import S3Resource from pcluster.aws.secretsmanager import SecretsManagerClient +from pcluster.aws.ssm import SsmClient from pcluster.aws.sts import StsClient @@ -106,6 +108,7 @@ def __init__(self): self._route53 = _DummyRoute53Client() self._resource_groups = _DummyResourceGroupsClient() self._secretsmanager = _DummySecretsManagerClient() + self._ssm = _DummySsmClient() class _DummyCfnClient(CfnClient): @@ -355,6 +358,25 @@ def describe_secret(self, secret_arn): } +class _DummySsmClient(SsmClient): + def __init__(self): + """Override parent constructor. 
No real boto3 client is created.""" + self._client = None + + def get_parameter(self, parameter_name): + return { + "Parameter": { + "Name": parameter_name, + "Type": "SecureString", + "Value": "EncryptedValue", + "Version": 1, + "LastModifiedDate": datetime(2023, 3, 3), + "ARN": f"arn:aws:ssm:us-east-1:111111111111:parameter/{parameter_name}", + "DataType": "text", + } + } + + def mock_aws_api(mocker, mock_instance_type_info=True): """Mock AWS Api.""" mocker.patch("pcluster.aws.aws_api.AWSApi.instance", return_value=_DummyAWSApi()) diff --git a/cli/tests/pcluster/aws/test_aws_api.py b/cli/tests/pcluster/aws/test_aws_api.py index 312cefa0c6..d450892ab3 100644 --- a/cli/tests/pcluster/aws/test_aws_api.py +++ b/cli/tests/pcluster/aws/test_aws_api.py @@ -12,6 +12,8 @@ # This module contains all the classes representing the Resources objects. # These objects are obtained from the configuration file through a conversion based on the Schema classes. # +from datetime import datetime + import pytest from assertpy import assert_that @@ -91,3 +93,32 @@ def describe_stack_resources(client): client = boto3_stubber("cloudformation", mocked_requests) describe_stack_resources(client) sleep_mock.assert_called_with(5) + + +FAKE_SSM_PARAMETER = "fake-ssm-parameter-name" + + +@pytest.mark.parametrize( + "response", + [ + pytest.param( + { + "Parameter": { + "Name": FAKE_SSM_PARAMETER, + "Type": "SecureString", + "Value": "EncryptedValue", + "Version": 1, + "LastModifiedDate": datetime(2023, 3, 3), + "ARN": f"arn:aws:ssm:us-east-1:111111111111:parameter/{FAKE_SSM_PARAMETER}", + "DataType": "text", + } + }, + id="SSM GetParameter returns the correct response on success", + ) + ], +) +def test_ssm_get_parameter(mocker, response): + """Verify that SsmClient.get_parameter behaves as expected.""" + mock_aws_api(mocker) + mocker.patch("pcluster.aws.ssm.SsmClient.get_parameter", side_effect=response) + assert_that(_DummyAWSApi().instance().ssm.get_parameter(FAKE_SSM_PARAMETER)).is_equal_to(response) diff --git a/cli/tests/pcluster/aws/test_ssm.py b/cli/tests/pcluster/aws/test_ssm.py new file mode 100644 index 0000000000..57e63f595e --- /dev/null +++ b/cli/tests/pcluster/aws/test_ssm.py @@ -0,0 +1,49 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+from datetime import datetime + +import pytest +from assertpy import assert_that + +from pcluster.aws.ssm import SsmClient +from tests.utils import MockedBoto3Request + + +@pytest.fixture() +def boto3_stubber_path(): + return "pcluster.aws.common.boto3" + + +def test_get_parameter(boto3_stubber): + parameter_name = "mocked_parameter_name" + expected_response = { + "Parameter": { + "Name": parameter_name, + "Type": "string", + "Value": "string", + "Version": 123, + "LastModifiedDate": datetime(2023, 3, 3), + "ARN": "string", + "DataType": "string", + } + } + mocked_requests = [ + MockedBoto3Request( + method="get_parameter", + expected_params={"Name": parameter_name}, + response=expected_response, + generate_error=False, + error_code=None, + ), + ] + boto3_stubber("ssm", mocked_requests) + actual_response = SsmClient().get_parameter(parameter_name) + assert_that(actual_response).is_equal_to(expected_response) diff --git a/cli/tests/pcluster/cli/test_commands.py b/cli/tests/pcluster/cli/test_commands.py index 9abd127993..9be6d2d8be 100644 --- a/cli/tests/pcluster/cli/test_commands.py +++ b/cli/tests/pcluster/cli/test_commands.py @@ -56,6 +56,7 @@ def _mock_cluster( "cluster_name", "expected_config", "expected_template", + "expected_asset", "expected_dirs", "mock_generated_bucket_name", "expected_bucket_name", @@ -67,6 +68,7 @@ def _mock_cluster( "cluster1", "dummy_config1", "dummy_template1", + "dummy_asset1", ["models/../resources/custom_resources"], "parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete", "parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete", @@ -77,6 +79,7 @@ def _mock_cluster( "cluster2", "dummy_config2", "dummy_template2", + "dummy_asset2", ["models/../resources/custom_resources", "models/../resources/batch"], "parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete", "parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete", @@ -87,6 +90,7 @@ def _mock_cluster( "cluster3", "dummy_config3", "dummy_template3", + "dummy_asset3", [], "parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete", "parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete", @@ -97,6 +101,7 @@ def _mock_cluster( "cluster4", "dummy_config4", "dummy_template4", + "dummy_asset4", ["models/../resources/custom_resources"], None, "user_provided_bucket", @@ -110,6 +115,7 @@ def test_setup_bucket_with_resources_success( cluster_name, expected_config, expected_template, + expected_asset, expected_dirs, mock_generated_bucket_name, expected_bucket_name, @@ -128,6 +134,7 @@ def test_setup_bucket_with_resources_success( mock_dict = mock_bucket_object_utils(mocker) upload_config_mock = mock_dict.get("upload_config") upload_template_mock = mock_dict.get("upload_cfn_template") + upload_asset_mock = mock_dict.get("upload_cfn_asset") upload_custom_resources_mock = mock_dict.get("upload_resources") # mock bucket utils check_bucket_mock = mock_bucket_utils(mocker, root_service_dir=f"{cluster_name}-abc123")["check_bucket_exists"] @@ -142,6 +149,7 @@ def test_setup_bucket_with_resources_success( cluster.bucket.upload_config(expected_config, "fake_config_name") cluster.bucket.upload_cfn_template(expected_template, "fake_template_name") + cluster.bucket.upload_cfn_asset(expected_asset, "fake_asset_name") for dir in expected_dirs: cluster.bucket.upload_resources(dir) @@ -150,6 +158,7 @@ def test_setup_bucket_with_resources_success( # assert upload has been called upload_config_mock.assert_called_with(expected_config, "fake_config_name") upload_template_mock.assert_called_with(expected_template, "fake_template_name") + 
upload_asset_mock.assert_called_with(expected_asset, "fake_asset_name") upload_custom_resources_mock.assert_has_calls([mocker.call(dir) for dir in expected_dirs]) # assert bucket properties diff --git a/cli/tests/pcluster/config/dummy_cluster_config.py b/cli/tests/pcluster/config/dummy_cluster_config.py index c5bea26153..2c5b471767 100644 --- a/cli/tests/pcluster/config/dummy_cluster_config.py +++ b/cli/tests/pcluster/config/dummy_cluster_config.py @@ -18,6 +18,8 @@ AwsBatchQueueNetworking, AwsBatchScheduling, ClusterIam, + CustomAction, + CustomActions, Dcv, HeadNode, HeadNodeNetworking, @@ -122,8 +124,22 @@ def dummy_head_node(mocker): head_node_imds = Imds(secured=True) ssh = Ssh(key_name="test") + custom_actions = CustomActions( + on_node_start=[ + CustomAction(script="https://tests1", args=["arg1", "arg2"]), + CustomAction(script="https://tests2", args=["arg1", "arg2"]), + ], + on_node_updated=CustomAction(script="https://testus", args=["arg1", "arg2"]), + on_node_configured=None, + ) + head_node = HeadNode( - instance_type="fake", networking=head_node_networking, ssh=ssh, dcv=head_node_dcv, imds=head_node_imds + instance_type="fake", + networking=head_node_networking, + ssh=ssh, + dcv=head_node_dcv, + imds=head_node_imds, + custom_actions=custom_actions, ) return head_node diff --git a/cli/tests/pcluster/config/test_cluster_config.py b/cli/tests/pcluster/config/test_cluster_config.py index ded488f325..b47c2e781f 100644 --- a/cli/tests/pcluster/config/test_cluster_config.py +++ b/cli/tests/pcluster/config/test_cluster_config.py @@ -6,13 +6,18 @@ AmiSearchFilters, BaseClusterConfig, ClusterDevSettings, + ComputeSettings, + Ebs, FlexibleInstanceType, + GpuHealthCheck, HeadNode, HeadNodeImage, HeadNodeNetworking, + HealthChecks, Image, PlacementGroup, QueueImage, + SharedEbs, SlurmClusterConfig, SlurmComputeResource, SlurmComputeResourceNetworking, @@ -29,11 +34,13 @@ instance_type="test", name="test1", networking=SlurmComputeResourceNetworking(placement_group=PlacementGroup(implied=True)), + health_checks=HealthChecks(gpu=GpuHealthCheck(enabled=True)), ), SlurmComputeResource( instance_type="test", name="test2", networking=SlurmComputeResourceNetworking(placement_group=PlacementGroup(enabled=True)), + health_checks=HealthChecks(gpu=GpuHealthCheck(enabled=False)), ), SlurmComputeResource( instance_type="test", @@ -53,6 +60,11 @@ ] +@pytest.fixture +def get_region(mocker): + mocker.patch("pcluster.config.cluster_config.get_region", return_value="WHATEVER_REGION") + + @pytest.fixture def instance_type_info_mock(aws_api_mock): aws_api_mock.ec2.get_instance_type_info.return_value = InstanceTypeInfo( @@ -67,6 +79,7 @@ def instance_type_info_mock(aws_api_mock): @pytest.mark.usefixtures("instance_type_info_mock") +@pytest.mark.usefixtures("get_region") class TestBaseClusterConfig: @pytest.fixture() def base_cluster_config(self): @@ -123,7 +136,6 @@ def test_registration_of_validators(self, memory_scheduling_enabled, mocker): ), ), ) - mocker.patch("pcluster.config.cluster_config.get_region", return_value="") cluster_config._register_validators() assert_that(cluster_config._validators).is_not_empty() @@ -271,10 +283,10 @@ def test_compute_node_ami_id( aws_api_mock.ec2.get_official_image_id.assert_not_called() @pytest.mark.parametrize( - "queue, expected_result", + "queue_parameters, expected_result", [ ( - SlurmQueue( + dict( name="queue", networking=SlurmQueueNetworking(subnet_ids=[], placement_group=PlacementGroup(enabled=False)), compute_resources=mock_compute_resources, @@ -288,7 +300,7 @@ 
def test_compute_node_ami_id( ], ), ( - SlurmQueue( + dict( name="queue", networking=SlurmQueueNetworking(subnet_ids=[], placement_group=PlacementGroup(enabled=True)), compute_resources=mock_compute_resources, @@ -302,7 +314,7 @@ def test_compute_node_ami_id( ], ), ( - SlurmQueue( + dict( name="queue", networking=SlurmQueueNetworking(subnet_ids=[], placement_group=PlacementGroup(name="test-q")), compute_resources=mock_compute_resources, @@ -316,7 +328,7 @@ def test_compute_node_ami_id( ], ), ( - SlurmQueue( + dict( name="queue", networking=SlurmQueueNetworking(subnet_ids=[], placement_group=PlacementGroup()), compute_resources=mock_compute_resources, @@ -331,17 +343,18 @@ def test_compute_node_ami_id( ), ], ) - def test_get_placement_group_settings_for_compute_resource(self, queue, expected_result): + def test_get_placement_group_settings_for_compute_resource(self, queue_parameters, expected_result): + queue = SlurmQueue(**queue_parameters) actual = [] for resource in queue.compute_resources: actual.append(queue.get_placement_group_settings_for_compute_resource(resource)) assert_that(actual).is_equal_to(expected_result) @pytest.mark.parametrize( - "queue, expected_result", + "queue_parameters, expected_result", [ ( - SlurmQueue( + dict( name="queue", networking=SlurmQueueNetworking(subnet_ids=[], placement_group=PlacementGroup(enabled=False)), compute_resources=mock_compute_resources, @@ -349,7 +362,7 @@ def test_get_placement_group_settings_for_compute_resource(self, queue, expected ["queue-test2"], ), ( - SlurmQueue( + dict( name="queue", networking=SlurmQueueNetworking(subnet_ids=[], placement_group=PlacementGroup(enabled=True)), compute_resources=mock_compute_resources, @@ -357,7 +370,7 @@ def test_get_placement_group_settings_for_compute_resource(self, queue, expected ["queue-test1", "queue-test2"], ), ( - SlurmQueue( + dict( name="queue", networking=SlurmQueueNetworking(subnet_ids=[], placement_group=PlacementGroup(name="test-q")), compute_resources=mock_compute_resources, @@ -365,7 +378,7 @@ def test_get_placement_group_settings_for_compute_resource(self, queue, expected ["queue-test2"], ), ( - SlurmQueue( + dict( name="queue", networking=SlurmQueueNetworking(subnet_ids=[], placement_group=PlacementGroup()), compute_resources=mock_compute_resources, @@ -374,9 +387,142 @@ def test_get_placement_group_settings_for_compute_resource(self, queue, expected ), ], ) - def test_get_managed_placement_group_keys(self, queue, expected_result): + def test_get_managed_placement_group_keys(self, queue_parameters, expected_result): + queue = SlurmQueue(**queue_parameters) actual = queue.get_managed_placement_group_keys() assert_that(actual).is_equal_to(expected_result) def test_get_instance_types_data(self, base_cluster_config): assert_that(base_cluster_config.get_instance_types_data()).is_equal_to({}) + + @pytest.mark.parametrize( + "queue_parameters, expected_result", + [ + # At ComputeResource level the Health Check is enabled for CR test1, disabled for CR test2 + # undefined otherwise + ( + # Health Checks section is not defined at SlurmQuel level + dict( + name="queue", + networking=SlurmQueueNetworking(subnet_ids=[], placement_group=PlacementGroup(enabled=False)), + compute_resources=mock_compute_resources, + ), + ["test1", "", "", "", ""], + ), + ( + # Health Checks section is enabled at SlurmQuel level + dict( + name="queue", + networking=SlurmQueueNetworking(subnet_ids=[], placement_group=PlacementGroup(enabled=False)), + health_checks=HealthChecks(gpu=GpuHealthCheck(enabled=True)), + 
compute_resources=mock_compute_resources, + ), + ["test1", "", "queue", "queue", "queue"], + ), + ( + # Health Checks section is disabled at SlurmQueue level + dict( + name="queue", + networking=SlurmQueueNetworking(subnet_ids=[], placement_group=PlacementGroup(enabled=False)), + health_checks=HealthChecks(gpu=GpuHealthCheck(enabled=False)), + compute_resources=mock_compute_resources, + ), + ["test1", "", "", "", ""], + ), + ], + ) + def test_get_enabled_health_checks_section(self, queue_parameters, expected_result): + queue = SlurmQueue(**queue_parameters) + health_check_gpu_enabled = [] + queue_gpu_check_enabled = queue.health_checks.gpu is not None and queue.health_checks.gpu.enabled + for compute_resource in queue.compute_resources: + compute_resource_gpu_check_enabled = ( + compute_resource.health_checks.gpu is not None and compute_resource.health_checks.gpu.enabled + ) + if compute_resource_gpu_check_enabled: + health_check_gpu_enabled.append(compute_resource.name) + elif compute_resource_gpu_check_enabled is False: + health_check_gpu_enabled.append("") + elif queue_gpu_check_enabled: + health_check_gpu_enabled.append(queue.name) + else: + health_check_gpu_enabled.append("") + assert_that(health_check_gpu_enabled).is_equal_to(expected_result) + + @pytest.mark.parametrize( + "region, expected_volume_type", + [ + ("us-iso-WHATEVER", "gp2"), + ("us-isob-WHATEVER", "gp2"), + ("WHATEVER_ELSE_REGION", "gp3"), + ], + ) + def test_head_node_root_volume(self, mocker, region, expected_volume_type): + mocker.patch("pcluster.config.cluster_config.get_region", return_value=region) + + cluster_config = BaseClusterConfig( + cluster_name="clustername", + image=Image("alinux2"), + head_node=HeadNode("c5.xlarge", HeadNodeNetworking("subnet")), + ) + + assert_that(cluster_config.head_node.local_storage.root_volume.volume_type).is_equal_to(expected_volume_type) + + @pytest.mark.parametrize( + "region, expected_volume_type", + [ + ("us-iso-WHATEVER", "gp2"), + ("us-isob-WHATEVER", "gp2"), + ("WHATEVER_ELSE_REGION", "gp3"), + ], + ) + def test_compute_settings_root_volume(self, mocker, region, expected_volume_type): + mocker.patch("pcluster.config.cluster_config.get_region", return_value=region) + + compute_settings = ComputeSettings() + + assert_that(compute_settings.local_storage.root_volume.volume_type).is_equal_to(expected_volume_type) + + def test_tags_in_slurm_queue(self): + tags = [Tag("key1", "value1"), Tag("key2", "value2"), Tag("key3", "value3")] + queue = SlurmQueue( + name="queue0", + networking=SlurmQueueNetworking(subnet_ids=["subnet"]), + compute_resources=mock_compute_resources, + tags=tags, + ) + assert_that(queue.get_tags()).is_equal_to(tags) + + +class TestSharedEbs: + @pytest.mark.parametrize( + "region, expected_volume_type", + [ + ("us-iso-WHATEVER", "gp2"), + ("us-isob-WHATEVER", "gp2"), + ("WHATEVER_ELSE_REGION", "gp3"), + ], + ) + def test_shared_storage_ebs(self, mocker, region, expected_volume_type): + mocker.patch("pcluster.config.cluster_config.get_region", return_value=region) + + shared_ebs = SharedEbs(mount_dir="/mount/dir", name="mount-name") + + assert_that(shared_ebs.volume_type).is_equal_to(expected_volume_type) + + +class TestEbs: + @pytest.mark.parametrize( + "region, expected_volume_type", + [ + ("us-iso-WHATEVER", "gp2"), + ("us-isob-WHATEVER", "gp2"), + ("WHATEVER_ELSE_REGION", "gp3"), + ], + ) + def test_shared_storage_ebs(self, mocker, region, expected_volume_type): + mocker.patch("pcluster.config.cluster_config.get_region", return_value=region) + + ebs = Ebs() + 
+ assert_that(ebs.volume_type).is_equal_to(expected_volume_type) diff --git a/cli/tests/pcluster/config/test_common.py b/cli/tests/pcluster/config/test_common.py index 5c36ae6727..74df9e790d 100644 --- a/cli/tests/pcluster/config/test_common.py +++ b/cli/tests/pcluster/config/test_common.py @@ -8,13 +8,20 @@ # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. +import asyncio from typing import List import pytest from assertpy import assert_that from pcluster.config.common import Resource -from pcluster.validators.common import FailureLevel, Validator, ValidatorContext +from pcluster.validators.common import ( + AsyncValidator, + FailureLevel, + Validator, + ValidatorContext, + get_async_timed_validator_type_for, +) class FakeInfoValidator(Validator): @@ -31,6 +38,22 @@ def _validate(self, param): self._add_failure(f"Error {param}.", FailureLevel.ERROR) +class FakeAsyncInfoValidator(AsyncValidator): + """Dummy validator of info level.""" + + async def _validate_async(self, param): + await asyncio.sleep(0.2) + self._add_failure(f"Wrong async value {param}.", FailureLevel.INFO) + + +class FakeAsyncErrorValidator(AsyncValidator): + """Dummy validator of error level.""" + + async def _validate_async(self, param): + await asyncio.sleep(0.5) + self._add_failure(f"Error async {param}.", FailureLevel.ERROR) + + class FakeComplexValidator(Validator): """Dummy validator requiring multiple parameters as input.""" @@ -45,6 +68,13 @@ def _validate(self, property_value: str): self._add_failure(f"Wrong value {property_value}.", FailureLevel.INFO) +class FakeFaultyValidator(Validator): + """Dummy validator that raises an unexpected error.""" + + def _validate(self, param: str): + raise RuntimeError("dummy fault") + + def assert_validation_result(result, expected_level, expected_message): """Assert that validation results is the expected one, by checking level and message.""" assert_that(result.level).is_equal_to(expected_level) @@ -80,6 +110,99 @@ def _register_validators(self, context: ValidatorContext = None): assert_validation_result(validation_failures[2], FailureLevel.INFO, "Wrong value other-value.") +def test_resource_validate_unexpected_error(): + """Verify that an unexpected error thrown by a validator does not interrupt the validation process.""" + + class FakeResource(Resource): + """Fake resource class to test validators.""" + + def __init__(self): + super().__init__() + self.fake_attribute = "fake-value" + self.other_attribute = "other-value" + + def _register_validators(self, context: ValidatorContext = None): + self._register_validator(FakeErrorValidator, param=self.fake_attribute) + self._register_validator(FakeFaultyValidator, param=self.fake_attribute) + self._register_validator(FakeInfoValidator, param=self.other_attribute) + + fake_resource = FakeResource() + validation_failures = fake_resource.validate() + + # Verify high prio is the first of the list + assert_validation_result(validation_failures[0], FailureLevel.ERROR, "Error fake-value.") + assert_validation_result(validation_failures[1], FailureLevel.ERROR, "dummy fault") + assert_validation_result(validation_failures[2], FailureLevel.INFO, "Wrong value other-value.") + + +def test_async_resource_validation(): + """Verify that sync and async validators are executed in the right order according to priorities.""" + + 
class FakeResource(Resource): + """Fake resource class to test validators.""" + + def __init__(self): + super().__init__() + self.fake_attribute = "fake-value" + self.other_attribute = "other-value" + + def _register_validators(self, context: ValidatorContext = None): + self._register_validator(FakeErrorValidator, param=self.fake_attribute) + self._register_validator(FakeInfoValidator, param=self.other_attribute) + self._register_validator(FakeAsyncErrorValidator, param=self.fake_attribute) + self._register_validator(FakeAsyncInfoValidator, param=self.other_attribute) + + fake_resource = FakeResource() + validation_failures = fake_resource.validate() + + assert_validation_result(validation_failures[0], FailureLevel.ERROR, "Error fake-value.") + assert_validation_result(validation_failures[1], FailureLevel.INFO, "Wrong value other-value.") + assert_validation_result(validation_failures[2], FailureLevel.ERROR, "Error async fake-value.") + assert_validation_result(validation_failures[3], FailureLevel.INFO, "Wrong async value other-value.") + + +def test_async_resource_validation_with_timeout(): + """Verify that async validators can fail due to timeout.""" + + class FakeResource(Resource): + """Fake resource class to test validators.""" + + def __init__(self): + super().__init__() + self.fake_attribute = "fake-value" + self.other_attribute = "other-value" + + def _register_validators(self, context: ValidatorContext = None): + self._register_validator( + get_async_timed_validator_type_for(FakeAsyncErrorValidator), param=self.fake_attribute, timeout=0.1 + ) + self._register_validator( + get_async_timed_validator_type_for(FakeAsyncInfoValidator), param=self.other_attribute, timeout=0 + ) + self._register_validator( + get_async_timed_validator_type_for(FakeAsyncErrorValidator), param=self.fake_attribute, timeout=3 + ) + self._register_validator( + get_async_timed_validator_type_for(FakeAsyncInfoValidator), param=self.other_attribute, timeout=4 + ) + + fake_resource = FakeResource() + validation_failures = fake_resource.validate() + + assert_validation_result( + validation_failures[0], + FailureLevel.WARNING, + "Validation of ({'param': 'fake-value'}) timed out after 0.1 seconds.", + ) + assert_validation_result( + validation_failures[1], + FailureLevel.WARNING, + "Validation of ({'param': 'other-value'}) timed out after 0 seconds.", + ) + assert_validation_result(validation_failures[2], FailureLevel.ERROR, "Error async fake-value.") + assert_validation_result(validation_failures[3], FailureLevel.INFO, "Wrong async value other-value.") + + def test_dynamic_property_validate(): """Verify that validators of dynamic parameters are working as expected.""" diff --git a/cli/tests/pcluster/config/test_config_patch.py b/cli/tests/pcluster/config/test_config_patch.py index 6c150fbfba..2e42bdb8bc 100644 --- a/cli/tests/pcluster/config/test_config_patch.py +++ b/cli/tests/pcluster/config/test_config_patch.py @@ -197,6 +197,16 @@ def _sorting_func(change): False, id="change queue instance profile", ), + pytest.param( + ["Scheduling", "SlurmQueues[queue1]", "Tags[tag1]"], + "queue_tag_value", + "Value", + "queue_tag_value_1", + "queue_tag_value_2", + UpdatePolicy.QUEUE_UPDATE_STRATEGY, + False, + id="change queue tag value", + ), ], ) def test_single_param_change( diff --git a/cli/tests/pcluster/config/test_config_patch/test_single_param_change/pcluster.config.yaml b/cli/tests/pcluster/config/test_config_patch/test_single_param_change/pcluster.config.yaml index b4bf694401..0d45e45151 100644 --- 
a/cli/tests/pcluster/config/test_config_patch/test_single_param_change/pcluster.config.yaml +++ b/cli/tests/pcluster/config/test_config_patch/test_single_param_change/pcluster.config.yaml @@ -31,8 +31,18 @@ Scheduling: InstanceType: {{compute_instance_type}} MinCount: 1 MaxCount: {{max_count}} + {% if compute_tag_value %} + Tags: + - Key: computetag1 + Value: {{compute_tag_value}} + {% endif %} Image: CustomAmi: {{queue_custom_ami}} + {% if queue_tag_value %} + Tags: + - Key: tag1 + Value: {{queue_tag_value}} + {% endif %} SharedStorage: - MountDir: vol1 Name: ebs1 diff --git a/cli/tests/pcluster/config/test_update_policy.py b/cli/tests/pcluster/config/test_update_policy.py index fe39a008cf..bcc4b9ecea 100644 --- a/cli/tests/pcluster/config/test_update_policy.py +++ b/cli/tests/pcluster/config/test_update_policy.py @@ -30,6 +30,7 @@ "is_fleet_stopped, old_max, new_max, expected_result", [ pytest.param(True, 10, 9, True, id="stopped fleet and new_max < old_max"), + pytest.param(False, "10", "9", False, id="running fleet and new_max < old_max"), pytest.param(True, 10, 11, True, id="stopped fleet new_max > old_max"), pytest.param(False, 10, 9, False, id="running fleet and new_max < old_max"), pytest.param(False, 10, 11, True, id="running fleet and new_max > old_max"), @@ -208,6 +209,66 @@ def test_max_count_policy(mocker, is_fleet_stopped, old_max, new_max, expected_r False, id="running fleet with change outside SlurmQueues which requires COMPUTE_FLEET_STOP", ), + pytest.param( + False, + "Tags", + ["Scheduling", "SlurmQueues[queue1]"], + '{"Key": "queue_tag1","Value": "queue_tag_value_1"}', + None, + QueueUpdateStrategy.DRAIN.value, + True, + id="running fleet and queue tag unset with update strategy DRAIN", + ), + pytest.param( + False, + "Tags", + ["Scheduling", "SlurmQueues[queue1]"], + None, + '{"Key": "queue_tag1","Value": "queue_tag_value_1"}', + None, + False, + id="running fleet and queue tag set without queue update strategy", + ), + pytest.param( + False, + "Value", + ["Scheduling", "SlurmQueues[queue1]", "Tags[tag1]"], + "value_1", + "value_2", + QueueUpdateStrategy.DRAIN.value, + True, + id="running fleet and change queue tag with update strategy DRAIN", + ), + pytest.param( + True, + "Value2", + ["Scheduling", "SlurmQueues[queue1]", "Tags[tag1]"], + "value_1", + "value_2", + None, + True, + id="Stop fleet and queue tag unset without queue update strategy", + ), + pytest.param( + False, + "Tags", + ["Scheduling", "SlurmQueues[queue1]", "ComputeResources", "Tags[computetag1]"], + '{"Key": "compute_tag2","Value": "compute_tag_value_1"}', + None, + QueueUpdateStrategy.DRAIN.value, + True, + id="running fleet and compute tag unset with update strategy DRAIN", + ), + pytest.param( + False, + "Value", + ["Scheduling", "SlurmQueues[queue1]", "ComputeResources[compute-resource1]", "Tags[computetag1]"], + "value_1", + "value_2", + None, + False, + id="running fleet and change compute tag without update strategy DRAIN", + ), ], ) def test_queue_update_strategy_condition_checker( diff --git a/cli/tests/pcluster/models/dummy_s3_bucket.py b/cli/tests/pcluster/models/dummy_s3_bucket.py index 5d9c38099c..cf8b260096 100644 --- a/cli/tests/pcluster/models/dummy_s3_bucket.py +++ b/cli/tests/pcluster/models/dummy_s3_bucket.py @@ -81,6 +81,7 @@ def mock_bucket_object_utils( upload_config_side_effect=None, get_config_side_effect=None, upload_template_side_effect=None, + upload_asset_side_effect=None, get_template_side_effect=None, upload_resources_side_effect=None, 
delete_s3_artifacts_side_effect=None, @@ -101,6 +102,9 @@ def mock_bucket_object_utils( upload_cfn_template_mock = mocker.patch( "pcluster.models.s3_bucket.S3Bucket.upload_cfn_template", side_effect=upload_template_side_effect ) + upload_cfn_asset_mock = mocker.patch( + "pcluster.models.s3_bucket.S3Bucket.upload_cfn_asset", side_effect=upload_asset_side_effect + ) get_cfn_template_mock = mocker.patch( "pcluster.models.s3_bucket.S3Bucket.get_cfn_template", return_value=fake_template, @@ -130,6 +134,7 @@ def mock_bucket_object_utils( "upload_config": upload_config_mock, "get_config": get_config_mock, "upload_cfn_template": upload_cfn_template_mock, + "upload_cfn_asset": upload_cfn_asset_mock, "get_cfn_template": get_cfn_template_mock, "upload_resources": upload_resources_mock, "delete_s3_artifacts": delete_s3_artifacts_mock, diff --git a/cli/tests/pcluster/models/test_cluster.py b/cli/tests/pcluster/models/test_cluster.py index c99816f210..0debb84ebd 100644 --- a/cli/tests/pcluster/models/test_cluster.py +++ b/cli/tests/pcluster/models/test_cluster.py @@ -934,6 +934,47 @@ def test_upload_change_set(self, mocker, cluster, changes, change_set): else: assert_that(bucket_object_utils_dict.get("upload_config").call_count).is_equal_to(0) + @pytest.mark.parametrize( + "assets_metadata, expected_parameters", + [ + ( + [ + { + "hash_parameter": {"key": "AssetParameters12345ArtifactHashabcde", "value": ""}, + "s3_bucket_parameter": {"key": "AssetParameters12345S3Bucketabcde", "value": "AssetS3Bucket"}, + "s3_object_key_parameter": { + "key": "AssetParameters12345S3VersionKeyabcde", + "value": "AssetS3ObjectKey", + }, + }, + ], + [ + ( + { + "ParameterKey": "AssetParameters12345ArtifactHashabcde", + "ParameterValue": "", + }, + { + "ParameterKey": "AssetParameters12345S3Bucketabcde", + "ParameterValue": "AssetS3Bucket", + }, + { + "ParameterKey": "AssetParameters12345S3VersionKeyabcde", + "ParameterValue": "AssetS3ObjectKey||", + }, + ) + ], + ), + ( + None, + [], + ), + ], + ) + def test_assets_parameter_generation(self, assets_metadata, expected_parameters): + asset_parameters = Cluster._generate_asset_parameters(assets_metadata) + assert_that(asset_parameters).is_equal_to(expected_parameters) + OLD_CONFIGURATION = """ Image: diff --git a/cli/tests/pcluster/models/test_s3_bucket.py b/cli/tests/pcluster/models/test_s3_bucket.py index 0d68070919..154c17008c 100644 --- a/cli/tests/pcluster/models/test_s3_bucket.py +++ b/cli/tests/pcluster/models/test_s3_bucket.py @@ -9,12 +9,13 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. 
import os +import textwrap import pytest from assertpy import assert_that from pcluster.aws.common import AWSClientError -from pcluster.models.s3_bucket import S3Bucket +from pcluster.models.s3_bucket import S3Bucket, S3FileFormat, S3FileType, format_content from tests.pcluster.aws.dummy_aws_api import mock_aws_api from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket @@ -206,3 +207,97 @@ def test_get_resource_url(region, bucket_name, cluster_name, resource_name, expe ) assert_that(bucket.get_resource_url(resource_name)).is_equal_to(expected_url) + + +@pytest.mark.parametrize( + "content, s3_file_format, expected_output", + [ + ( + { + "A": { + "A1": "X", + "A2": "Y", + }, + "B": {"B1": "M"}, + }, + S3FileFormat.YAML, + textwrap.dedent( + """\ + A: + A1: X + A2: Y + B: + B1: M + """ + ), + ), + ( + { + "A": { + "A1": "X", + "A2": "Y", + }, + "B": {"B1": "M"}, + }, + S3FileFormat.JSON, + '{"A": {"A1": "X", "A2": "Y"}, "B": {"B1": "M"}}', + ), + ( + { + "A": { + "A1": "X", + "A2": "Y", + }, + "B": {"B1": "M"}, + }, + S3FileFormat.MINIFIED_JSON, + '{"A":{"A1":"X","A2":"Y"},"B":{"B1":"M"}}', + ), + ( + { + "A": { + "A1": "X", + "A2": "Y", + }, + "B": {"B1": "M"}, + }, + None, + {"A": {"A1": "X", "A2": "Y"}, "B": {"B1": "M"}}, + ), + ], +) +def test_format_content(content, s3_file_format, expected_output): + formatted_content = format_content(content=content, s3_file_format=s3_file_format) + assert_that(formatted_content).is_equal_to(expected_output) + assert_that(formatted_content).is_type_of(type(expected_output)) + + +@pytest.mark.parametrize( + "content, file_name, file_type, s3_file_format, expected_object_key, expected_object_body", + [ + ( + {"Test": "Content"}, + "test_file_name", + S3FileType.ASSETS, + S3FileFormat.YAML, + "assets/test_file_name", + "Test: Content\n", + ) + ], +) +def test_upload_file(mocker, content, file_name, file_type, s3_file_format, expected_object_key, expected_object_body): + mock_aws_api(mocker) + mock_bucket(mocker) + + bucket_name = "test-bucket" + artifact_directory = "pcluster_artifact_directory" + bucket = dummy_cluster_bucket(bucket_name=bucket_name, artifact_directory=artifact_directory) + s3_put_object_patch = mocker.patch("pcluster.aws.s3.S3Client.put_object") + + bucket.upload_file(content, file_name, file_type, s3_file_format) + + s3_put_object_patch.assert_called_once_with( + bucket_name=bucket_name, + body=expected_object_body, + key=f"{artifact_directory}/{expected_object_key}", + ) diff --git a/cli/tests/pcluster/schemas/test_cluster_schema.py b/cli/tests/pcluster/schemas/test_cluster_schema.py index 564161e0c2..15112f0fda 100644 --- a/cli/tests/pcluster/schemas/test_cluster_schema.py +++ b/cli/tests/pcluster/schemas/test_cluster_schema.py @@ -30,6 +30,7 @@ ImageSchema, QueueCustomActionsSchema, QueueIamSchema, + QueueTagSchema, SchedulerPluginCloudFormationClusterInfrastructureSchema, SchedulerPluginClusterSharedArtifactSchema, SchedulerPluginDefinitionSchema, @@ -219,9 +220,24 @@ def test_head_node_root_volume_schema(mocker, config_dict, failure_message): [ # Failures ({"OnNodeUpdating": "test"}, "Unknown field"), - ({"OnNodeStart": "test", "OnNodeConfigured": "test", "OnNodeUpdated": "test"}, "Invalid input type."), - ({"OnNodeUpdated": {"ScriptWrong": "test3", "Args": ["5", "6"]}}, "Unknown field"), + ( + {"OnNodeStart": "test", "OnNodeConfigured": "test", "OnNodeUpdated": "test"}, + "Either Script or Sequence field must be provided.", + ), + ( + {"OnNodeConfigured": {"Script": "test3", "Args": ["5", "6"], 
"Sequence": []}}, + "Both Script and Sequence fields are provided. Only one is allowed.", + ), + ( + {"OnNodeUpdated": {"ScriptWrong": "test3", "Args": ["5", "6"]}}, + "Either Script or Sequence field must be provided.", + ), + ( + {"OnNodeUpdated": {"Sequence": "test"}}, + "Invalid input type for Sequence, expected list.", + ), # Successes + ({}, None), ( { "OnNodeStart": {"Script": "test", "Args": ["1", "2"]}, @@ -230,6 +246,21 @@ def test_head_node_root_volume_schema(mocker, config_dict, failure_message): }, None, ), + ( + { + "OnNodeStart": { + "Sequence": [ + {"Script": "test1", "Args": ["1", "2"]}, + {"Script": "test2", "Args": ["1", "2", "3"]}, + {"Script": "test3"}, + {"Script": "test4", "Args": []}, + ] + }, + "OnNodeConfigured": {"Script": "test2", "Args": ["3", "4"]}, + "OnNodeUpdated": {"Sequence": []}, + }, + None, + ), ( { "OnNodeStart": {"Script": "test"}, @@ -253,7 +284,8 @@ def test_head_node_custom_actions_schema(mocker, config_dict, failure_message): with pytest.raises(ValidationError, match=failure_message): HeadNodeCustomActionsSchema().load(config_dict) else: - HeadNodeCustomActionsSchema().load(config_dict) + conf = HeadNodeCustomActionsSchema().load(config_dict) + HeadNodeCustomActionsSchema().dump(conf) @pytest.mark.parametrize( @@ -266,10 +298,15 @@ def test_head_node_custom_actions_schema(mocker, config_dict, failure_message): "OnNodeStart": "test", "OnNodeConfigured": "test", }, - "Invalid input type.", + "Either Script or Sequence field must be provided.", + ), + ( + {"OnNodeStart": {"Script": "test3", "Args": ["5", "6"], "Sequence": []}}, + "Both Script and Sequence fields are provided. Only one is allowed.", ), ({"OnNodeUpdated": {"Script": "test3", "Args": ["5", "6"]}}, "Unknown field"), # Successes + ({}, None), ( { "OnNodeStart": {"Script": "test", "Args": ["1", "2"]}, @@ -291,6 +328,27 @@ def test_head_node_custom_actions_schema(mocker, config_dict, failure_message): }, None, ), + ( + { + "OnNodeStart": { + "Sequence": [ + {"Script": "test1", "Args": ["1", "2"]}, + {"Script": "test2", "Args": ["1", "2", "3"]}, + {"Script": "test3"}, + {"Script": "test4", "Args": []}, + ] + }, + "OnNodeConfigured": {"Sequence": []}, + }, + None, + ), + ( + { + "OnNodeStart": {"Script": "test1", "Args": ["1", "2"]}, + "OnNodeConfigured": {"Sequence": []}, + }, + None, + ), ], ) def test_queue_custom_actions_schema(mocker, config_dict, failure_message): @@ -299,7 +357,8 @@ def test_queue_custom_actions_schema(mocker, config_dict, failure_message): with pytest.raises(ValidationError, match=failure_message): QueueCustomActionsSchema().load(config_dict) else: - QueueCustomActionsSchema().load(config_dict) + conf = QueueCustomActionsSchema().load(config_dict) + QueueCustomActionsSchema().dump(conf) def dummy_slurm_queue(name="queue1", number_of_compute_resource=1): @@ -1066,3 +1125,244 @@ def test_timeouts_schema(head_node_bootstrap_timeout, compute_node_bootstrap_tim assert_that(timeouts.compute_node_bootstrap_timeout).is_equal_to( compute_node_bootstrap_timeout or NODE_BOOTSTRAP_TIMEOUT ) + + +@pytest.mark.parametrize( + "config_dict, failure_message, expected_queue_gpu_hc, expected_cr1_gpu_hc, expected_cr2_gpu_hc", + [ + # HealthChecks dictionary is empty + ( + { + "Name": "Standard-Queue", + "Networking": {"SubnetIds": ["subnet-12345678"]}, + "ComputeResources": [ + {"Name": "compute_resource1", "InstanceType": "c5.2xlarge", "MaxCount": 5}, + {"Name": "compute_resource2", "InstanceType": "c4.2xlarge"}, + ], + "HealthChecks": {}, + }, + "", + None, + None, + None, + ), + # 
Health Checks sections are not defined + ( + { + "Name": "Standard-Queue", + "Networking": {"SubnetIds": ["subnet-12345678"]}, + "ComputeResources": [ + {"Name": "compute_resource1", "InstanceType": "c5.2xlarge", "MaxCount": 5}, + {"Name": "compute_resource2", "InstanceType": "c4.2xlarge"}, + ], + }, + "", + None, + None, + None, + ), + # Health Checks section is defined at queue level + ( + { + "Name": "Standard-Queue", + "Networking": {"SubnetIds": ["subnet-12345678"]}, + "ComputeResources": [ + {"Name": "compute_resource1", "InstanceType": "c5.2xlarge", "MaxCount": 5}, + {"Name": "compute_resource2", "InstanceType": "c4.2xlarge"}, + ], + "HealthChecks": {"Gpu": {"Enabled": True}}, + }, + "", + True, + None, + None, + ), + # Health Checks section is defined in a single compute resource + ( + { + "Name": "Standard-Queue", + "Networking": {"SubnetIds": ["subnet-12345678"]}, + "ComputeResources": [ + {"Name": "compute_resource1", "InstanceType": "c5.2xlarge", "MaxCount": 5}, + { + "Name": "compute_resource2", + "InstanceType": "c4.2xlarge", + "HealthChecks": {"Gpu": {"Enabled": True}}, + }, + ], + }, + "", + None, + None, + True, + ), + # Health Checks sections are defined at queue level and a single compute resource + ( + { + "Name": "Standard-Queue", + "Networking": {"SubnetIds": ["subnet-12345678"]}, + "ComputeResources": [ + {"Name": "compute_resource1", "InstanceType": "c5.2xlarge", "MaxCount": 5}, + { + "Name": "compute_resource2", + "InstanceType": "c4.2xlarge", + "HealthChecks": {"Gpu": {"Enabled": True}}, + }, + ], + "HealthChecks": {"Gpu": {"Enabled": True}}, + }, + "", + True, + None, + True, + ), + # Health Checks sections are defined at queue level and in both compute resource + ( + { + "Name": "Standard-Queue", + "Networking": {"SubnetIds": ["subnet-12345678"]}, + "ComputeResources": [ + { + "Name": "compute_resource1", + "InstanceType": "c5.2xlarge", + "MaxCount": 5, + "HealthChecks": {"Gpu": {"Enabled": True}}, + }, + { + "Name": "compute_resource2", + "InstanceType": "c4.2xlarge", + "HealthChecks": {"Gpu": {"Enabled": True}}, + }, + ], + "HealthChecks": {"Gpu": {"Enabled": True}}, + }, + "", + True, + True, + True, + ), + # Gpu Health Check enable is defined using the true string value instead of the boolean value + ( + { + "Name": "Standard-Queue", + "Networking": {"SubnetIds": ["subnet-12345678"]}, + "ComputeResources": [ + {"Name": "compute_resource1", "InstanceType": "c5.2xlarge", "MaxCount": 5}, + {"Name": "compute_resource2", "InstanceType": "c4.2xlarge"}, + ], + "HealthChecks": {"Gpu": {"Enabled": "true"}}, + }, + "", + True, + None, + None, + ), + # Gpu Health Check enable is defined using the true integer value instead of the boolean value + ( + { + "Name": "Standard-Queue", + "Networking": {"SubnetIds": ["subnet-12345678"]}, + "ComputeResources": [ + {"Name": "compute_resource1", "InstanceType": "c5.2xlarge", "MaxCount": 5}, + {"Name": "compute_resource2", "InstanceType": "c4.2xlarge"}, + ], + "HealthChecks": {"Gpu": {"Enabled": 1}}, + }, + "", + True, + None, + None, + ), + # Gpu Health Check enable is defined using a string, and it doesn't represent a boolean + ( + { + "Name": "Standard-Queue", + "Networking": {"SubnetIds": ["subnet-12345678"]}, + "ComputeResources": [ + {"Name": "compute_resource1", "InstanceType": "c5.2xlarge", "MaxCount": 5}, + {"Name": "compute_resource2", "InstanceType": "c4.2xlarge"}, + ], + "HealthChecks": {"Gpu": {"Enabled": "vero"}}, + }, + "Not a valid boolean", + None, + None, + None, + ), + # Gpu Health Check enable is defined using 
an integer, and it doesn't represent a boolean + ( + { + "Name": "Standard-Queue", + "Networking": {"SubnetIds": ["subnet-12345678"]}, + "ComputeResources": [ + {"Name": "compute_resource1", "InstanceType": "c5.2xlarge", "MaxCount": 5}, + {"Name": "compute_resource2", "InstanceType": "c4.2xlarge"}, + ], + "HealthChecks": {"Gpu": {"Enabled": -1}}, + }, + "Not a valid boolean", + None, + None, + None, + ), + # Gpu Health Check enable is not defined + ( + { + "Name": "Standard-Queue", + "Networking": {"SubnetIds": ["subnet-12345678"]}, + "ComputeResources": [ + {"Name": "compute_resource1", "InstanceType": "c5.2xlarge", "MaxCount": 5}, + {"Name": "compute_resource2", "InstanceType": "c4.2xlarge"}, + ], + "HealthChecks": {"Gpu"}, + }, + "Invalid input type", + None, + None, + None, + ), + ], +) +def test_slurm_gpu_health_checks( + mocker, + config_dict, + failure_message, + expected_queue_gpu_hc, + expected_cr1_gpu_hc, + expected_cr2_gpu_hc, +): + mock_aws_api(mocker) + if failure_message: + with pytest.raises(ValidationError, match=failure_message): + SlurmQueueSchema().load(config_dict) + else: + queue = SlurmQueueSchema().load(config_dict) + assert_that(queue.health_checks.gpu.enabled).is_equal_to(expected_queue_gpu_hc) + assert_that(queue.compute_resources[0].health_checks.gpu.enabled).is_equal_to(expected_cr1_gpu_hc) + assert_that(queue.compute_resources[1].health_checks.gpu.enabled).is_equal_to(expected_cr2_gpu_hc) + + +@pytest.mark.parametrize( + "config_dict, failure_message", + [ + # Failures + ({"Keys": "my_key", "Value": "my_value"}, "Unknown field"), + ({"Key": "my_key"}, "Missing data for required field"), + ({"Value": "my_value"}, "Missing data for required field"), + ( + { + "Key": "my_test", + "Value": "my_value", + }, + None, + ), + ], +) +def test_queue_tag_schema(mocker, config_dict, failure_message): + mock_aws_api(mocker) + if failure_message: + with pytest.raises(ValidationError, match=failure_message): + QueueTagSchema().load(config_dict) + else: + conf = QueueTagSchema().load(config_dict) + QueueTagSchema().dump(conf) diff --git a/cli/tests/pcluster/schemas/test_cluster_schema/test_scheduler_constraints_for_custom_actions/awsbatch.on_node_configured.yaml b/cli/tests/pcluster/schemas/test_cluster_schema/test_scheduler_constraints_for_custom_actions/awsbatch.on_node_configured.yaml index 6d2c74842f..acf076bcce 100644 --- a/cli/tests/pcluster/schemas/test_cluster_schema/test_scheduler_constraints_for_custom_actions/awsbatch.on_node_configured.yaml +++ b/cli/tests/pcluster/schemas/test_cluster_schema/test_scheduler_constraints_for_custom_actions/awsbatch.on_node_configured.yaml @@ -8,10 +8,15 @@ HeadNode: KeyName: ec2-key-name CustomActions: OnNodeConfigured: - Script: https://test.tgz # s3:// | https:// - Args: - - arg1 - - arg2 + Sequence: + - Script: https://test.tgz # s3:// | https:// + Args: + - arg1 + - arg2 + - Script: https://test.tgz # s3:// | https:// + Args: + - arg1 + - arg2 Scheduling: Scheduler: awsbatch AwsBatchQueues: diff --git a/cli/tests/pcluster/schemas/test_cluster_schema/test_scheduler_constraints_for_custom_actions/slurm.on_node_start.yaml b/cli/tests/pcluster/schemas/test_cluster_schema/test_scheduler_constraints_for_custom_actions/slurm.on_node_start.yaml index 01f7e71bfb..072caaf56d 100644 --- a/cli/tests/pcluster/schemas/test_cluster_schema/test_scheduler_constraints_for_custom_actions/slurm.on_node_start.yaml +++ b/cli/tests/pcluster/schemas/test_cluster_schema/test_scheduler_constraints_for_custom_actions/slurm.on_node_start.yaml @@ -24,3 +24,14 
@@ Scheduling: InstanceType: c5.2xlarge - Name: compute_resource2 InstanceType: c4.2xlarge + CustomActions: + OnNodeStart: + Sequence: + - Script: https://test.tgz # s3:// | https:// + Args: + - arg1 + - arg2 + - Script: https://test.tgz # s3:// | https:// + Args: + - arg1 + - arg2 diff --git a/cli/tests/pcluster/schemas/test_common_schema.py b/cli/tests/pcluster/schemas/test_common_schema.py index beefca322b..a75144b548 100644 --- a/cli/tests/pcluster/schemas/test_common_schema.py +++ b/cli/tests/pcluster/schemas/test_common_schema.py @@ -18,6 +18,7 @@ ImdsSchema, LambdaFunctionsVpcConfigSchema, validate_json_format, + validate_no_duplicate_tag, validate_no_reserved_tag, ) @@ -111,3 +112,21 @@ def test_validate_no_reserved_tag(tags, failure_message): validate_no_reserved_tag(tags) else: validate_no_reserved_tag(tags) + + +@pytest.mark.parametrize( + "tags, failure_message", + [ + ([BaseTag(key="test1", value="test"), BaseTag(key="test2", value="test")], None), + ( + [BaseTag(key="test1", value="test"), BaseTag(key="test1", value="test")], + "Duplicate tag key \\(test1\\) detected. Tags keys should be unique within the Tags section.", + ), + ], +) +def test_validate_no_duplicate_tag(tags, failure_message): + if failure_message: + with pytest.raises(ValidationError, match=failure_message): + validate_no_duplicate_tag(tags) + else: + validate_no_duplicate_tag(tags) diff --git a/cli/tests/pcluster/templates/test_additional_packages.py b/cli/tests/pcluster/templates/test_additional_packages.py index 2c6c72a73e..20336eba2f 100644 --- a/cli/tests/pcluster/templates/test_additional_packages.py +++ b/cli/tests/pcluster/templates/test_additional_packages.py @@ -16,7 +16,7 @@ from pcluster.templates.cdk_builder import CDKTemplateBuilder from pcluster.utils import load_yaml_dict from tests.pcluster.aws.dummy_aws_api import mock_aws_api -from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket +from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket_object_utils from tests.pcluster.utils import get_resources @@ -29,12 +29,13 @@ ) def test_intel_hpc_platform(mocker, test_datadir, config_file_name, enabled): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) diff --git a/cli/tests/pcluster/templates/test_capacity_reservation.py b/cli/tests/pcluster/templates/test_capacity_reservation.py index 1417ae3da1..31ea8f7e23 100644 --- a/cli/tests/pcluster/templates/test_capacity_reservation.py +++ b/cli/tests/pcluster/templates/test_capacity_reservation.py @@ -18,7 +18,7 @@ from pcluster.templates.cdk_builder import CDKTemplateBuilder from pcluster.utils import load_yaml_dict from tests.pcluster.aws.dummy_aws_api import mock_aws_api -from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket +from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket_object_utils from tests.pcluster.utils import get_head_node_policy, get_statement_by_sid @@ -28,6 +28,7 @@ ) def test_capacity_reservation_id_permissions(mocker, test_datadir, config_file_name): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) mocker.patch( "pcluster.aws.ec2.Ec2Client.describe_capacity_reservations", 
@@ -44,7 +45,7 @@ def test_capacity_reservation_id_permissions(mocker, test_datadir, config_file_n cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) @@ -63,6 +64,7 @@ def test_capacity_reservation_id_permissions(mocker, test_datadir, config_file_n ) def test_capacity_reservation_group_arns_permissions(mocker, test_datadir, config_file_name): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) mocker.patch( "pcluster.aws.ec2.Ec2Client.describe_capacity_reservations", @@ -78,7 +80,7 @@ def test_capacity_reservation_group_arns_permissions(mocker, test_datadir, confi cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) diff --git a/cli/tests/pcluster/templates/test_cdk_artifacts_manager.py b/cli/tests/pcluster/templates/test_cdk_artifacts_manager.py new file mode 100644 index 0000000000..8639a51d1b --- /dev/null +++ b/cli/tests/pcluster/templates/test_cdk_artifacts_manager.py @@ -0,0 +1,48 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
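+# The tests below exercise CDKArtifactsManager.upload_assets: for every file asset listed in the synthesized CDK
+# cloud assembly, the manager is expected to upload the asset content to the cluster S3 bucket through the mocked
+# S3Bucket.upload_cfn_asset, serialized as minified JSON (see the assertion on upload_cfn_asset at the end).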
+import pytest +from aws_cdk.cloud_assembly_schema import FileAssetMetadataEntry + +from pcluster.models.s3_bucket import S3FileFormat +from pcluster.templates.cdk_artifacts_manager import CDKArtifactsManager +from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket, mock_bucket_object_utils + + +@pytest.mark.parametrize( + "file_assets, asset_content", + [ + ( + [ + FileAssetMetadataEntry( + path="asset_path", + id="asset_logical_id", + s3_bucket_parameter="asset_s3_bucket", + s3_key_parameter="asset_s3_key", + artifact_hash_parameter="asset_hash_parameter", + packaging="File", + source_hash="", + ) + ], + "asset_content", + ) + ], +) +def test_upload_assets(mocker, mock_cloud_assembly, file_assets, asset_content): + cloud_assembly = mock_cloud_assembly(assets=file_assets) + mock_bucket(mocker) + mock_dict = mock_bucket_object_utils(mocker) + mocker.patch("pcluster.templates.cdk_artifacts_manager.load_json_dict", return_value=asset_content) + bucket = dummy_cluster_bucket() + + cdk_assets_manager = CDKArtifactsManager(cloud_assembly) + cdk_assets_manager.upload_assets(bucket) + + bucket_upload_asset_mock = mock_dict.get("upload_cfn_asset") + bucket_upload_asset_mock.assert_called_with( + asset_file_content=asset_content, asset_name=file_assets[0].id, format=S3FileFormat.MINIFIED_JSON + ) diff --git a/cli/tests/pcluster/templates/test_cdk_builder_utils.py b/cli/tests/pcluster/templates/test_cdk_builder_utils.py index 1faa3f33bd..7fb6b3a624 100644 --- a/cli/tests/pcluster/templates/test_cdk_builder_utils.py +++ b/cli/tests/pcluster/templates/test_cdk_builder_utils.py @@ -33,7 +33,13 @@ ) from pcluster.utils import load_yaml_dict, split_resource_prefix from tests.pcluster.aws.dummy_aws_api import mock_aws_api -from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket +from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket, mock_bucket_object_utils +from tests.pcluster.utils import get_asset_content_with_resource_name + + +@pytest.fixture +def get_region(mocker): + mocker.patch("pcluster.config.cluster_config.get_region", return_value="WHATEVER_REGION") @pytest.mark.parametrize( @@ -94,12 +100,13 @@ def test_get_default_volume_tags(stack_name, node_type, raw_dict, expected_resul assert_that(get_default_volume_tags(stack_name, node_type, raw_dict)).is_equal_to(expected_result) +@pytest.mark.usefixtures("get_region") class TestCdkLaunchTemplateBuilder: @pytest.mark.parametrize( - "root_volume, image_os, expected_response", + "root_volume_parameters, image_os, expected_response", [ pytest.param( - RootVolume( + dict( size=10, encrypted=False, volume_type="mockVolumeType", @@ -196,7 +203,7 @@ class TestCdkLaunchTemplateBuilder: id="test with all root volume fields populated", ), pytest.param( - RootVolume( + dict( encrypted=True, volume_type="mockVolumeType", iops=15, @@ -293,7 +300,8 @@ class TestCdkLaunchTemplateBuilder: ), ], ) - def test_get_block_device_mappings(self, root_volume, image_os, expected_response): + def test_get_block_device_mappings(self, root_volume_parameters, image_os, expected_response): + root_volume = RootVolume(**root_volume_parameters) assert_that(CdkLaunchTemplateBuilder().get_block_device_mappings(root_volume, image_os)).is_equal_to( expected_response ) @@ -337,10 +345,10 @@ def test_get_instance_market_options(self, queue, compute_resource, expected_res ) @pytest.mark.parametrize( - "queue, compute_resource, expected_response", + "queue_parameters, compute_resource, expected_response", 
[ pytest.param( - SlurmQueue( + dict( name="queue1", capacity_reservation_target=CapacityReservationTarget( capacity_reservation_resource_group_arn="queue_cr_rg_arn", @@ -364,7 +372,7 @@ def test_get_instance_market_options(self, queue, compute_resource, expected_res id="test with queue and compute resource capacity reservation", ), pytest.param( - SlurmQueue( + dict( name="queue1", capacity_reservation_target=CapacityReservationTarget( capacity_reservation_id="queue_cr_id", @@ -385,7 +393,7 @@ def test_get_instance_market_options(self, queue, compute_resource, expected_res id="test with only queue capacity reservation", ), pytest.param( - SlurmQueue( + dict( name="queue1", compute_resources=[], networking=None, @@ -399,7 +407,8 @@ def test_get_instance_market_options(self, queue, compute_resource, expected_res ), ], ) - def test_get_capacity_reservation(self, queue, compute_resource, expected_response): + def test_get_capacity_reservation(self, queue_parameters, compute_resource, expected_response): + queue = SlurmQueue(**queue_parameters) assert_that(CdkLaunchTemplateBuilder().get_capacity_reservation(queue, compute_resource)).is_equal_to( expected_response ) @@ -438,10 +447,11 @@ def test_iam_resource_prefix_build_in_cdk(mocker, test_datadir, config_file_name ) # mock bucket initialization parameters mock_bucket(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, cdk_assets = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) @@ -449,13 +459,16 @@ def test_iam_resource_prefix_build_in_cdk(mocker, test_datadir, config_file_name if cluster_config.iam and cluster_config.iam.resource_prefix: iam_path_prefix, iam_name_prefix = split_resource_prefix(cluster_config.iam.resource_prefix) generated_template = generated_template["Resources"] - role_name_ref = generated_template["InstanceProfile15b342af42246b70"]["Properties"]["Roles"][0][ + asset_resource = get_asset_content_with_resource_name(cdk_assets, "InstanceProfile15b342af42246b70").get( + "Resources" + ) + role_name_ref = asset_resource["InstanceProfile15b342af42246b70"]["Properties"]["Roles"][0][ "Ref" ] # Role15b342af42246b70 role_name_hn_ref = generated_template["InstanceProfileHeadNode"]["Properties"]["Roles"][0]["Ref"] # RoleHeadNode # Checking their Path - _check_instance_roles_n_profiles(generated_template, iam_path_prefix, iam_name_prefix, role_name_ref, "RoleName") + _check_instance_roles_n_profiles(asset_resource, iam_path_prefix, iam_name_prefix, role_name_ref, "RoleName") _check_instance_roles_n_profiles(generated_template, iam_path_prefix, iam_name_prefix, role_name_hn_ref, "RoleName") # Instance Profiles---> Checking Instance Profile Names and Instance profiles Path @@ -463,11 +476,11 @@ def test_iam_resource_prefix_build_in_cdk(mocker, test_datadir, config_file_name generated_template, iam_path_prefix, iam_name_prefix, "InstanceProfileHeadNode", "InstanceProfileName" ) _check_instance_roles_n_profiles( - generated_template, iam_path_prefix, iam_name_prefix, "InstanceProfile15b342af42246b70", "InstanceProfileName" + asset_resource, iam_path_prefix, iam_name_prefix, "InstanceProfile15b342af42246b70", "InstanceProfileName" ) # PC Policies _check_policies( - generated_template, iam_name_prefix, 
"ParallelClusterPolicies15b342af42246b70", "parallelcluster", role_name_ref + asset_resource, iam_name_prefix, "ParallelClusterPolicies15b342af42246b70", "parallelcluster", role_name_ref ) _check_policies( generated_template, iam_name_prefix, "ParallelClusterPoliciesHeadNode", "parallelcluster", role_name_hn_ref @@ -489,7 +502,7 @@ def test_iam_resource_prefix_build_in_cdk(mocker, test_datadir, config_file_name ) # Slurm Policies _check_policies( - generated_template, + asset_resource, iam_name_prefix, "SlurmPolicies15b342af42246b70", "parallelcluster-slurm-compute", @@ -528,9 +541,7 @@ def test_iam_resource_prefix_build_in_cdk(mocker, test_datadir, config_file_name and cluster_config.scheduling.queues[0].iam and cluster_config.scheduling.queues[0].iam.s3_access ): - _check_policies( - generated_template, iam_name_prefix, "S3AccessPolicies15b342af42246b70", "S3Access", role_name_ref - ) + _check_policies(asset_resource, iam_name_prefix, "S3AccessPolicies15b342af42246b70", "S3Access", role_name_ref) def _check_instance_roles_n_profiles(generated_template, iam_path_prefix, iam_name_prefix, resource_name, key_name): diff --git a/cli/tests/pcluster/templates/test_cdk_manifest_reader/manifest.json b/cli/tests/pcluster/templates/test_cdk_manifest_reader/manifest.json new file mode 100644 index 0000000000..defbe5dc2d --- /dev/null +++ b/cli/tests/pcluster/templates/test_cdk_manifest_reader/manifest.json @@ -0,0 +1,24 @@ +{ + "artifacts": { + "TestStack": { + "type": "aws:cloudformation:stack", + "environment": "aws://unknown-account/unknown-region", + "metadata": { + "/TestStack": [ + { + "type": "aws:cdk:asset", + "data": { + "path": "asset_path", + "id": "asset_logical_id", + "packaging": "file", + "sourceHash": "asset_hash", + "s3BucketParameter": "asset_s3_bucket", + "s3KeyParameter": "asset_s3_key", + "artifactHashParameter": "asset_hash_parameter" + } + } + ] + } + } + } +} \ No newline at end of file diff --git a/cli/tests/pcluster/templates/test_cluster_stack.py b/cli/tests/pcluster/templates/test_cluster_stack.py index 4579014ae7..965d7481f7 100644 --- a/cli/tests/pcluster/templates/test_cluster_stack.py +++ b/cli/tests/pcluster/templates/test_cluster_stack.py @@ -8,6 +8,7 @@ # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. 
+import difflib import json import os import re @@ -24,20 +25,24 @@ MAX_EBS_COUNT, MAX_EXISTING_STORAGE_COUNT, MAX_NEW_STORAGE_COUNT, - MAX_NUMBER_OF_COMPUTE_RESOURCES, + MAX_NUMBER_OF_COMPUTE_RESOURCES_PER_CLUSTER, MAX_NUMBER_OF_QUEUES, ) +from pcluster.models.s3_bucket import S3FileFormat, format_content from pcluster.schemas.cluster_schema import ClusterSchema from pcluster.templates.cdk_builder import CDKTemplateBuilder from pcluster.utils import load_json_dict, load_yaml_dict from tests.pcluster.aws.dummy_aws_api import mock_aws_api -from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket +from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket, mock_bucket_object_utils from tests.pcluster.utils import ( assert_lambdas_have_expected_vpc_config_and_managed_policy, + get_asset_content_with_resource_name, load_cluster_model_from_yaml, ) EXAMPLE_CONFIGS_DIR = f"{os.path.abspath(os.path.join(__file__, '..', '..'))}/example_configs" +MAX_SIZE_OF_CFN_TEMPLATE = 1024 * 1024 +MAX_RESOURCES_PER_TEMPLATE = 500 @pytest.mark.parametrize( @@ -58,12 +63,36 @@ def test_cluster_builder_from_configuration_file( mock_aws_api(mocker) # mock bucket initialization parameters mock_bucket(mocker) + mock_bucket_object_utils(mocker) # Search config file from example_configs folder to test standard configuration _, cluster = load_cluster_model_from_yaml(config_file_name) _generate_template(cluster, capsys) +def _assert_config_snapshot(config, expected_full_config_path): + """ + Confirm that no new configuration sections were added / removed. + + If any sections were added/removed: + 1. Add the section to the "slurm.full.all_resources.yaml" file + 2. Generate a new snapshot using the test output + TODO: Use a snapshot testing library + """ + cluster_name = "test_cluster" + full_config = ClusterSchema(cluster_name).dump(config) + full_config_yaml = yaml.dump(full_config) + + with open(expected_full_config_path, "r") as expected_full_config_file: + expected_full_config = expected_full_config_file.read() + diff = difflib.unified_diff( + full_config_yaml.splitlines(keepends=True), expected_full_config.splitlines(keepends=True) + ) + print("Diff between existing snapshot and new snapshot:") + print("".join(diff), end="") + assert_that(expected_full_config).is_equal_to(full_config_yaml) + + def test_cluster_config_limits(mocker, capsys, tmpdir, pcluster_config_reader, test_datadir): """ Build CFN template starting from config examples and assert CFN limits (file size and number of resources). @@ -74,6 +103,14 @@ def test_cluster_config_limits(mocker, capsys, tmpdir, pcluster_config_reader, t mock_aws_api(mocker) # mock bucket initialization parameters mock_bucket(mocker) + mock_bucket_object_utils(mocker) + + # The max number of queues cannot be used with the max number of compute resources + # (it will exceed the max number of compute resources per cluster) + # This workaround uses half of the max number of queues. It then calculates the number of compute resources to use + # as the quotient of dividing the max number of compute resources per cluster by the half. 
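+ # Illustrative example only (the real constant values are defined elsewhere and may change): if
+ # MAX_NUMBER_OF_QUEUES were 100 and MAX_NUMBER_OF_COMPUTE_RESOURCES_PER_CLUSTER were 150, this would render
+ # 100 // 2 = 50 queues with 150 // 50 = 3 compute resources each, staying within both limits.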
+ max_number_of_queues = MAX_NUMBER_OF_QUEUES // 2 + max_number_of_crs = MAX_NUMBER_OF_COMPUTE_RESOURCES_PER_CLUSTER // max_number_of_queues # Try to search for jinja templates in the test_datadir, this is mainly to verify pcluster limits rendered_config_file = pcluster_config_reader( @@ -82,9 +119,9 @@ def test_cluster_config_limits(mocker, capsys, tmpdir, pcluster_config_reader, t max_new_storage_count=MAX_NEW_STORAGE_COUNT, max_existing_storage_count=MAX_EXISTING_STORAGE_COUNT, # number of queues, compute resources and security groups highly impacts the size of AWS resources - max_number_of_queues=MAX_NUMBER_OF_QUEUES, - max_number_of_ondemand_crs=MAX_NUMBER_OF_COMPUTE_RESOURCES, - max_number_of_spot_crs=MAX_NUMBER_OF_COMPUTE_RESOURCES - 2, # FIXME: Limit num of CRs to not exceed size limits + max_number_of_queues=max_number_of_queues, + max_number_of_ondemand_crs=max_number_of_crs, + max_number_of_spot_crs=max_number_of_crs, number_of_sg_per_queue=1, # The number of following items doesn't impact number of resources, but the size of the template. # We have to reduce number of tags, script args and remove dev settings to reduce template size, @@ -95,27 +132,45 @@ def test_cluster_config_limits(mocker, capsys, tmpdir, pcluster_config_reader, t ) input_yaml, cluster = load_cluster_model_from_yaml(rendered_config_file, test_datadir) + # Confirm that the configuration file is not missing sections that would impact the size of the templates + expected_full_config_path = test_datadir / "slurm.full_config.snapshot.yaml" + _assert_config_snapshot(cluster, expected_full_config_path) + # Generate CFN template file - output_yaml = yaml.dump(_generate_template(cluster, capsys)) - output_path = str(tmpdir / "generated_cfn_template.yaml") - with open(output_path, "w") as output_file: - output_file.write(output_yaml) + cluster_template, assets = _generate_template(cluster, capsys) + cluster_template_as_yaml = format_content(cluster_template, S3FileFormat.YAML) # Main template is YAML formatted + assets_as_json = [ + format_content(asset, S3FileFormat.MINIFIED_JSON) # Nested templates/assets as JSON Minified + for asset in assets + ] - # Assert that size of the template doesn't exceed 1MB and number of resources doesn't exceed 500 - # Note the configuration file defined in the test_datadir is very close to the limit of 500 resources - assert_that(os.stat(output_path).st_size).is_less_than(1024 * 1024) - matches = len(re.findall("Type.*AWS::", str(output_yaml))) - assert_that(matches).is_less_than(500) + for template in [cluster_template_as_yaml] + assets_as_json: + output_path = str(tmpdir / "generated_cfn_template") + with open(output_path, "w") as output_file: + output_file.write(template) + _assert_template_limits(output_path, template) + + +def _assert_template_limits(template_path: str, template_content: str): + """ + Assert that size of the template doesn't exceed 1MB and number of resources doesn't exceed 500. 
+ + :param template_path: path to the generated cfn template + :param template_content: content of the generated cfn template + """ + assert_that(os.stat(template_path).st_size).is_less_than(MAX_SIZE_OF_CFN_TEMPLATE) + matches = len(re.findall("Type.*AWS::", str(template_content))) + assert_that(matches).is_less_than(MAX_RESOURCES_PER_TEMPLATE) def _generate_template(cluster, capsys): # Try to build the template - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, assets_metadata = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster, bucket=dummy_cluster_bucket(), stack_name="clustername" ) + cluster_assets = [asset["content"] for asset in assets_metadata] _, err = capsys.readouterr() assert_that(err).is_empty() # Assertion failure may become an update of dependency warning deprecations. - return generated_template + return generated_template, cluster_assets @pytest.mark.parametrize( @@ -133,9 +188,10 @@ def test_add_alarms(mocker, config_file_name): mock_aws_api(mocker) # mock bucket initialization parameters mock_bucket(mocker) + mock_bucket_object_utils(mocker) input_yaml, cluster = load_cluster_model_from_yaml(config_file_name) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster, bucket=dummy_cluster_bucket(), stack_name="clustername" ) output_yaml = yaml.dump(generated_template, width=float("inf")) @@ -165,12 +221,25 @@ def test_add_alarms(mocker, config_file_name): "ParallelClusterStackId": {"Ref": "AWS::StackId"}, "VpcId": "vpc-123", "HeadNodeRoleName": {"Ref": "RoleHeadNode"}, - "ComputeFleetRoleNames": {"Ref": "Role15b342af42246b70"}, + "ComputeFleetRoleNames": { + "Fn::GetAtt": [ + "ComputeFleetQueueBatch0QueueGroup0NestedStackQueueGroup0NestedStackResource356F7DC3", + "Outputs.clusternameComputeFleetQueueBatch0QueueGroup0Role15b342af42246b70E9AB1575Ref", + ] + }, "LaunchTemplate1f8c19f38f8d4f7fVersion": { - "Fn::GetAtt": ["ComputeFleetLaunchTemplate1f8c19f38f8d4f7f3489FB83", "LatestVersionNumber"] + "Fn::GetAtt": [ + "ComputeFleetQueueBatch0QueueGroup0NestedStackQueueGroup0NestedStackResource356F7DC3", + "Outputs.clusternameComputeFleetQueueBatch0QueueGroup0LaunchTemplate1f8c19f38f8d4f7f658" + + "C4380LatestVersionNumber", + ] }, "LaunchTemplateA6f65dee6703df4aVersion": { - "Fn::GetAtt": ["ComputeFleetLaunchTemplateA6f65dee6703df4a27E3DD2A", "LatestVersionNumber"] + "Fn::GetAtt": [ + "ComputeFleetQueueBatch0QueueGroup0NestedStackQueueGroup0NestedStackResource356F7DC3", + "Outputs.clusternameComputeFleetQueueBatch0QueueGroup0LaunchTemplateA6f65dee6703df4a05B" + + "14750LatestVersionNumber", + ] }, }, }, @@ -189,12 +258,25 @@ def test_add_alarms(mocker, config_file_name): "ParallelClusterStackId": {"Ref": "AWS::StackId"}, "VpcId": "vpc-123", "HeadNodeRoleName": "", - "ComputeFleetRoleNames": {"Ref": "Role15b342af42246b70"}, + "ComputeFleetRoleNames": { + "Fn::GetAtt": [ + "ComputeFleetQueueBatch0QueueGroup0NestedStackQueueGroup0NestedStackResource356F7DC3", + "Outputs.clusternameComputeFleetQueueBatch0QueueGroup0Role15b342af42246b70E9AB1575Ref", + ] + }, "LaunchTemplate1f8c19f38f8d4f7fVersion": { - "Fn::GetAtt": ["ComputeFleetLaunchTemplate1f8c19f38f8d4f7f3489FB83", "LatestVersionNumber"] + "Fn::GetAtt": [ + "ComputeFleetQueueBatch0QueueGroup0NestedStackQueueGroup0NestedStackResource356F7DC3", + "Outputs.clusternameComputeFleetQueueBatch0QueueGroup0LaunchTemplate1f8c19f38f8d4f7f658" + + "C4380LatestVersionNumber", + ] }, "LaunchTemplateA6f65dee6703df4aVersion": { - "Fn::GetAtt": 
["ComputeFleetLaunchTemplateA6f65dee6703df4a27E3DD2A", "LatestVersionNumber"] + "Fn::GetAtt": [ + "ComputeFleetQueueBatch0QueueGroup0NestedStackQueueGroup0NestedStackResource356F7DC3", + "Outputs.clusternameComputeFleetQueueBatch0QueueGroup0LaunchTemplateA6f65dee6703df4a05B" + + "14750LatestVersionNumber", + ] }, }, }, @@ -215,13 +297,25 @@ def test_add_alarms(mocker, config_file_name): "HeadNodeRoleName": "", "ComputeFleetRoleNames": "", "LaunchTemplate1f8c19f38f8d4f7fVersion": { - "Fn::GetAtt": ["ComputeFleetLaunchTemplate1f8c19f38f8d4f7f3489FB83", "LatestVersionNumber"] - }, - "LaunchTemplateA6f65dee6703df4aVersion": { - "Fn::GetAtt": ["ComputeFleetLaunchTemplateA6f65dee6703df4a27E3DD2A", "LatestVersionNumber"] + "Fn::GetAtt": [ + "ComputeFleetQueueBatch0QueueGroup0NestedStackQueueGroup0NestedStackResource356F7DC3", + "Outputs.clusternameComputeFleetQueueBatch0QueueGroup0LaunchTemplate1f8c19f38f8d4f7f658" + + "C4380LatestVersionNumber", + ] }, "LaunchTemplate7916067054f91933Version": { - "Fn::GetAtt": ["ComputeFleetLaunchTemplate7916067054f919332FB9590D", "LatestVersionNumber"] + "Fn::GetAtt": [ + "ComputeFleetQueueBatch0QueueGroup0NestedStackQueueGroup0NestedStackResource356F7DC3", + "Outputs.clusternameComputeFleetQueueBatch0QueueGroup0LaunchTemplate7916067054f919335AF" + + "28643LatestVersionNumber", + ] + }, + "LaunchTemplateA6f65dee6703df4aVersion": { + "Fn::GetAtt": [ + "ComputeFleetQueueBatch0QueueGroup0NestedStackQueueGroup0NestedStackResource356F7DC3", + "Outputs.clusternameComputeFleetQueueBatch0QueueGroup0LaunchTemplateA6f65dee6703df4a05B" + + "14750LatestVersionNumber", + ] }, }, }, @@ -240,18 +334,39 @@ def test_add_alarms(mocker, config_file_name): "ParallelClusterStackId": {"Ref": "AWS::StackId"}, "VpcId": "vpc-123", "HeadNodeRoleName": "", - "ComputeFleetRoleNames": {"Ref": "Role15b342af42246b70"}, - "LaunchTemplate1f8c19f38f8d4f7fVersion": { - "Fn::GetAtt": ["ComputeFleetLaunchTemplate1f8c19f38f8d4f7f3489FB83", "LatestVersionNumber"] + "ComputeFleetRoleNames": { + "Fn::GetAtt": [ + "ComputeFleetQueueBatch0QueueGroup0NestedStackQueueGroup0NestedStackResource356F7DC3", + "Outputs.clusternameComputeFleetQueueBatch0QueueGroup0Role15b342af42246b70E9AB1575Ref", + ] }, - "LaunchTemplateA6f65dee6703df4aVersion": { - "Fn::GetAtt": ["ComputeFleetLaunchTemplateA6f65dee6703df4a27E3DD2A", "LatestVersionNumber"] + "LaunchTemplate1f8c19f38f8d4f7fVersion": { + "Fn::GetAtt": [ + "ComputeFleetQueueBatch0QueueGroup0NestedStackQueueGroup0NestedStackResource356F7DC3", + "Outputs.clusternameComputeFleetQueueBatch0QueueGroup0LaunchTemplate1f8c19f38f8d4f7f658" + + "C4380LatestVersionNumber", + ] }, "LaunchTemplate7916067054f91933Version": { - "Fn::GetAtt": ["ComputeFleetLaunchTemplate7916067054f919332FB9590D", "LatestVersionNumber"] + "Fn::GetAtt": [ + "ComputeFleetQueueBatch0QueueGroup0NestedStackQueueGroup0NestedStackResource356F7DC3", + "Outputs.clusternameComputeFleetQueueBatch0QueueGroup0LaunchTemplate7916067054f919335AF" + + "28643LatestVersionNumber", + ] }, "LaunchTemplateA46d18b906a50d3aVersion": { - "Fn::GetAtt": ["ComputeFleetLaunchTemplateA46d18b906a50d3a347605B0", "LatestVersionNumber"] + "Fn::GetAtt": [ + "ComputeFleetQueueBatch0QueueGroup0NestedStackQueueGroup0NestedStackResource356F7DC3", + "Outputs.clusternameComputeFleetQueueBatch0QueueGroup0LaunchTemplateA46d18b906a50d3a3A2" + + "D0E8FLatestVersionNumber", + ] + }, + "LaunchTemplateA6f65dee6703df4aVersion": { + "Fn::GetAtt": [ + "ComputeFleetQueueBatch0QueueGroup0NestedStackQueueGroup0NestedStackResource356F7DC3", + 
"Outputs.clusternameComputeFleetQueueBatch0QueueGroup0LaunchTemplateA6f65dee6703df4a05B" + + "14750LatestVersionNumber", + ] }, }, }, @@ -263,11 +378,13 @@ def test_scheduler_plugin_substack(mocker, config_file_name, expected_scheduler_ mock_aws_api(mocker) # mock bucket initialization parameters mock_bucket(mocker) + mock_bucket_object_utils(mocker) + if config_file_name == "scheduler_plugin.full.yaml": input_yaml, cluster = load_cluster_model_from_yaml(config_file_name) else: input_yaml, cluster = load_cluster_model_from_yaml(config_file_name, test_datadir) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster, bucket=dummy_cluster_bucket(), stack_name="clustername" ) print(yaml.dump(generated_template)) @@ -393,14 +510,17 @@ def test_compute_launch_template_properties( ) # mock bucket initialization parameters mock_bucket(mocker) + mock_bucket_object_utils(mocker) input_yaml, cluster = load_cluster_model_from_yaml(config_file_name, test_datadir) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, cdk_assets = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster, bucket=dummy_cluster_bucket(), stack_name="clustername" ) + asset_content = get_asset_content_with_resource_name(cdk_assets, "LaunchTemplate64e1c3597ca4c326") + for lt_assertion in lt_assertions: - lt_assertion.assert_lt_properties(generated_template, "ComputeFleetLaunchTemplate64e1c3597ca4c32652225395") + lt_assertion.assert_lt_properties(asset_content, "LaunchTemplate64e1c3597ca4c326") @pytest.mark.parametrize( @@ -420,36 +540,18 @@ def test_compute_launch_template_properties( "scheduler-plugin-headnode-hooks-partial.yaml", { "scheduler": "plugin", - "postinstall": "https://test.tgz", - "postinstall_args": "arg1 arg2", - "preinstall": "NONE", - "preinstall_args": "NONE", - "postupdate": "https://test2.tgz", - "postupdate_args": "arg3 arg4", }, ), ( "awsbatch-headnode-hooks-partial.yaml", { "scheduler": "awsbatch", - "postinstall": "NONE", - "postinstall_args": "NONE", - "preinstall": "https://test.tgz", - "preinstall_args": "arg1 arg2", - "postupdate": "NONE", - "postupdate_args": "NONE", }, ), ( "slurm-headnode-hooks-full.yaml", { "scheduler": "slurm", - "postinstall": "https://test2.tgz", - "postinstall_args": "arg3 arg4", - "preinstall": "https://test.tgz", - "preinstall_args": "arg1 arg2", - "postupdate": "https://test3.tgz", - "postupdate_args": "arg5 arg6", }, ), ], @@ -460,12 +562,13 @@ def test_head_node_dna_json(mocker, test_datadir, config_file_name, expected_hea default_head_node_dna_json = load_json_dict(test_datadir / "head_node_default.dna.json") mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) @@ -510,8 +613,10 @@ def test_head_node_bootstrap_timeout(mocker, config_file_name, expected_head_nod mock_aws_api(mocker) # mock bucket initialization parameters mock_bucket(mocker) + mock_bucket_object_utils(mocker) + input_yaml, cluster = load_cluster_model_from_yaml(config_file_name) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = 
CDKTemplateBuilder().build_cluster_template( cluster_config=cluster, bucket=dummy_cluster_bucket(), stack_name="clustername" ) assert_that( @@ -578,8 +683,9 @@ def _get_cfn_init_file_content(template, resource, file): def test_head_node_tags_from_launch_template(mocker, config_file_name, expected_instance_tags, expected_volume_tags): mock_aws_api(mocker) mock_bucket(mocker) + mock_bucket_object_utils(mocker) input_yaml, cluster = load_cluster_model_from_yaml(config_file_name) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster, bucket=dummy_cluster_bucket(), stack_name="clustername" ) tags_specifications = ( @@ -660,8 +766,9 @@ def test_head_node_tags_from_launch_template(mocker, config_file_name, expected_ def test_head_node_tags_from_instance_definition(mocker, config_file_name, expected_tags): mock_aws_api(mocker) mock_bucket(mocker) + mock_bucket_object_utils(mocker) input_yaml, cluster = load_cluster_model_from_yaml(config_file_name) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster, bucket=dummy_cluster_bucket(), stack_name="clustername" ) tags = generated_template.get("Resources").get("HeadNode").get("Properties").get("Tags", []) @@ -689,6 +796,7 @@ def test_head_node_tags_from_instance_definition(mocker, config_file_name, expec ) def test_cluster_imds_settings(mocker, config_file_name, imds_support, http_tokens): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(f"{EXAMPLE_CONFIGS_DIR}/{config_file_name}") if imds_support: @@ -696,7 +804,7 @@ def test_cluster_imds_settings(mocker, config_file_name, imds_support, http_toke cluster = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster, bucket=dummy_cluster_bucket(), stack_name="clustername" ) @@ -725,6 +833,7 @@ def test_cluster_imds_settings(mocker, config_file_name, imds_support, http_toke ) def test_cluster_lambda_functions_vpc_config(mocker, config_file_name, vpc_config): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(f"{EXAMPLE_CONFIGS_DIR}/{config_file_name}") if vpc_config: @@ -733,8 +842,51 @@ def test_cluster_lambda_functions_vpc_config(mocker, config_file_name, vpc_confi cluster = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster, bucket=dummy_cluster_bucket(), stack_name="clustername" ) assert_lambdas_have_expected_vpc_config_and_managed_policy(generated_template, vpc_config) + + +@pytest.mark.parametrize( + "no_of_compute_resources_per_queue, expected_no_of_nested_stacks, raises_error", + [ + ({f"queue-{i}": 5 for i in range(10)}, 2, False), + ({f"queue-{i}": 5 for i in range(20)}, 3, False), + ({f"queue-{i}": 5 for i in range(30)}, 4, False), + ({f"queue-{i}": 40 for i in range(1)}, 1, False), + # 1 queue with 41 compute resources (Exceeds max compute resources per queue - 40) + ({f"queue-{i}": 41 for i in range(1)}, 1, True), + ], + ids=[ + "10 queues with 5 compute resources each", + "20 queues with 5 compute resources each", + "30 queues with 5 compute resources each", + "1 queue with 40 compute 
resources each",
+        "1 queue with 41 compute resources",
+    ],
+)
+def test_cluster_resource_distribution_in_stacks(
+    test_datadir,
+    pcluster_config_reader,
+    capsys,
+    mocker,
+    no_of_compute_resources_per_queue: dict,
+    expected_no_of_nested_stacks: int,
+    raises_error: bool,
+):
+    mock_aws_api(mocker)
+    mock_bucket_object_utils(mocker)
+    rendered_config_file = pcluster_config_reader(
+        "variable_queue_compute_resources.yaml", no_of_compute_resources_per_queue=no_of_compute_resources_per_queue
+    )
+
+    input_yaml, cluster = load_cluster_model_from_yaml(rendered_config_file, test_datadir)
+
+    if raises_error:
+        with pytest.raises(ValueError):
+            _generate_template(cluster, capsys)
+    else:
+        cluster_template, assets = _generate_template(cluster, capsys)
+        assert_that(len(assets)).is_equal_to(expected_no_of_nested_stacks)
diff --git a/cli/tests/pcluster/templates/test_cluster_stack/test_cluster_config_limits/slurm.full.all_resources.yaml b/cli/tests/pcluster/templates/test_cluster_stack/test_cluster_config_limits/slurm.full.all_resources.yaml
index 96a36ae991..4e625358a0 100644
--- a/cli/tests/pcluster/templates/test_cluster_stack/test_cluster_config_limits/slurm.full.all_resources.yaml
+++ b/cli/tests/pcluster/templates/test_cluster_stack/test_cluster_config_limits/slurm.full.all_resources.yaml
@@ -78,6 +78,11 @@ Scheduling:
 {% for j in range(max_number_of_ondemand_crs) %}
         - Name: compute-resource-ondemand-{{ i }}{{ j }}
           InstanceType: c5.xlarge
+          Tags:
+          {% for i in range(number_of_tags) %}
+            - Key: computetag{{ i }}
+              Value: computetag{{ i }}
+          {% endfor %}
 {% endfor %}
       CustomActions:
         OnNodeStart:
@@ -101,6 +106,11 @@ Scheduling:
           - Policy: arn:aws:iam::aws:policy/AdministratorAccess
       Image:
         CustomAmi: ami-12345678
+      Tags:
+      {% for i in range(number_of_tags) %}
+        - Key: String{{ i }}
+          Value: String{{ i }}
+      {% endfor %}
 {% endfor %}

   # spot queues
@@ -139,11 +149,25 @@ Scheduling:
           Efa:
             Enabled: true
             GdrSupport: false
+          Networking:
+            PlacementGroup:
+              Enabled: true
+              Id: String
+          Tags:
+          {% for i in range(number_of_tags) %}
+            - Key: computetag{{ i }}
+              Value: computetag{{ i }}
+          {% endfor %}
 {% endfor %}
       Iam:
         InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile
       Image:
         CustomAmi: ami-23456789
+      Tags:
+      {% for i in range(number_of_tags) %}
+        - Key: String{{ i }}
+          Value: String{{ i }}
+      {% endfor %}
 {% endfor %}

 SharedStorage:
@@ -282,4 +306,4 @@ DevSettings:
   Timeouts:
     HeadNodeBootstrapTimeout: 1201
     ComputeNodeBootstrapTimeout: 1001
-{% endif %}
\ No newline at end of file
+{% endif %}
diff --git a/cli/tests/pcluster/templates/test_cluster_stack/test_cluster_config_limits/slurm.full_config.snapshot.yaml b/cli/tests/pcluster/templates/test_cluster_stack/test_cluster_config_limits/slurm.full_config.snapshot.yaml
new file mode 100644
index 0000000000..6ab93fe7f8
--- /dev/null
+++ b/cli/tests/pcluster/templates/test_cluster_stack/test_cluster_config_limits/slurm.full_config.snapshot.yaml
@@ -0,0 +1,6687 @@
+AdditionalPackages:
+  IntelSoftware:
+    IntelHpcPlatform: false
+AdditionalResources: https://template.url
+CustomS3Bucket: String
+DeploymentSettings: null
+DevSettings: null
+DirectoryService:
+  AdditionalSssdConfigs:
+    parameter_1: value_1
+    parameter_2: value_2
+  DomainAddr: string
+  DomainName: string
+  DomainReadOnlyUser: string
+  GenerateSshKeysForUsers: false
+  LdapAccessFilter: string
+  LdapTlsCaCert: string
+  LdapTlsReqCert: never
+  PasswordSecretArn: arn:aws:secretsmanager:us-east-1:111111111111:secret:Secret-xxxxxxxx-xxxxx
+HeadNode:
+  CustomActions:
OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + OnNodeUpdated: null + Dcv: + AllowedIps: 0.0.0.0/0 + Enabled: true + Port: 8443 + DisableSimultaneousMultithreading: false + Iam: + AdditionalIamPolicies: [] + InstanceProfile: null + InstanceRole: arn:aws:iam::aws:role/CustomHeadNodeRole + S3Access: null + Image: + CustomAmi: ami-98765432 + Imds: + Secured: true + InstanceType: t2.micro + LocalStorage: + EphemeralVolume: + MountDir: /test + RootVolume: + DeleteOnTermination: true + Encrypted: true + Iops: 100 + Size: 40 + Throughput: null + VolumeType: gp2 + Networking: + AdditionalSecurityGroups: + - sg-34567890 + - sg-34567891 + - sg-34567892 + - sg-34567893 + - sg-34567894 + - sg-34567895 + - sg-34567896 + - sg-34567897 + - sg-34567898 + - sg-34567899 + ElasticIp: true + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: null + SubnetId: subnet-12345678 + Ssh: + AllowedIps: 1.2.3.4/32 + KeyName: ec2-key-name +Iam: + PermissionsBoundary: null + ResourcePrefix: null + Roles: + LambdaFunctionsRole: arn:aws:iam::aws:role/CustomResourcesLambdaRole +Image: + CustomAmi: ami-12345678 + Os: centos7 +Imds: + ImdsSupport: v1.0 +Monitoring: + Dashboards: + CloudWatch: + Enabled: true + DetailedMonitoring: true + Logs: + CloudWatch: + DeletionPolicy: Retain + Enabled: true + RetentionInDays: 30 + Rotation: + Enabled: true +Region: us-east-1 +Scheduling: + Scheduler: slurm + SlurmQueues: + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-00 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-01 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-02 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-0 + 
Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-10 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-11 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-12 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-1 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-20 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-21 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: 
null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-22 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-2 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-30 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-31 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-32 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-3 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 
+ Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-40 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-41 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-42 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-4 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-50 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-51 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + 
MinCount: 0 + Name: compute-resource-ondemand-52 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-5 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-60 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-61 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-62 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-6 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + 
DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-70 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-71 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-72 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-7 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-80 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-81 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-82 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + 
Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-8 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-90 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-91 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-92 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-9 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: 
compute-resource-ondemand-100 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-101 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-102 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-10 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-110 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-111 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-112 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + 
CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-11 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-120 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-121 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-122 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-12 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-130 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: 
computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-131 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-132 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-13 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-140 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-141 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-142 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + 
Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-14 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-150 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-151 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-152 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-15 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-160 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: 
null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-161 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-162 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-16 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-170 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-171 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-172 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + 
EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-17 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-180 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-181 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-182 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-18 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-190 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-191 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + 
SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-192 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-19 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-200 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-201 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-202 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-20 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + 
PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-210 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-211 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-212 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-21 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-220 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-221 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + 
DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-222 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-22 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-230 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-231 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-232 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-23 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + 
Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-240 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-241 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: false + Efa: + Enabled: false + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.xlarge + MaxCount: 10 + MinCount: 0 + Name: compute-resource-ondemand-242 + Networking: + PlacementGroup: + Enabled: null + Id: null + Name: null + SchedulableMemory: null + SpotPrice: null + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: null + RootVolume: + Encrypted: true + Iops: 3000 + Size: null + Throughput: 125 + VolumeType: gp3 + CustomActions: + OnNodeConfigured: + Args: + - arg0 + Script: https://test.tgz + OnNodeStart: + Args: + - arg0 + Script: https://test.tgz + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AdministratorAccess + InstanceProfile: null + InstanceRole: null + S3Access: + - BucketName: string1 + EnableWriteAccess: false + KeyName: null + Image: + CustomAmi: ami-12345678 + Name: queue-ondemand-24 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: null + PlacementGroup: + Enabled: null + Id: null + Name: null + Proxy: null + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-00 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-01 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: 
compute-resource-spot-02 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-0 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-10 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-11 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-12 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-1 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-20 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + 
SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-21 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-22 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-2 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-30 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-31 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-32 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: 
+ CustomAmi: ami-23456789 + Name: queue-spot-3 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-40 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-41 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-42 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-4 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-50 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-51 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + 
DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-52 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-5 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-60 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-61 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-62 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-6 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + 
Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-70 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-71 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-72 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-7 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-80 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-81 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-82 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + 
Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-8 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-90 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-91 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-92 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-9 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-100 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-101 + Networking: + PlacementGroup: + Enabled: true + Id: String + 
Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-102 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-10 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-110 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-111 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-112 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-11 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: 
+ - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-120 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-121 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-122 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-12 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-130 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-131 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-132 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + 
MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-13 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-140 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-141 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-142 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-14 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-150 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + 
HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-151 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-152 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-15 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-160 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-161 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-162 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-16 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port 
+ SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-170 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-171 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-172 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-17 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-180 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-181 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-182 + Networking: + PlacementGroup: + 
Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-18 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-190 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-191 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-192 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-19 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-200 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: 
computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-201 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-202 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-20 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-210 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-211 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-212 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-21 + 
Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-220 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-221 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-222 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-22 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-230 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-231 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true 
+ GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-232 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-23 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + - CapacityReservationTarget: null + CapacityType: ONDEMAND + ComputeResources: + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-240 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-241 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + - CapacityReservationTarget: null + CustomSlurmSettings: {} + DisableSimultaneousMultithreading: true + Efa: + Enabled: true + GdrSupport: false + HealthChecks: + Gpu: + Enabled: null + InstanceType: c5.2xlarge + MaxCount: 15 + MinCount: 1 + Name: compute-resource-spot-242 + Networking: + PlacementGroup: + Enabled: true + Id: String + Name: null + SchedulableMemory: null + SpotPrice: 1.1 + Tags: + - Key: computetag0 + Value: computetag0 + ComputeSettings: + LocalStorage: + EphemeralVolume: + MountDir: /scratch + RootVolume: + Encrypted: true + Iops: 100 + Size: 35 + Throughput: null + VolumeType: gp2 + CustomActions: null + CustomSlurmSettings: {} + HealthChecks: + Gpu: + Enabled: null + Iam: + AdditionalIamPolicies: [] + InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile + InstanceRole: null + S3Access: null + Image: + CustomAmi: ami-23456789 + Name: queue-spot-24 + Networking: + AdditionalSecurityGroups: null + AssignPublicIp: true + PlacementGroup: + Enabled: true + Id: String + Name: null + Proxy: + HttpProxyAddress: https://proxy-address:port + SecurityGroups: + - sg-34567890 + SubnetIds: + - subnet-12345678 + Tags: + - Key: String0 + Value: String0 + SlurmSettings: + CustomSlurmSettings: null + CustomSlurmSettingsIncludeFile: null + Database: null + Dns: + DisableManagedDns: false + HostedZoneId: null + UseEc2Hostnames: false + EnableMemoryBasedScheduling: false + QueueUpdateStrategy: TERMINATE + ScaledownIdletime: 10 +SharedStorage: +- EbsSettings: + 
DeletionPolicy: Retain + Encrypted: true + Iops: 100 + KmsKeyId: String + Raid: null + Size: 150 + SnapshotId: null + Throughput: null + VolumeId: null + VolumeType: gp2 + MountDir: /new/mount/efs0 + Name: nameebs0 + StorageType: Ebs +- EbsSettings: + DeletionPolicy: Retain + Encrypted: true + Iops: 100 + KmsKeyId: String + Raid: null + Size: 150 + SnapshotId: null + Throughput: null + VolumeId: null + VolumeType: gp2 + MountDir: /new/mount/efs1 + Name: nameebs1 + StorageType: Ebs +- EbsSettings: + DeletionPolicy: Retain + Encrypted: true + Iops: 100 + KmsKeyId: String + Raid: null + Size: 150 + SnapshotId: null + Throughput: null + VolumeId: null + VolumeType: gp2 + MountDir: /new/mount/efs2 + Name: nameebs2 + StorageType: Ebs +- EbsSettings: + DeletionPolicy: Retain + Encrypted: true + Iops: 100 + KmsKeyId: String + Raid: null + Size: 150 + SnapshotId: null + Throughput: null + VolumeId: null + VolumeType: gp2 + MountDir: /new/mount/efs3 + Name: nameebs3 + StorageType: Ebs +- EbsSettings: + DeletionPolicy: Retain + Encrypted: true + Iops: 100 + KmsKeyId: String + Raid: null + Size: 150 + SnapshotId: null + Throughput: null + VolumeId: null + VolumeType: gp2 + MountDir: /new/mount/efs4 + Name: nameebs4 + StorageType: Ebs +- EfsSettings: + DeletionPolicy: Delete + Encrypted: false + EncryptionInTransit: false + FileSystemId: null + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: 1024 + ThroughputMode: provisioned + MountDir: /new/mount/efs0 + Name: efs0 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345610 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs10 + Name: existing-efs10 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345611 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs11 + Name: existing-efs11 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345612 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs12 + Name: existing-efs12 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345613 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs13 + Name: existing-efs13 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345614 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs14 + Name: existing-efs14 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345615 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs15 + Name: existing-efs15 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: 
false + EncryptionInTransit: false + FileSystemId: fs-12345616 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs16 + Name: existing-efs16 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345617 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs17 + Name: existing-efs17 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345618 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs18 + Name: existing-efs18 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345619 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs19 + Name: existing-efs19 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345620 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs20 + Name: existing-efs20 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345621 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs21 + Name: existing-efs21 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345622 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs22 + Name: existing-efs22 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345623 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs23 + Name: existing-efs23 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345624 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs24 + Name: existing-efs24 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345625 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs25 + Name: existing-efs25 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345626 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs26 + Name: existing-efs26 + StorageType: Efs +- EfsSettings: + 
DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345627 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs27 + Name: existing-efs27 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345628 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs28 + Name: existing-efs28 + StorageType: Efs +- EfsSettings: + DeletionPolicy: null + Encrypted: false + EncryptionInTransit: false + FileSystemId: fs-12345629 + IamAuthorization: false + KmsKeyId: null + PerformanceMode: generalPurpose + ProvisionedThroughput: null + ThroughputMode: bursting + MountDir: /existing/mount/efs29 + Name: existing-efs29 + StorageType: Efs +- FsxLustreSettings: + AutoImportPolicy: NEW + AutomaticBackupRetentionDays: 0 + BackupId: null + CopyTagsToBackups: true + DailyAutomaticBackupStartTime: 01:03 + DataCompressionType: LZ4 + DeletionPolicy: Delete + DeploymentType: PERSISTENT_1 + DriveCacheType: READ + ExportPath: s3://bucket/folder + FileSystemId: null + ImportPath: s3://bucket + ImportedFileChunkSize: 1024 + KmsKeyId: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + PerUnitStorageThroughput: 200 + StorageCapacity: 3600 + StorageType: HDD + WeeklyMaintenanceStartTime: '1:00:00' + MountDir: /new/mount/fsx + Name: fsx0 + StorageType: FsxLustre +- FsxOpenZfsSettings: + VolumeId: fsvol-12345678901234567 + MountDir: /existing/mount/openzfs + Name: existing-open-zfs + StorageType: FsxOpenZfs +- FsxOntapSettings: + VolumeId: fsvol-12345678901234567 + MountDir: /existing/mount/ontap + Name: existing-ontap + StorageType: FsxOntap +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456710 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx10 + Name: existing-fsx10 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456711 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx11 + Name: existing-fsx11 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456712 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx12 + Name: 
existing-fsx12 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456713 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx13 + Name: existing-fsx13 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456714 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx14 + Name: existing-fsx14 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456715 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx15 + Name: existing-fsx15 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456716 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx16 + Name: existing-fsx16 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456717 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx17 + Name: existing-fsx17 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456718 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx18 + Name: existing-fsx18 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null 
+ AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456719 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx19 + Name: existing-fsx19 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456720 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx20 + Name: existing-fsx20 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456721 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx21 + Name: existing-fsx21 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456722 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx22 + Name: existing-fsx22 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456723 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx23 + Name: existing-fsx23 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456724 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx24 + Name: existing-fsx24 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + 
DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456725 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx25 + Name: existing-fsx25 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456726 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx26 + Name: existing-fsx26 + StorageType: FsxLustre +- FsxLustreSettings: + AutoImportPolicy: null + AutomaticBackupRetentionDays: null + BackupId: null + CopyTagsToBackups: null + DailyAutomaticBackupStartTime: null + DataCompressionType: null + DeletionPolicy: null + DeploymentType: null + DriveCacheType: null + ExportPath: null + FileSystemId: fs-21345678123456727 + ImportPath: null + ImportedFileChunkSize: null + KmsKeyId: null + PerUnitStorageThroughput: null + StorageCapacity: null + StorageType: null + WeeklyMaintenanceStartTime: null + MountDir: /existing/mount/fsx27 + Name: existing-fsx27 + StorageType: FsxLustre +Tags: +- Key: String0 + Value: String0 diff --git a/cli/tests/pcluster/templates/test_cluster_stack/test_cluster_resource_distribution_in_stacks/variable_queue_compute_resources.yaml b/cli/tests/pcluster/templates/test_cluster_stack/test_cluster_resource_distribution_in_stacks/variable_queue_compute_resources.yaml new file mode 100644 index 0000000000..824236a252 --- /dev/null +++ b/cli/tests/pcluster/templates/test_cluster_stack/test_cluster_resource_distribution_in_stacks/variable_queue_compute_resources.yaml @@ -0,0 +1,21 @@ +Image: + Os: alinux2 +HeadNode: + InstanceType: t2.micro + Networking: + SubnetId: subnet-12345678 +Scheduling: + Scheduler: slurm + SlurmQueues: + {% for queue_name, no_of_compute_resources in no_of_compute_resources_per_queue.items() %} + - Name: {{queue_name}} + Networking: + SubnetIds: + - subnet-12345678 + ComputeResources: + {% for cr_index in range(no_of_compute_resources) %} + - Name: compute_resource-{{cr_index}} + InstanceType: c5.2xlarge + {% endfor %} + {% endfor %} + diff --git a/cli/tests/pcluster/templates/test_cluster_stack/test_head_node_dna_json/head_node_default.dna.json b/cli/tests/pcluster/templates/test_cluster_stack/test_head_node_dna_json/head_node_default.dna.json index 25451a853f..efa6950e03 100644 --- a/cli/tests/pcluster/templates/test_cluster_stack/test_head_node_dna_json/head_node_default.dna.json +++ b/cli/tests/pcluster/templates/test_cluster_stack/test_head_node_dna_json/head_node_default.dna.json @@ -1,50 +1,44 @@ { "cluster": { "base_os": "alinux2", + "change_set_s3_key": "parallelcluster/clusters/dummy-cluster-randomstring123/configs/change-set.json", "cluster_config_s3_key": "parallelcluster/clusters/dummy-cluster-randomstring123/configs/cluster-config-with-implied-values.yaml", "cluster_config_version": "", - "change_set_s3_key": "parallelcluster/clusters/dummy-cluster-randomstring123/configs/change-set.json", "cluster_s3_bucket": 
"parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete", "cluster_user": "ec2-user", + "compute_node_bootstrap_timeout": 1800, "custom_awsbatchcli_package": "", "custom_node_package": "", "cw_logging_enabled": "true", - "log_rotation_enabled": "true", "dcv_enabled": "false", "dcv_port": "NONE", "ddb_table": "NONE", "ebs_shared_dirs": "", - "efs_fs_ids": "", - "efs_shared_dirs": "", "efs_encryption_in_transits": "", + "efs_fs_ids": "", "efs_iam_authorizations": "", + "efs_shared_dirs": "", "enable_intel_hpc_platform": "false", "ephemeral_dir": "/scratch", "fsx_dns_names": "", "fsx_fs_ids": "", + "fsx_fs_types": "", "fsx_mount_names": "", "fsx_shared_dirs": "", - "fsx_fs_types": "", "fsx_volume_junction_paths": "", "head_node_imds_secured": "true", "instance_types_data_s3_key": "parallelcluster/clusters/dummy-cluster-randomstring123/configs/instance-types-data.json", "log_group_name": "/aws/parallelcluster/clustername-202101010101", + "log_rotation_enabled": "true", "node_type": "HeadNode", - "postinstall": "NONE", - "postinstall_args": "NONE", - "preinstall": "NONE", - "preinstall_args": "NONE", - "postupdate": "NONE", - "postupdate_args": "NONE", "proxy": "NONE", "raid_shared_dir": "", "raid_type": "", "raid_vol_ids": "", "region": "{'Ref': 'AWS::Region'}", - "stack_name": "clustername", - "volume": "", "scheduler_plugin_substack_arn": "", "stack_arn": "{'Ref': 'AWS::StackId'}", - "compute_node_bootstrap_timeout": 1800 + "stack_name": "clustername", + "volume": "" } } diff --git a/cli/tests/pcluster/templates/test_cw_dashboard_builder.py b/cli/tests/pcluster/templates/test_cw_dashboard_builder.py index ae6dcea45c..6fe63fdf02 100644 --- a/cli/tests/pcluster/templates/test_cw_dashboard_builder.py +++ b/cli/tests/pcluster/templates/test_cw_dashboard_builder.py @@ -20,7 +20,7 @@ from pcluster.templates.cdk_builder import CDKTemplateBuilder from pcluster.utils import load_yaml_dict from tests.pcluster.aws.dummy_aws_api import mock_aws_api -from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket +from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket, mock_bucket_object_utils @pytest.mark.parametrize( @@ -32,6 +32,7 @@ "ubuntu18.slurm.simple.yaml", "alinux2.batch.no_head_node_log.yaml", "ubuntu18.slurm.no_dashboard.yaml", + "alinux2.batch.head_node_log.yaml", ], ) def test_cw_dashboard_builder(mocker, test_datadir, config_file_name): @@ -42,11 +43,12 @@ def test_cw_dashboard_builder(mocker, test_datadir, config_file_name): ) # mock bucket initialization parameters mock_bucket(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) print(cluster_config) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) output_yaml = yaml.dump(generated_template, width=float("inf")) @@ -62,13 +64,41 @@ def test_cw_dashboard_builder(mocker, test_datadir, config_file_name): if cluster_config.is_cw_logging_enabled: _verify_head_node_logs_conditions(cluster_config, output_yaml) + _verify_common_error_metrics_graphs(cluster_config, output_yaml) else: assert_that(output_yaml).does_not_contain("Head Node Logs") + assert_that(output_yaml).does_not_contain("Cluster Health Metrics") + + metric_filters = _extract_metric_filters(generated_template) + 
_verify_metric_filter_dimensions(metric_filters) else: assert_that(output_yaml).does_not_contain("CloudwatchDashboard") assert_that(output_yaml).does_not_contain("Head Node EC2 Metrics") +def _extract_metric_filters(generated_template): + return { + key: val["Properties"] + for key, val in generated_template["Resources"].items() + if val["Type"] == "AWS::Logs::MetricFilter" + } + + +def _verify_metric_filter_dimensions(metric_filters): + for name, properties in metric_filters.items(): + dimensions = next( + property["Dimensions"] + for property in properties["MetricTransformations"] + if type(property) is dict and "Dimensions" in property + ) + + expected_dimensions = [{"Key": "ClusterName", "Value": "$.cluster-name"}] + + assert_that(dimensions, description=f"{name} should have dimensions {expected_dimensions}").is_equal_to( + expected_dimensions + ) + + def _verify_head_node_instance_metrics_graphs(output_yaml): """Verify CloudWatch graphs within the Head Node Instance Metrics section.""" assert_that(output_yaml).contains("Head Node Instance Metrics") @@ -168,3 +198,44 @@ def _verify_head_node_logs_conditions(cluster_config, output_yaml): assert_that(output_yaml).contains("chef-client") assert_that(output_yaml).contains("cloud-init") assert_that(output_yaml).contains("supervisord") + + +def _verify_common_error_metrics_graphs(cluster_config, output_yaml): + """Verify conditions related to the common error section.""" + scheduler = cluster_config.scheduling.scheduler + slurm_related_metrics = [ + "IamPolicyErrors", + "VcpuLimitErrors", + "VolumeLimitErrors", + "InsufficientCapacityErrors", + "OtherInstanceLaunchFailures", + "InstanceBootstrapTimeoutError", + "EC2HealthCheckErrors", + "ScheduledEventHealthCheckErrors", + "NoCorrespondingInstanceErrors", + "SlurmNodeNotRespondingErrors", + ] + custom_action_metrics = [ + "OnNodeStartDownloadErrors", + "OnNodeStartRunErrors", + "OnNodeConfiguredDownloadErrors", + "OnNodeConfiguredRunErrors", + ] + idle_node_metrics = ["MaxDynamicNodeIdleTime"] + if scheduler == "slurm": + # Contains error metric title + assert_that(output_yaml).contains("Cluster Health Metrics") + for metric in slurm_related_metrics: + assert_that(output_yaml).contains(metric) + for metric in idle_node_metrics: + assert_that(output_yaml).contains(metric) + if cluster_config.has_custom_actions_in_queue: + for metric in custom_action_metrics: + assert_that(output_yaml).contains(metric) + else: + for metric in custom_action_metrics: + assert_that(output_yaml).does_not_contain(metric) + else: + assert_that(output_yaml).does_not_contain("Cluster Health Metrics") + for metric in slurm_related_metrics + custom_action_metrics + idle_node_metrics: + assert_that(output_yaml).does_not_contain(metric) diff --git a/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/alinux2.batch.head_node_log.yaml b/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/alinux2.batch.head_node_log.yaml new file mode 100644 index 0000000000..d9e87906c7 --- /dev/null +++ b/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/alinux2.batch.head_node_log.yaml @@ -0,0 +1,55 @@ +Image: + Os: alinux2 +HeadNode: + InstanceType: String + Ssh: + KeyName: String + Networking: + SubnetId: subnet-12345678 +Scheduling: + Scheduler: awsbatch + AwsBatchQueues: + - Name: queue1 + Networking: + SubnetIds: + - subnet-12345678 + ComputeResources: + - Name: compute_resource1 + InstanceTypes: + - c4.xlarge + MaxvCpus: 10 +SharedStorage: + - 
MountDir: /my/mount/ebs1 + Name: name1 + StorageType: Ebs + EbsSettings: + VolumeType: sc1 + - MountDir: /my/mount/ebs2 + Name: name2 + StorageType: Ebs + EbsSettings: + VolumeType: st1 + - MountDir: /my/mount/ebs3 + Name: name3 + StorageType: Ebs + EbsSettings: + VolumeType: gp2 + - MountDir: /my/mount/ebs4 + Name: name4 + StorageType: Ebs + EbsSettings: + VolumeType: sc1 + - MountDir: /my/mount/ebs5 + Name: name5 + StorageType: Ebs + EbsSettings: + VolumeType: st1 +Monitoring: + DetailedMonitoring: true + Logs: + CloudWatch: + Enabled: True + RetentionInDays: 14 + Dashboards: + CloudWatch: + Enabled: true diff --git a/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/alinux2.batch.no_head_node_log.yaml b/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/alinux2.batch.no_head_node_log.yaml index 3f469b9b3e..5cb5b502d0 100644 --- a/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/alinux2.batch.no_head_node_log.yaml +++ b/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/alinux2.batch.no_head_node_log.yaml @@ -52,4 +52,4 @@ Monitoring: RetentionInDays: 14 Dashboards: CloudWatch: - Enabled: true \ No newline at end of file + Enabled: true diff --git a/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/alinux2.slurm.conditional_vol.yaml b/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/alinux2.slurm.conditional_vol.yaml index 0272b46be6..941a3fcc8f 100644 --- a/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/alinux2.slurm.conditional_vol.yaml +++ b/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/alinux2.slurm.conditional_vol.yaml @@ -32,6 +32,9 @@ Scheduling: MaxCount: 5 - Name: compute_resource2 InstanceType: c4.2xlarge + CustomActions: + OnNodeConfigured: + Script: s3://{{ resource_bucket }}/scripts/postinstall.sh SharedStorage: - MountDir: /my/mount/ebs1 Name: name1 diff --git a/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/centos7.slurm.full.yaml b/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/centos7.slurm.full.yaml index 33d7c29256..663c871cd8 100644 --- a/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/centos7.slurm.full.yaml +++ b/cli/tests/pcluster/templates/test_cw_dashboard_builder/test_cw_dashboard_builder/centos7.slurm.full.yaml @@ -22,6 +22,11 @@ Scheduling: InstanceType: c5.2xlarge - Name: compute_resource2 InstanceType: c4.2xlarge + CustomActions: + OnNodeStart: + Script: s3://{{ resource_bucket }}/scripts/preinstall.sh + Args: + - arg1 - Name: queue2 Networking: SubnetIds: diff --git a/cli/tests/pcluster/templates/test_dev_settings.py b/cli/tests/pcluster/templates/test_dev_settings.py index 8e19ec570c..3bbc121840 100644 --- a/cli/tests/pcluster/templates/test_dev_settings.py +++ b/cli/tests/pcluster/templates/test_dev_settings.py @@ -16,7 +16,7 @@ from pcluster.templates.cdk_builder import CDKTemplateBuilder from pcluster.utils import load_yaml_dict from tests.pcluster.aws.dummy_aws_api import mock_aws_api -from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket +from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket_object_utils from tests.pcluster.utils import flatten, get_resources @@ -28,12 +28,13 @@ ) def test_custom_cookbook(mocker, test_datadir, config_file_name): 
mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) diff --git a/cli/tests/pcluster/templates/test_directory_service.py b/cli/tests/pcluster/templates/test_directory_service.py index a0c0f957d5..9f732c76ea 100644 --- a/cli/tests/pcluster/templates/test_directory_service.py +++ b/cli/tests/pcluster/templates/test_directory_service.py @@ -16,29 +16,53 @@ from pcluster.templates.cdk_builder import CDKTemplateBuilder from pcluster.utils import load_yaml_dict from tests.pcluster.aws.dummy_aws_api import mock_aws_api -from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket +from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket_object_utils from tests.pcluster.utils import get_head_node_policy, get_statement_by_sid @pytest.mark.parametrize( - "config_file_name", + "config_file_name, head_node_permissions", [ - ("config.yaml"), + pytest.param( + "config.yaml", + [ + { + "Sid": "AllowGettingDirectorySecretValue", + "Action": "secretsmanager:GetSecretValue", + "Resource": "arn:aws:secretsmanager:eu-west-1:123456789:secret:a-secret-name", + } + ], + id="DirectoryService with PasswordSecretArn as Secret in Secrets Manager", + ), + pytest.param( + "config-ssm.yaml", + [ + { + "Sid": "AllowGettingDirectorySecretValue", + "Action": "ssm:GetParameter", + "Resource": "arn:aws:ssm:eu-west-1:123456789:parameter/a-parameter-name", + } + ], + id="DirectoryService with PasswordSecretArn as Parameter in SSM", + ), ], ) -def test_head_node_permissions(mocker, test_datadir, config_file_name): +def test_head_node_permissions(mocker, test_datadir, config_file_name, head_node_permissions): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) head_node_policy = get_head_node_policy(generated_template) - statement = get_statement_by_sid(policy=head_node_policy, sid="AllowGettingDirectorySecretValue") - assert_that(statement["Effect"]).is_equal_to("Allow") - assert_that(statement["Action"]).is_equal_to("secretsmanager:GetSecretValue") - assert_that(statement["Resource"]).is_equal_to("arn:aws:secretsmanager:eu-west-1:123456789:secret:a-secret-name") + + for head_node_permission in head_node_permissions: + statement = get_statement_by_sid(policy=head_node_policy, sid=head_node_permission["Sid"]) + assert_that(statement["Effect"]).is_equal_to("Allow") + assert_that(statement["Action"]).is_equal_to(head_node_permission["Action"]) + assert_that(statement["Resource"]).is_equal_to(head_node_permission["Resource"]) diff --git a/cli/tests/pcluster/templates/test_directory_service/test_head_node_permissions/config-ssm.yaml b/cli/tests/pcluster/templates/test_directory_service/test_head_node_permissions/config-ssm.yaml new file mode 100644 index 0000000000..59fb2d8e40 --- /dev/null +++ 
b/cli/tests/pcluster/templates/test_directory_service/test_head_node_permissions/config-ssm.yaml @@ -0,0 +1,22 @@ +Image: + Os: alinux2 +HeadNode: + InstanceType: t2.micro + Networking: + SubnetId: subnet-12345678 +Scheduling: + Scheduler: slurm + SlurmQueues: + - Name: queue1 + Networking: + SubnetIds: + - subnet-12345678 + ComputeResources: + - Name: compute_resource1 + InstanceType: c5.2xlarge +DirectoryService: + DomainName: corp.something.com + DomainAddr: ldaps://corp.something.com + PasswordSecretArn: arn:aws:ssm:eu-west-1:123456789:parameter/a-parameter-name + DomainReadOnlyUser: cn=ReadOnlyUser,ou=Users,ou=CORP,dc=corp,dc=something,dc=com + LdapTlsCaCert: /path/to/domain-certificate.crt diff --git a/cli/tests/pcluster/templates/test_iam.py b/cli/tests/pcluster/templates/test_iam.py index a36a9c1488..9692092d1f 100644 --- a/cli/tests/pcluster/templates/test_iam.py +++ b/cli/tests/pcluster/templates/test_iam.py @@ -16,7 +16,7 @@ from pcluster.templates.cdk_builder import CDKTemplateBuilder from pcluster.utils import load_yaml_dict from tests.pcluster.aws.dummy_aws_api import mock_aws_api -from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket +from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket_object_utils from tests.pcluster.utils import get_resources @@ -28,12 +28,13 @@ ) def test_iam_permissions_boundary(mocker, test_datadir, config_file_name, permissions_boundary): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) diff --git a/cli/tests/pcluster/templates/test_imagebuilder_stack.py b/cli/tests/pcluster/templates/test_imagebuilder_stack.py index f1748d6b36..642020969d 100644 --- a/cli/tests/pcluster/templates/test_imagebuilder_stack.py +++ b/cli/tests/pcluster/templates/test_imagebuilder_stack.py @@ -2580,6 +2580,9 @@ def test_imagebuilder_build_tags(mocker, resource, response, expected_imagebuild else: assert_that(resource.get("Properties").get("Tags")).is_equal_to(expected_imagebuilder_resource_tags) + if resource_name == "InfrastructureConfiguration": + assert_that(resource.get("Properties").get("ResourceTags")).is_equal_to(expected_imagebuilder_resource_tags) + @pytest.mark.parametrize( "resource, response, expected_imagebuilder_subnet_id", diff --git a/cli/tests/pcluster/templates/test_scheduling.py b/cli/tests/pcluster/templates/test_scheduling.py index cbeded6724..cbe43f6e96 100644 --- a/cli/tests/pcluster/templates/test_scheduling.py +++ b/cli/tests/pcluster/templates/test_scheduling.py @@ -17,7 +17,7 @@ from pcluster.templates.cdk_builder import CDKTemplateBuilder from pcluster.utils import load_yaml_dict from tests.pcluster.aws.dummy_aws_api import mock_aws_api -from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket +from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket_object_utils from tests.pcluster.utils import get_head_node_policy, get_resources, get_statement_by_sid @@ -29,12 +29,13 @@ ) def test_additional_security_groups(mocker, test_datadir, config_file_name): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = 
ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) @@ -57,12 +58,13 @@ def test_additional_security_groups(mocker, test_datadir, config_file_name): ) def test_permissions_for_slurm_db_secret(mocker, test_datadir, config_file_name): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) @@ -81,12 +83,13 @@ def test_permissions_for_slurm_db_secret(mocker, test_datadir, config_file_name) ) def test_head_node_custom_pass_role(mocker, test_datadir, config_file_name): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) @@ -105,12 +108,13 @@ def test_head_node_custom_pass_role(mocker, test_datadir, config_file_name): ) def test_head_node_base_pass_role(mocker, test_datadir, config_file_name): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) @@ -129,12 +133,13 @@ def test_head_node_base_pass_role(mocker, test_datadir, config_file_name): ) def test_head_node_mixed_pass_role(mocker, test_datadir, config_file_name): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) diff --git a/cli/tests/pcluster/templates/test_shared_storage.py b/cli/tests/pcluster/templates/test_shared_storage.py index 24b5739d48..6ba25ee312 100644 --- a/cli/tests/pcluster/templates/test_shared_storage.py +++ b/cli/tests/pcluster/templates/test_shared_storage.py @@ -17,7 +17,7 @@ from pcluster.templates.cdk_builder import CDKTemplateBuilder from pcluster.utils import load_yaml_dict from tests.pcluster.aws.dummy_aws_api import _DummyAWSApi, _DummyInstanceTypeInfo, mock_aws_api -from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket +from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket_object_utils from tests.pcluster.utils import get_head_node_policy, get_resources, get_statement_by_sid @@ -31,12 +31,13 @@ ) def test_shared_storage_ebs(mocker, test_datadir, config_file_name, storage_name, deletion_policy): 
mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) @@ -60,12 +61,13 @@ def test_shared_storage_ebs(mocker, test_datadir, config_file_name, storage_name ) def test_shared_storage_efs(mocker, test_datadir, config_file_name, storage_name, deletion_policy): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) file_systems = get_resources( @@ -128,12 +130,13 @@ def test_shared_storage_efs(mocker, test_datadir, config_file_name, storage_name ) def test_shared_storage_fsx(mocker, test_datadir, config_file_name, storage_name, fs_type, deletion_policy): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) @@ -228,12 +231,13 @@ def test_non_happy_ontap_and_openzfs_mounting(mocker, test_datadir): ) def test_efs_permissions(mocker, test_datadir, config_file_name): mock_aws_api(mocker) + mock_bucket_object_utils(mocker) input_yaml = load_yaml_dict(test_datadir / config_file_name) cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template = CDKTemplateBuilder().build_cluster_template( + generated_template, _ = CDKTemplateBuilder().build_cluster_template( cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) diff --git a/cli/tests/pcluster/test_utils.py b/cli/tests/pcluster/test_utils.py index 674851d9dc..40d1ec5c57 100644 --- a/cli/tests/pcluster/test_utils.py +++ b/cli/tests/pcluster/test_utils.py @@ -9,8 +9,11 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. 
# This module provides unit tests for the functions in the pcluster.utils module.""" +import asyncio import os import time +import unittest +from collections import namedtuple import pytest from assertpy import assert_that @@ -22,7 +25,7 @@ from pcluster.aws.aws_resources import InstanceTypeInfo from pcluster.aws.common import Cache from pcluster.models.cluster import Cluster, ClusterStack -from pcluster.utils import yaml_load +from pcluster.utils import batch_by_property_callback, yaml_load from tests.pcluster.aws.dummy_aws_api import mock_aws_api FAKE_NAME = "cluster-name" @@ -325,6 +328,22 @@ def test_docs_base_url(mocker, partition, docs_base_url): assert_that(pcluster.utils.get_docs_base_url()).is_equal_to(docs_base_url) +def test_get_service_endpoint(mocker): + service = "whatever-service" + region = "whatever-region" + mocked_partition = "correct_partition" + mocked_domain = "correct_domain" + + mocked_get_partition = mocker.patch("pcluster.utils.get_partition", return_value=mocked_partition) + mocked_get_url_domain_suffix = mocker.patch("pcluster.utils.get_url_domain_suffix", return_value=mocked_domain) + + actual_endpoint = pcluster.utils.get_service_endpoint(service, region) + + assert_that(actual_endpoint).is_equal_to(f"https://{service}.{region}.{mocked_domain}") + mocked_get_partition.assert_called_once_with(region) + mocked_get_url_domain_suffix.assert_called_once_with(mocked_partition) + + @pytest.mark.parametrize( "region, s3_bucket_domain", [ @@ -407,3 +426,114 @@ def test_split_resource_prefix(resource_prefix, expected_output): iam_path_prefix, iam_role_prefix = utils.split_resource_prefix(resource_prefix=resource_prefix) assert_that(iam_path_prefix).is_equal_to(expected_output[0]) assert_that(iam_role_prefix).is_equal_to(expected_output[1]) + + +Item = namedtuple("Item", "property") + + +@pytest.mark.parametrize( + "items, expected_batches, batch_size, raises", + [ + ( + [ + Item(property=["test-1", "test-2", "test-3"]), + Item(property=["test-4", "test-5", "test-6"]), + Item(property=["test-7", "test-8", "test-9"]), + Item(property=["test-10", "test-11", "test-12"]), + Item(property=["test-13", "test-14", "test-15"]), + ], + [ + [ + Item(property=["test-1", "test-2", "test-3"]), + Item(property=["test-4", "test-5", "test-6"]), + Item(property=["test-7", "test-8", "test-9"]), + ], + [ + Item(property=["test-10", "test-11", "test-12"]), + Item(property=["test-13", "test-14", "test-15"]), + ], + ], + 9, + False, + ), + ( + [ + Item(property=["test-1", "test-2"]), + Item(property=["test-3"]), + Item(property=["test-4"]), + Item(property=["test-5", "test-6"]), + Item(property=["test-7", "test-8", "test-9"]), + ], + [ + [Item(property=["test-1", "test-2"]), Item(property=["test-3"])], + [Item(property=["test-4"]), Item(property=["test-5", "test-6"])], + [Item(property=["test-7", "test-8", "test-9"])], + ], + 3, + False, + ), + ( + [ + Item(property=["test-1", "test-2", "test-3", "test-4"]), + Item(property=["test-5"]), + ], + None, + 3, + True, + ), + ( + [ + Item(property=["test-1", "test-2"]), + ], + [[Item(property=["test-1", "test-2"])]], + 3, + False, + ), + ], + ids=[ + "last-batch-with-size-smaller-than-batch-size", + "all-item-property-sizes-within-range-of-batch-size", + "property-count-greater-than-batch-size", + "total-property-counts-less-than-batch-size", + ], +) +def test_batch_by_property_size(items, expected_batches, batch_size, raises): + if raises: + with pytest.raises(ValueError): + for _ in batch_by_property_callback(items, lambda item: len(item.property), 
batch_size): + pass + else: + batches = [batch for batch in batch_by_property_callback(items, lambda item: len(item.property), batch_size)] + assert_that(batches).is_equal_to(expected_batches) + + +class TestAsyncUtils(unittest.TestCase): + def test_async_timeout_cache(self): + total_calls = 0 + + class FakeAsyncMethodProvider: + def very_expensive_function(self, param): + time.sleep(1) + nonlocal total_calls + total_calls += 1 + return param + + @utils.AsyncUtils.async_timeout_cache(timeout=10000) + async def async_method(self, param): + _async_very_expensive_function = utils.AsyncUtils.async_from_sync(self.very_expensive_function) + return await _async_very_expensive_function(param) + + unique_calls = 10 + repetitions = 15 + + executions = [] + expected_results = [] + for i in range(unique_calls): + for _ in range(repetitions): + executions.append(FakeAsyncMethodProvider().async_method(i)) + expected_results.append(i) + + results = asyncio.get_event_loop().run_until_complete(asyncio.gather(*executions)) + + assert_that(expected_results).contains_sequence(*results) + assert_that(unique_calls).is_equal_to(total_calls) diff --git a/cli/tests/pcluster/utils.py b/cli/tests/pcluster/utils.py index e656a66510..f94400c31f 100644 --- a/cli/tests/pcluster/utils.py +++ b/cli/tests/pcluster/utils.py @@ -52,6 +52,16 @@ def get_resources( ) +def get_asset_content_with_resource_name(assets, resource_name): + """Get the asset with a top-level resource matching the given logical ID from a list of assets.""" + for asset in assets: + asset_content = asset.get("content") + if asset_content: + if asset_content["Resources"].get(resource_name): + return asset_content + return None + + def get_head_node_policy(template, enforce_not_null=True): policy = get_resources(template, type="AWS::IAM::Policy", name="ParallelClusterPoliciesHeadNode").get( "ParallelClusterPoliciesHeadNode" diff --git a/cli/tests/pcluster/validators/test_all_validators.py b/cli/tests/pcluster/validators/test_all_validators.py index a33c9dd9c3..c46c65d7fc 100644 --- a/cli/tests/pcluster/validators/test_all_validators.py +++ b/cli/tests/pcluster/validators/test_all_validators.py @@ -11,12 +11,10 @@ from unittest.mock import PropertyMock, call from assertpy import assert_that -from pkg_resources import packaging from pcluster.aws.aws_resources import ImageInfo -from pcluster.constants import Feature from pcluster.schemas.cluster_schema import ClusterSchema -from pcluster.utils import get_installed_version, load_yaml_dict +from pcluster.utils import load_yaml_dict from pcluster.validators import ( cluster_validators, database_validators, @@ -26,15 +24,43 @@ iam_validators, instances_validators, kms_validators, + monitoring_validators, networking_validators, s3_validators, - scheduler_plugin_validators, + slurm_settings_validator, + tags_validators, ) -from pcluster.validators.common import Validator, ValidatorContext +from pcluster.validators.common import AsyncValidator, Validator, ValidatorContext from tests.pcluster.aws.dummy_aws_api import mock_aws_api -def _mock_all_validators(mocker, mockers, additional_modules=None): +def _is_validator_of_type(cls, name, validator_type): + return ( + isinstance(cls, type) + and issubclass(cls, validator_type) + and name != "Validator" + and name != "AsyncValidator" + and not name.startswith("_") + ) + + +def _mock_all_validators(mocker, additional_modules=None): + mockers = [] + async_mockers = [] + + # when python 3.7 support is dropped, this can be substituted with AsyncMockc + def 
create_validate_async_mock(): + _awaited = False + + async def _validate_async(*args, **kwargs): + nonlocal _awaited + _awaited = True + return [] + + _validate_async.assert_awaited = lambda: assert_that(_awaited).is_true() + + return _validate_async + modules = [ cluster_validators, database_validators, @@ -44,23 +70,31 @@ def _mock_all_validators(mocker, mockers, additional_modules=None): kms_validators, iam_validators, instances_validators, + monitoring_validators, networking_validators, s3_validators, + slurm_settings_validator, + tags_validators, ] if additional_modules: modules += additional_modules for module in modules: module_name = module.__name__ for name, cls in module.__dict__.items(): - if ( - isinstance(cls, type) - and issubclass(cls, Validator) - and name != "Validator" - and not name.startswith("_") - ): + if _is_validator_of_type(cls, name, AsyncValidator): + mock = create_validate_async_mock() + mocker.patch(f"{module_name}.{name}._validate_async", side_effect=mock) + async_mockers.append( + { + "name": name, + "mocker": mock, + } + ) + elif _is_validator_of_type(cls, name, Validator): mockers.append( {"name": name, "mocker": mocker.patch(f"{module_name}.{name}._validate", return_value=[])} ) + return mockers, async_mockers def _load_and_validate(config_path): @@ -85,8 +119,7 @@ def _assert_instance_architecture(expected_instance_architecture_validator_input def test_slurm_all_validators_are_called(test_datadir, mocker): """Verify that all validators are called during validation.""" - mockers = [] - _mock_all_validators(mocker, mockers) + mockers, async_mockers = _mock_all_validators(mocker) # mock properties that use boto3 calls mocker.patch( @@ -123,13 +156,18 @@ def test_slurm_all_validators_are_called(test_datadir, mocker): if m["name"] in ["TagKeyValidator", "ClusterNameValidator", "InstanceProfileValidator", "RoleValidator"]: # ToDo: Reserved tag keys to be aligned between cluster and image builder continue - print("Checking " + m["name"] + " is called") + print(f"Checking validator of class \"{m['name']}\" is called") m["mocker"].assert_called() + for m in async_mockers: + print(f"Checking validator of class \"{m['name']}\" is awaited") + m["mocker"].assert_awaited() + def test_slurm_validators_are_called_with_correct_argument(test_datadir, mocker): """Verify that validators are called with proper argument during validation.""" - _mock_all_validators(mocker, []) # To avoid failure of the test as soon as a new validator is added. + # To avoid failure of the test as soon as a new validator is added. 
+ _mock_all_validators(mocker) validators_path = "pcluster.validators" @@ -154,6 +192,9 @@ def test_slurm_validators_are_called_with_correct_argument(test_datadir, mocker) cluster_validators + ".NumberOfStorageValidator._validate", return_value=[] ) deletion_policy_validator = mocker.patch(cluster_validators + ".DeletionPolicyValidator._validate", return_value=[]) + root_volume_encryption_consistency_validator = mocker.patch( + cluster_validators + ".RootVolumeEncryptionConsistencyValidator._validate", return_value=[] + ) ec2_validators = validators_path + ".ec2_validators" key_pair_validator = mocker.patch(ec2_validators + ".KeyPairValidator._validate", return_value=[]) instance_type_validator = mocker.patch(ec2_validators + ".InstanceTypeValidator._validate", return_value=[]) @@ -217,7 +258,13 @@ def test_slurm_validators_are_called_with_correct_argument(test_datadir, mocker) ) monitoring_validators = validators_path + ".monitoring_validators" log_rotation_validator = mocker.patch(monitoring_validators + ".LogRotationValidator._validate", return_value=[]) - + detailed_monitoring_validator = mocker.patch( + monitoring_validators + ".DetailedMonitoringValidator._validate", return_value=[] + ) + tags_validators = validators_path + ".tags_validators" + compute_resource_tags_validator = mocker.patch( + tags_validators + ".ComputeResourceTagsValidator._validate", return_value=[] + ) mocker.patch( "pcluster.config.cluster_config.HeadNode.architecture", new_callable=PropertyMock(return_value="x86_64") ) @@ -248,9 +295,10 @@ def test_slurm_validators_are_called_with_correct_argument(test_datadir, mocker) ) max_count_validator.assert_has_calls( [ - call(max_length=10, resource_name="SlurmQueues", resources_length=2), - call(max_length=5, resource_name="ComputeResources", resources_length=2), - call(max_length=5, resource_name="ComputeResources", resources_length=3), + call(resources_length=2, max_length=100, resource_name="SlurmQueues"), + call(resources_length=5, max_length=150, resource_name="ComputeResources per Cluster"), + call(resources_length=3, max_length=40, resource_name="ComputeResources per Queue"), + call(resources_length=2, max_length=40, resource_name="ComputeResources per Queue"), ], any_order=True, ) @@ -297,6 +345,9 @@ def test_slurm_validators_are_called_with_correct_argument(test_datadir, mocker) validator=instance_architecture_compatibility_validator, ) + root_volume_encryption_consistency_validator.assert_has_calls( + [call(encryption_settings=[("queue1", True), ("queue2", True)])] + ) ebs_volume_type_size_validator.assert_has_calls([call(volume_type="gp3", volume_size=35)]) kms_key_validator.assert_has_calls([call(kms_key_id="1234abcd-12ab-34cd-56ef-1234567890ab")]) kms_key_id_encrypted_validator.assert_has_calls( @@ -346,343 +397,5 @@ def test_slurm_validators_are_called_with_correct_argument(test_datadir, mocker) instance_type_accelerator_manufacturer_validator.assert_called() instance_type_placement_group_validator.assert_called() log_rotation_validator.assert_called() - - -def test_scheduler_plugin_all_validators_are_called(test_datadir, mocker): - """Verify that all validators are called during validation.""" - mockers = [] - _mock_all_validators(mocker, mockers, additional_modules=[scheduler_plugin_validators]) - - # mock properties that use boto3 calls - mocker.patch( - "pcluster.config.cluster_config.HeadNode.architecture", new_callable=PropertyMock(return_value="x86_64") - ) - mocker.patch( - 
"pcluster.config.cluster_config.SchedulerPluginComputeResource.architecture", - new_callable=PropertyMock(return_value="x86_64"), - ) - mocker.patch( - "pcluster.config.cluster_config.HeadNodeNetworking.availability_zone", - new_callable=PropertyMock(return_value="us-east-1a"), - ) - mocker.patch( - "pcluster.config.cluster_config.BaseClusterConfig.head_node_ami", - new_callable=PropertyMock(return_value="ami-12345678"), - ) - mocker.patch( - "pcluster.config.cluster_config.SchedulerPluginClusterConfig.get_instance_types_data", - ) - mocker.patch( - "pcluster.aws.ec2.Ec2Client.describe_image", - return_value=ImageInfo({"BlockDeviceMappings": [{"Ebs": {"VolumeSize": 35}}]}), - ) - - mock_aws_api(mocker) - - # Need to load two configuration files to execute all validators because there are mutually exclusive parameters. - _load_and_validate(test_datadir / "scheduler_plugin_1.yaml") - _load_and_validate(test_datadir / "scheduler_plugin_2.yaml") - - # FlexibleInstanceTypes Only supported in Slurm - flexible_instance_types_validators = [ - "InstancesCPUValidator", - "InstancesAcceleratorsValidator", - "InstancesEFAValidator", - "InstancesNetworkingValidator", - "InstancesAllocationStrategyValidator", - "InstancesMemorySchedulingValidator", - ] - - # Assert validators are called - for m in mockers: - if ( - m["name"] - in [ - "TagKeyValidator", - "ClusterNameValidator", - "InstanceProfileValidator", - "RoleValidator", - "MixedSecurityGroupOverwriteValidator", - "HostedZoneValidator", - "InstanceTypeMemoryInfoValidator", - "InstanceTypeAcceleratorManufacturerValidator", - "CapacityReservationValidator", - "CapacityReservationResourceGroupValidator", - "DatabaseUriValidator", - "InstanceTypePlacementGroupValidator", - ] - + flexible_instance_types_validators - ): - # ToDo: Reserved tag keys to be aligned between cluster and image builder - continue - print("Checking " + m["name"] + " is called") - m["mocker"].assert_called() - - -def test_scheduler_plugin_validators_are_called_with_correct_argument(test_datadir, mocker): - """Verify that validators are called with proper argument during validation.""" - _mock_all_validators( - mocker, [], additional_modules=[scheduler_plugin_validators] - ) # To avoid failure of the test as soon as a new validator is added. 
- - validators_path = "pcluster.validators" - - cluster_validators = validators_path + ".cluster_validators" - scheduler_os_validator = mocker.patch(cluster_validators + ".SchedulerOsValidator._validate", return_value=[]) - feature_validators = validators_path + ".feature_validators" - feature_region_validator = mocker.patch(feature_validators + ".FeatureRegionValidator._validate", return_value=[]) - compute_resource_size_validator = mocker.patch( - cluster_validators + ".ComputeResourceSizeValidator._validate", return_value=[] - ) - architecture_os_validator = mocker.patch(cluster_validators + ".ArchitectureOsValidator._validate", return_value=[]) - instance_architecture_compatibility_validator = mocker.patch( - cluster_validators + ".InstanceArchitectureCompatibilityValidator._validate", return_value=[] - ) - name_validator = mocker.patch(cluster_validators + ".NameValidator._validate", return_value=[]) - max_count_validator = mocker.patch(cluster_validators + ".MaxCountValidator._validate", return_value=[]) - fsx_architecture_os_validator = mocker.patch( - cluster_validators + ".FsxArchitectureOsValidator._validate", return_value=[] - ) - duplicate_mount_dir_validator = mocker.patch( - cluster_validators + ".DuplicateMountDirValidator._validate", return_value=[] - ) - number_of_storage_validator = mocker.patch( - cluster_validators + ".NumberOfStorageValidator._validate", return_value=[] - ) - - ec2_validators = validators_path + ".ec2_validators" - key_pair_validator = mocker.patch(ec2_validators + ".KeyPairValidator._validate", return_value=[]) - instance_type_validator = mocker.patch(ec2_validators + ".InstanceTypeValidator._validate", return_value=[]) - instance_type_base_ami_compatible_validator = mocker.patch( - ec2_validators + ".InstanceTypeBaseAMICompatibleValidator._validate", return_value=[] - ) - - networking_validators = validators_path + ".networking_validators" - security_groups_validator = mocker.patch( - networking_validators + ".SecurityGroupsValidator._validate", return_value=[] - ) - subnets_validator = mocker.patch(networking_validators + ".SubnetsValidator._validate", return_value=[]) - single_instance_type_subnet_validator = mocker.patch( - networking_validators + ".SingleInstanceTypeSubnetValidator._validate", return_value=[] - ) - - fsx_validators = validators_path + ".fsx_validators" - fsx_s3_validator = mocker.patch(fsx_validators + ".FsxS3Validator._validate", return_value=[]) - fsx_persistent_options_validator = mocker.patch( - fsx_validators + ".FsxPersistentOptionsValidator._validate", return_value=[] - ) - fsx_backup_options_validator = mocker.patch( - fsx_validators + ".FsxBackupOptionsValidator._validate", return_value=[] - ) - fsx_storage_type_options_validator = mocker.patch( - fsx_validators + ".FsxStorageTypeOptionsValidator._validate", return_value=[] - ) - fsx_storage_capacity_validator = mocker.patch( - fsx_validators + ".FsxStorageCapacityValidator._validate", return_value=[] - ) - fsx_backup_id_validator = mocker.patch(fsx_validators + ".FsxBackupIdValidator._validate", return_value=[]) - - ebs_validators = validators_path + ".ebs_validators" - ebs_volume_type_size_validator = mocker.patch( - ebs_validators + ".EbsVolumeTypeSizeValidator._validate", return_value=[] - ) - ebs_volume_throughput_validator = mocker.patch( - ebs_validators + ".EbsVolumeThroughputValidator._validate", return_value=[] - ) - ebs_volume_throughput_iops_validator = mocker.patch( - ebs_validators + ".EbsVolumeThroughputIopsValidator._validate", return_value=[] - ) - 
ebs_volume_iops_validator = mocker.patch(ebs_validators + ".EbsVolumeIopsValidator._validate", return_value=[]) - shared_ebs_volume_id_validator = mocker.patch( - ebs_validators + ".SharedEbsVolumeIdValidator._validate", return_value=[] - ) - ebs_volume_size_snapshot_validator = mocker.patch( - ebs_validators + ".EbsVolumeSizeSnapshotValidator._validate", return_value=[] - ) - - kms_validators = validators_path + ".kms_validators" - kms_key_validator = mocker.patch(kms_validators + ".KmsKeyValidator._validate", return_value=[]) - kms_key_id_encrypted_validator = mocker.patch( - kms_validators + ".KmsKeyIdEncryptedValidator._validate", return_value=[] - ) - - # Scheduler plugin related validators - scheduler_plugin = validators_path + ".scheduler_plugin_validators" - grant_sudo_privileges_validator = mocker.patch( - scheduler_plugin + ".GrantSudoPrivilegesValidator._validate", return_value=[] - ) - plugin_interface_version_validator = mocker.patch( - scheduler_plugin + ".PluginInterfaceVersionValidator._validate", return_value=[] - ) - scheduler_plugin_os_architecture_validator = mocker.patch( - scheduler_plugin + ".SchedulerPluginOsArchitectureValidator._validate", return_value=[] - ) - scheduler_plugin_region_validator = mocker.patch( - scheduler_plugin + ".SchedulerPluginRegionValidator._validate", return_value=[] - ) - sudo_privileges_validator = mocker.patch(scheduler_plugin + ".SudoPrivilegesValidator._validate", return_value=[]) - supported_versions_validator = mocker.patch( - scheduler_plugin + ".SupportedVersionsValidator._validate", return_value=[] - ) - user_name_validator = mocker.patch(scheduler_plugin + ".UserNameValidator._validate", return_value=[]) - - mocker.patch( - "pcluster.config.cluster_config.HeadNode.architecture", new_callable=PropertyMock(return_value="x86_64") - ) - mocker.patch( - "pcluster.config.cluster_config.SlurmComputeResource.architecture", - new_callable=PropertyMock(return_value="x86_64"), - ) - mocker.patch( - "pcluster.aws.ec2.Ec2Client.describe_image", - return_value=ImageInfo({"BlockDeviceMappings": [{"Ebs": {"VolumeSize": 35}}]}), - ) - - mock_aws_api(mocker) - - _load_and_validate(test_datadir / "scheduler_plugin.yaml") - - # Assert validators are called - scheduler_os_validator.assert_has_calls([call(os="centos7", scheduler="plugin")]) - feature_region_validator.assert_has_calls( - [call(feature=feature, region="us-east-1") for feature in Feature if feature is not Feature.BATCH], - any_order=True, - ) - compute_resource_size_validator.assert_has_calls( - [ - # Defaults of min_count=0, max_count=10 - call(min_count=0, max_count=10), - call(min_count=0, max_count=10), - call(min_count=0, max_count=10), - call(min_count=1, max_count=15), - ], - any_order=True, - ) - max_count_validator.assert_has_calls( - [ - call(max_length=5, resource_name="SchedulerQueues", resources_length=2), - call(max_length=3, resource_name="ComputeResources", resources_length=2), - call(max_length=3, resource_name="ComputeResources", resources_length=2), - ], - any_order=True, - ) - key_pair_validator.assert_has_calls([call(key_name="ec2-key-name")]) - instance_type_validator.assert_has_calls([call(instance_type="c5d.xlarge")]) - instance_type_base_ami_compatible_validator.assert_has_calls( - [ - call(instance_type="c5d.xlarge", image="ami-12345678"), - call(instance_type="c5.xlarge", image="ami-12345678"), - call(instance_type="c4.xlarge", image="ami-12345678"), - call(instance_type="c4.2xlarge", image="ami-23456789"), - call(instance_type="c5.2xlarge", 
image="ami-23456789"), - ], - any_order=True, - ) - subnets_validator.assert_has_calls([call(subnet_ids=["subnet-12345678", "subnet-23456789"])]) - single_instance_type_subnet_validator.assert_has_calls( - [ - call( - queue_name="queue1", - subnet_ids=["subnet-12345678"], - ), - call( - queue_name="queue2", - subnet_ids=["subnet-12345678"], - ), - ] - ) - security_groups_validator.assert_has_calls( - [call(security_group_ids=None), call(security_group_ids=None)], any_order=True - ) - architecture_os_validator.assert_has_calls( - [call(os="centos7", architecture="x86_64", custom_ami="ami-12345678", ami_search_filters=None)] - ) - _assert_instance_architecture( - expected_instance_architecture_validator_input=[ - {"instance_types": ["c5.xlarge"], "architecture": "x86_64"}, - {"instance_types": ["c4.xlarge"], "architecture": "x86_64"}, - {"instance_types": ["c4.2xlarge"], "architecture": "x86_64"}, - {"instance_types": ["c5.2xlarge"], "architecture": "x86_64"}, - ], - validator=instance_architecture_compatibility_validator, - ) - - ebs_volume_type_size_validator.assert_has_calls([call(volume_type="gp3", volume_size=35)]) - kms_key_validator.assert_has_calls([call(kms_key_id="1234abcd-12ab-34cd-56ef-1234567890ab")]) - kms_key_id_encrypted_validator.assert_has_calls( - [call(kms_key_id="1234abcd-12ab-34cd-56ef-1234567890ab", encrypted=True)] - ) - fsx_architecture_os_validator.assert_has_calls([call(architecture="x86_64", os="centos7")]) - # Scratch mount directories are retrieved from a set. So the order of them is not guaranteed. - # The first item in call_args is regular args, the second item is keyword args. - shared_storage_name_mount_dir_tuple_list = duplicate_mount_dir_validator.call_args[1][ - "shared_storage_name_mount_dir_tuple_list" - ] - shared_storage_name_mount_dir_tuple_list.sort(key=lambda tup: tup[1]) - assert_that(shared_storage_name_mount_dir_tuple_list).is_equal_to( - [ - ("name1", "/my/mount/point1"), - ("name2", "/my/mount/point2"), - ("name3", "/my/mount/point3"), - ("name4", "/my/mount/point4"), - ("name5", "/my/mount/point5"), - ] - ) - local_mount_dir_instance_types_dict = duplicate_mount_dir_validator.call_args[1][ - "local_mount_dir_instance_types_dict" - ] - assert_that(local_mount_dir_instance_types_dict).is_equal_to({"/scratch_head": {"c5d.xlarge"}}) - number_of_storage_validator.assert_has_calls( - [ - call(storage_type="EBS", max_number=5, storage_count=1), - call(storage_type="existing EFS", max_number=20, storage_count=0), - call(storage_type="existing FSx", max_number=20, storage_count=2), - call(storage_type="new EFS", max_number=1, storage_count=1), - call(storage_type="new FSx", max_number=1, storage_count=1), - call(storage_type="new RAID", max_number=1, storage_count=0), - ], - any_order=True, - ) - # Scheduler plugin related validators - plugin_interface_version_validator.assert_has_calls( - [ - call( - plugin_version="1.0", - support_version_high_range=packaging.version.Version("1.0"), - support_version_low_range=packaging.version.Version("1.0"), - ) - ] - ) - scheduler_plugin_os_architecture_validator.assert_has_calls( - [ - call( - architecture="x86_64", - os="centos7", - supported_arm64=["ubuntu1804"], - supported_x86=["ubuntu18", "centos7"], - ) - ] - ) - scheduler_plugin_region_validator.assert_has_calls( - [call(region="us-east-1", supported_regions=["cn-north-1", "us-east-1"])] - ) - sudo_privileges_validator.assert_has_calls([call(grant_sudo_privileges=True, requires_sudo_privileges=False)]) - supported_versions_validator.assert_has_calls( - 
[call(installed_version=get_installed_version(), supported_versions_string=">=3.1.0, <=3.4.2")] - ) - user_name_validator.assert_has_calls([call(user_name="user1"), call(user_name="user2")]) - - # No assertion on the argument for minor validators - name_validator.assert_called() - fsx_s3_validator.assert_called() - fsx_backup_options_validator.assert_called() - fsx_storage_type_options_validator.assert_called() - fsx_storage_capacity_validator.assert_called() - fsx_backup_id_validator.assert_called() - ebs_volume_throughput_validator.assert_called() - ebs_volume_throughput_iops_validator.assert_called() - ebs_volume_iops_validator.assert_called() - ebs_volume_size_snapshot_validator.assert_called() - shared_ebs_volume_id_validator.assert_called() - fsx_persistent_options_validator.assert_called() - grant_sudo_privileges_validator.assert_called() + detailed_monitoring_validator.assert_called() + compute_resource_tags_validator.assert_called() diff --git a/cli/tests/pcluster/validators/test_all_validators/test_scheduler_plugin_all_validators_are_called/scheduler_plugin_1.yaml b/cli/tests/pcluster/validators/test_all_validators/test_scheduler_plugin_all_validators_are_called/scheduler_plugin_1.yaml deleted file mode 100644 index dfe2c5c514..0000000000 --- a/cli/tests/pcluster/validators/test_all_validators/test_scheduler_plugin_all_validators_are_called/scheduler_plugin_1.yaml +++ /dev/null @@ -1,284 +0,0 @@ -Image: - Os: centos7 - CustomAmi: ami-12345678 -HeadNode: - InstanceType: t2.micro # t2.micro - Networking: - SubnetId: subnet-12345678 # subnet-xxx - ElasticIp: String # true|false|EIP-id - SecurityGroups: - - sg-12345678 - - sg-23456789 - Proxy: - HttpProxyAddress: String # https://proxy-address:port - DisableSimultaneousMultithreading: false - Ssh: - KeyName: ec2-key-name - AllowedIps: 1.2.3.4/32 - LocalStorage: - RootVolume: - Size: 40 - Encrypted: true - EphemeralVolume: - MountDir: /test - Dcv: - Enabled: true - Port: 8443 - AllowedIps: 0.0.0.0/0 - CustomActions: - OnNodeStart: - Script: https://test.tgz - Args: - - String - - stirng2 - OnNodeConfigured: - Script: https://test.tgz - Args: - - String - - stirng2 - Iam: - InstanceRole: arn:aws:iam::aws:role/CustomHeadNodeRole -Scheduling: - Scheduler: plugin - SchedulerSettings: - SchedulerDefinition: - PluginInterfaceVersion: "1.0" - Metadata: - Version: 1.0.0 - Name: my-scheduler - Documentation: link - Requirements: - SupportedDistros: - X86: ["ubuntu1804", "centos7"] - Arm64: ["ubuntu1804"] - SupportedRegions: ["cn-north-1", "us-east-1"] - QueueConstraints: - MaxCount: 5 - ComputeResourceConstraints: - MaxCount: 3 - RequiresSudoPrivileges: False - SupportsClusterUpdate: True - SupportedParallelClusterVersions: ">=3.1.0, <=3.4.2" - ClusterInfrastructure: - CloudFormation: - Template: https://bucket/scheduler_plugin/additional_cluster_infrastructure_no_jinja.cfn.yaml - Checksum: b4479e35f4e1f60b680f343b21d9dc30c958a6d239974e96a463b4479e35f4e1 - PluginResources: - ClusterSharedArtifacts: - - Source: s3://${Region}-aws-parallelcluster.s3.${Region}.${URLSuffix}/plugins/slurm/v1.0.0/artifacts.tar.gz - S3BucketOwner: "012345678910" - Checksum: b4479e35f4e1f60b680f343b21d9dc30c958a6d239974e96a463b4479e35f4e0 - - Source: s3://${Region}-aws-parallelcluster.s3.${Region}.${URLSuffix}/plugins/slurm/v1.0.0/artifacts2.tar.gz - Events: - HeadInit: - ExecuteCommand: - Command: env - HeadConfigure: - ExecuteCommand: - Command: artifacts/handlers/head_configure_plugin.sh - HeadFinalize: - ExecuteCommand: - Command: 
artifacts/handlers/head_finalize.sh - ComputeInit: - ExecuteCommand: - Command: env - ComputeConfigure: - ExecuteCommand: - Command: artifacts/handlers/compute_configure_plugin.sh - ComputeFinalize: - ExecuteCommand: - Command: artifacts/handlers/compute_finalize.sh - HeadClusterUpdate: - ExecuteCommand: - Command: artifacts/handlers/head_cluster_update.sh - HeadComputeFleetUpdate: - ExecuteCommand: - Command: artifacts/handlers/head_computefleet_update.sh - Monitoring: - Logs: - Files: - - FilePath: /var/log/slurmctld.log - TimestampFormat: "%Y-%m-%dT%H:%M:%S.%f" - NodeType: ALL - LogStreamName: slurmctld.log - - FilePath: /var/log/slurmd.log - TimestampFormat: "%Y-%m-%dT%H:%M:%S.%f" - NodeType: HEAD - LogStreamName: slurmctld.log - - FilePath: "/var/log/aws-autoscaler/logfile.log" - TimestampFormat: "%Y-%m-%d %H:%M:%S,%f" - NodeType: COMPUTE - LogStreamName: logfile.log - SystemUsers: - - Name: user1 - EnableImds: true # optional, defaults to 'false' - - Name: user2 - GrantSudoPrivileges: true - CustomSettings: # here you can add any custom setting - ScaledownIdletime: 10 - SchedulerType: "sched/backfill" - SuspendTimeout: 60 - SchedulerQueues: - - Name: queue1 - CapacityType: ONDEMAND - Networking: - SubnetIds: - - subnet-12345678 - CustomSettings: # here you can add any custom setting - Key1: String - ComputeResources: - - Name: computeresource1 - InstanceType: c5.xlarge - CustomSettings: # here you can add any custom setting - RealMemory: 185000 - - Name: computeresource2 - InstanceType: c4.xlarge - CustomActions: - OnNodeStart: - Script: https://test.tgz # s3:// | https:// - Args: - - arg1 - - arg2 - OnNodeConfigured: - Script: https://test.tgz # s3:// | https:// - Args: - - arg1 - - arg2 - Iam: - S3Access: - - BucketName: string1 - EnableWriteAccess: False - AdditionalIamPolicies: - - Policy: arn:aws:iam::aws:policy/AdministratorAccess - Image: - CustomAmi: ami-12345678 - - Name: queue2 - ComputeSettings: - LocalStorage: - RootVolume: - Size: 35 - Encrypted: true - VolumeType: gp2 - Iops: 100 - EphemeralVolume: - MountDir: /scratch - Networking: - SubnetIds: - - subnet-12345678 - AssignPublicIp: true - SecurityGroups: - - sg-12345678 - PlacementGroup: - Enabled: true - Id: String - Proxy: - HttpProxyAddress: https://proxy-address:port - ComputeResources: - - Name: computeresource1 - InstanceType: c4.2xlarge - - Name: computeresource2 - InstanceType: c5.2xlarge - MinCount: 1 - MaxCount: 15 - SpotPrice: 1.1 - DisableSimultaneousMultithreading: true - Efa: - Enabled: true - GdrSupport: false - - Name: pg-omit-odcr-arn - InstanceType: c6gn.xlarge - MinCount: 1 - MaxCount: 10 - CapacityReservationTarget: - CapacityReservationResourceGroupArn: 'arn:aws:resource-groups:us-west-2:944054287730:group/odcr-grp' - Iam: - InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile - Image: - CustomAmi: ami-23456789 -SharedStorage: - - MountDir: /my/mount/point1 - Name: name1 - StorageType: Ebs - EbsSettings: - VolumeType: gp2 # gp2 | gp3 | io1 | io2 | sc1 | st1 | standard - Iops: 100 - Size: 150 - Encrypted: True - KmsKeyId: String - SnapshotId: snap-12345678 - VolumeId: vol-12345678 - - MountDir: /my/mount/point2 - Name: name2 - StorageType: Efs - EfsSettings: - ThroughputMode: provisioned # bursting | provisioned - ProvisionedThroughput: 1024 - - MountDir: /my/mount/point3 - Name: name3 - StorageType: FsxLustre - FsxLustreSettings: - StorageCapacity: 3600 - DeploymentType: PERSISTENT_1 # PERSISTENT_1 | PERSISTENT_2 | SCRATCH_1 | SCRATCH_2 - ImportedFileChunkSize: 1024 - 
ExportPath: String # s3://bucket/folder - ImportPath: String # s3://bucket - WeeklyMaintenanceStartTime: "1:00:00" - AutomaticBackupRetentionDays: 0 - CopyTagsToBackups: true - DailyAutomaticBackupStartTime: 01:03 - PerUnitStorageThroughput: 200 - # BackupId: backup-fedcba98 # BackupId cannot coexist with some of the fields - KmsKeyId: String # xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx - # FileSystemId: fs-12345678123456789 # FileSystemId cannot coexist with some of the fields - AutoImportPolicy: NEW # NEW | NEW_CHANGED | NEW_CHANGED_DELETED - DriveCacheType: READ # READ - StorageType: HDD # HDD | SSD - - MountDir: /my/mount/point4 - Name: name4 - StorageType: FsxOntap - FsxOntapSettings: - VolumeId: "fsvol-00e6e91b8898ec4ef" - - MountDir: /my/mount/point5 - Name: name5 - StorageType: FsxOpenZfs - FsxOpenZfsSettings: - VolumeId: "fsvol-00e6e91b8898ec4ef" -Iam: - PermissionsBoundary: arn:aws:iam::aws:policy/boundary - ResourcePrefix: /path-prefix/name-prefix - Roles: - LambdaFunctionsRole: String # arn:aws:iam::aws:role/CustomResourcesLambdaRole -Monitoring: - DetailedMonitoring: true # false - Logs: - CloudWatch: - Enabled: true # true - RetentionInDays: 30 # 14 - Dashboards: - CloudWatch: - Enabled: true # true -AdditionalPackages: - IntelSoftware: - IntelHpcPlatform: true -Tags: - - Key: String - Value: String - - Key: two - Value: two22 -CustomS3Bucket: String -AdditionalResources: String # https://template.url -DevSettings: - ClusterTemplate: https://tests/aws-parallelcluster-template-3.0.tgz - Cookbook: - ChefCookbook: https://tests/aws-parallelcluster-cookbook-3.0.tgz - ExtraChefAttributes: | - {"cluster": {"scheduler_slots": "cores"}} - AwsBatchCliPackage: s3://test/aws-parallelcluster-batch-3.0.tgz - NodePackage: s3://test/aws-parallelcluster-node-3.0.tgz - Timeouts: - HeadNodeBootstrapTimeout: 1201 # Default 1800 (seconds) - ComputeNodeBootstrapTimeout: 1001 # Default 1800 (seconds) -DeploymentSettings: - LambdaFunctionsVpcConfig: - SecurityGroupIds: ["sg-028d73ae220157d96"] - SubnetIds: ["subnet-8e482ce8"] diff --git a/cli/tests/pcluster/validators/test_all_validators/test_scheduler_plugin_all_validators_are_called/scheduler_plugin_2.yaml b/cli/tests/pcluster/validators/test_all_validators/test_scheduler_plugin_all_validators_are_called/scheduler_plugin_2.yaml deleted file mode 100644 index 7f16304bd1..0000000000 --- a/cli/tests/pcluster/validators/test_all_validators/test_scheduler_plugin_all_validators_are_called/scheduler_plugin_2.yaml +++ /dev/null @@ -1,41 +0,0 @@ -Image: - Os: alinux2 -HeadNode: - InstanceType: t2.micro - Networking: - SubnetId: subnet-12345678 - Ssh: - KeyName: ec2-key-name -Scheduling: - Scheduler: plugin - SchedulerSettings: - SchedulerDefinition: - PluginInterfaceVersion: "1.0" - Metadata: - Version: 1.0.0 - Name: my-scheduler - Events: - HeadInit: - ExecuteCommand: - Command: env - SchedulerQueues: - - Name: queue1 - Networking: - SubnetIds: - - subnet-12345678 - ComputeResources: - - Name: compute-resource1 - InstanceType: c5.2xlarge -SharedStorage: - - MountDir: /my/mount/point2 - Name: name1 - StorageType: Efs - EfsSettings: - FileSystemId: fs-12345678123456789 - - MountDir: /my/mount/point3 - Name: name2 - StorageType: FsxLustre - FsxLustreSettings: - FileSystemId: fs-12345678123456789 -Iam: - ResourcePrefix: /path-prefix/name-prefix diff --git a/cli/tests/pcluster/validators/test_all_validators/test_scheduler_plugin_validators_are_called_with_correct_argument/scheduler_plugin.yaml 
b/cli/tests/pcluster/validators/test_all_validators/test_scheduler_plugin_validators_are_called_with_correct_argument/scheduler_plugin.yaml deleted file mode 100644 index 70f6d817c0..0000000000 --- a/cli/tests/pcluster/validators/test_all_validators/test_scheduler_plugin_validators_are_called_with_correct_argument/scheduler_plugin.yaml +++ /dev/null @@ -1,192 +0,0 @@ -Image: - Os: centos7 - CustomAmi: ami-12345678 -HeadNode: - InstanceType: c5d.xlarge - Networking: - SubnetId: subnet-23456789 - Ssh: - KeyName: ec2-key-name - LocalStorage: - EphemeralVolume: - MountDir: /scratch_head - Dcv: - Enabled: true -Scheduling: - Scheduler: plugin - SchedulerSettings: - SchedulerDefinition: - PluginInterfaceVersion: "1.0" - Metadata: - Version: 1.0.0 - Name: my-scheduler - Documentation: link - Requirements: - SupportedDistros: - X86: ["ubuntu18", "centos7"] - Arm64: ["ubuntu1804"] - SupportedRegions: ["cn-north-1", "us-east-1"] - QueueConstraints: - MaxCount: 5 - ComputeResourceConstraints: - MaxCount: 3 - RequiresSudoPrivileges: False - SupportsClusterUpdate: True - SupportedParallelClusterVersions: ">=3.1.0, <=3.4.2" - ClusterInfrastructure: - CloudFormation: - Template: https://bucket/scheduler_plugin/additional_cluster_infrastructure_no_jinja.cfn.yaml - Checksum: b4479e35f4e1f60b680f343b21d9dc30c958a6d239974e96a463b4479e35f4e1 - PluginResources: - ClusterSharedArtifacts: - - Source: s3://${Region}-aws-parallelcluster.s3.${Region}.${URLSuffix}/plugins/slurm/v1.0.0/artifacts.tar.gz - S3BucketOwner: "012345678910" - Checksum: b4479e35f4e1f60b680f343b21d9dc30c958a6d239974e96a463b4479e35f4e0 - - Source: s3://${Region}-aws-parallelcluster.s3.${Region}.${URLSuffix}/plugins/slurm/v1.0.0/artifacts2.tar.gz - Events: - HeadInit: - ExecuteCommand: - Command: env - HeadConfigure: - ExecuteCommand: - Command: artifacts/handlers/head_configure_plugin.sh - HeadFinalize: - ExecuteCommand: - Command: artifacts/handlers/head_finalize.sh - ComputeInit: - ExecuteCommand: - Command: env - ComputeConfigure: - ExecuteCommand: - Command: artifacts/handlers/compute_configure_plugin.sh - ComputeFinalize: - ExecuteCommand: - Command: artifacts/handlers/compute_finalize.sh - HeadClusterUpdate: - ExecuteCommand: - Command: artifacts/handlers/head_cluster_update.sh - HeadComputeFleetUpdate: - ExecuteCommand: - Command: artifacts/handlers/head_computefleet_update.sh - Monitoring: - Logs: - Files: - - FilePath: /var/log/slurmctld.log - TimestampFormat: "%Y-%m-%dT%H:%M:%S.%f" - NodeType: ALL - LogStreamName: slurmctld.log - - FilePath: /var/log/slurmd.log - TimestampFormat: "%Y-%m-%dT%H:%M:%S.%f" - NodeType: HEAD - LogStreamName: slurmctld.log - - FilePath: "/var/log/aws-autoscaler/logfile.log" - TimestampFormat: "%Y-%m-%d %H:%M:%S,%f" - NodeType: COMPUTE - LogStreamName: logfile.log - SystemUsers: - - Name: user1 - EnableImds: true # optional, defaults to 'false' - - Name: user2 - GrantSudoPrivileges: true - CustomSettings: # here you can add any custom setting - ScaledownIdletime: 10 - SchedulerType: "sched/backfill" - SuspendTimeout: 60 - SchedulerQueues: - - Name: queue1 - CapacityType: ONDEMAND - Networking: - SubnetIds: - - subnet-12345678 - CustomSettings: # here you can add any custom setting - Key1: String - ComputeResources: - - Name: computeresource1 - InstanceType: c5.xlarge - CustomSettings: # here you can add any custom setting - RealMemory: 185000 - - Name: computeresource2 - InstanceType: c4.xlarge - CustomActions: - OnNodeStart: - Script: https://test.tgz # s3:// | https:// - Args: - - arg1 - - arg2 - 
OnNodeConfigured: - Script: https://test.tgz # s3:// | https:// - Args: - - arg1 - - arg2 - Iam: - S3Access: - - BucketName: string1 - EnableWriteAccess: False - AdditionalIamPolicies: - - Policy: arn:aws:iam::aws:policy/AdministratorAccess - Image: - CustomAmi: ami-12345678 - - Name: queue2 - ComputeSettings: - LocalStorage: - RootVolume: - Size: 35 - Encrypted: true - VolumeType: gp2 - Iops: 100 - EphemeralVolume: - MountDir: /scratch - Networking: - SubnetIds: - - subnet-12345678 - AssignPublicIp: true - SecurityGroups: - - sg-12345678 - PlacementGroup: - Enabled: true - Id: String - Proxy: - HttpProxyAddress: https://proxy-address:port - ComputeResources: - - Name: computeresource1 - InstanceType: c4.2xlarge - - Name: computeresource2 - InstanceType: c5.2xlarge - MinCount: 1 - MaxCount: 15 - SpotPrice: 1.1 - DisableSimultaneousMultithreading: true - Efa: - Enabled: true - GdrSupport: false - Iam: - InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile - Image: - CustomAmi: ami-23456789 -SharedStorage: - - MountDir: /my/mount/point1 - Name: name1 - StorageType: Ebs - EbsSettings: - VolumeId: vol-12345678 - - MountDir: /my/mount/point2 - Name: name2 - StorageType: Efs - EfsSettings: - Encrypted: True - KmsKeyId: 1234abcd-12ab-34cd-56ef-1234567890ab - - MountDir: /my/mount/point3 - Name: name3 - StorageType: FsxLustre - FsxLustreSettings: - StorageCapacity: 3600 - - MountDir: /my/mount/point4 - Name: name4 - StorageType: FsxOntap - FsxOntapSettings: - VolumeId: "fsvol-00e6e91b8898ec4ef" - - MountDir: /my/mount/point5 - Name: name5 - StorageType: FsxOpenZfs - FsxOpenZfsSettings: - VolumeId: "fsvol-00e6e91b8898ec4ef" diff --git a/cli/tests/pcluster/validators/test_all_validators/test_slurm_all_validators_are_called/slurm_1.yaml b/cli/tests/pcluster/validators/test_all_validators/test_slurm_all_validators_are_called/slurm_1.yaml index 8e90648a51..43ccb67eae 100644 --- a/cli/tests/pcluster/validators/test_all_validators/test_slurm_all_validators_are_called/slurm_1.yaml +++ b/cli/tests/pcluster/validators/test_all_validators/test_slurm_all_validators_are_called/slurm_1.yaml @@ -45,15 +45,22 @@ Scheduling: Dns: DisableManagedDns: false HostedZoneId: 12345ABC + CustomSlurmSettingsIncludeFile: https://test.conf SlurmQueues: - Name: queue1 CapacityType: ONDEMAND Networking: SubnetIds: - subnet-12345678 + CustomSlurmSettings: + Param1: Value1 + Param2: Value2 ComputeResources: - Name: compute_resource1 InstanceType: c5.xlarge + CustomSlurmSettings: + Param1: Value1 + Param2: Value2 - Name: compute_resource2 InstanceType: c4.xlarge CustomActions: @@ -73,6 +80,11 @@ Scheduling: EnableWriteAccess: False AdditionalIamPolicies: - Policy: arn:aws:iam::aws:policy/AdministratorAccess + Tags: + - Key: queue_tag1 + Value: String + - Key: queue_tag2 + Value: String - Name: queue2 ComputeSettings: LocalStorage: @@ -95,6 +107,11 @@ Scheduling: ComputeResources: - Name: compute_resource_1 InstanceType: c4.2xlarge + Tags: + - Key: compute_tag1 + Value: String + - Key: compute_tag2 + Value: String - Name: compute_resource_2 InstanceType: c5.2xlarge MinCount: 1 diff --git a/cli/tests/pcluster/validators/test_all_validators/test_slurm_all_validators_are_called/slurm_2.yaml b/cli/tests/pcluster/validators/test_all_validators/test_slurm_all_validators_are_called/slurm_2.yaml index 7627fdb8d2..35f647f992 100644 --- a/cli/tests/pcluster/validators/test_all_validators/test_slurm_all_validators_are_called/slurm_2.yaml +++ 
b/cli/tests/pcluster/validators/test_all_validators/test_slurm_all_validators_are_called/slurm_2.yaml @@ -11,6 +11,11 @@ Scheduling: Scheduler: slurm SlurmSettings: EnableMemoryBasedScheduling: true + CustomSlurmSettings: + - Param1: Value1 + - Param2: Value2 + - NodeName: test-node[1-100] + CPUs: 100 Database: Uri: test.databaseserver.com UserName: test_admin diff --git a/cli/tests/pcluster/validators/test_all_validators/test_slurm_validators_are_called_with_correct_argument/slurm.yaml b/cli/tests/pcluster/validators/test_all_validators/test_slurm_validators_are_called_with_correct_argument/slurm.yaml index b1b0e0fab2..8fca3e41a6 100644 --- a/cli/tests/pcluster/validators/test_all_validators/test_slurm_validators_are_called_with_correct_argument/slurm.yaml +++ b/cli/tests/pcluster/validators/test_all_validators/test_slurm_validators_are_called_with_correct_argument/slurm.yaml @@ -20,6 +20,11 @@ Scheduling: ComputeResources: - Name: compute_resource1 InstanceType: t2.large + Tags: + - Key: compute_tag1 + Value: String + - Key: compute_tag2 + Value: String - Name: compute_resource2 InstanceType: c4.2xlarge - Name: queue2 @@ -37,6 +42,11 @@ Scheduling: - Name: compute_resource3 InstanceType: t2.large DisableSimultaneousMultithreading: true + Tags: + - Key: queue_tag1 + Value: String + - Key: queue_tag2 + Value: String SharedStorage: - MountDir: /my/mount/point1 Name: name1 @@ -53,4 +63,9 @@ SharedStorage: Name: name3 StorageType: FsxLustre FsxLustreSettings: - StorageCapacity: 3600 \ No newline at end of file + StorageCapacity: 3600 +Tags: + - Key: cluster_tag1 + Value: String + - Key: cluster_tag2 + Value: String \ No newline at end of file diff --git a/cli/tests/pcluster/validators/test_cluster_validators.py b/cli/tests/pcluster/validators/test_cluster_validators.py index 8b1bee2219..640005bff0 100644 --- a/cli/tests/pcluster/validators/test_cluster_validators.py +++ b/cli/tests/pcluster/validators/test_cluster_validators.py @@ -58,10 +58,12 @@ ManagedFsxMultiAzValidator, MaxCountValidator, MixedSecurityGroupOverwriteValidator, + MultiNetworkInterfacesInstancesValidator, NameValidator, NumberOfStorageValidator, OverlappingMountDirValidator, RegionValidator, + RootVolumeEncryptionConsistencyValidator, RootVolumeSizeValidator, SchedulableMemoryValidator, SchedulerOsValidator, @@ -77,11 +79,23 @@ MultiAzRootVolumeValidator, SharedEbsVolumeIdValidator, ) +from pcluster.validators.slurm_settings_validator import ( + SLURM_SETTINGS_DENY_LIST, + CustomSlurmNodeNamesValidator, + CustomSlurmSettingLevel, + CustomSlurmSettingsIncludeFileOnlyValidator, + CustomSlurmSettingsValidator, +) from tests.pcluster.aws.dummy_aws_api import mock_aws_api from tests.pcluster.validators.utils import assert_failure_level, assert_failure_messages from tests.utils import MockedBoto3Request +@pytest.fixture +def get_region(mocker): + mocker.patch("pcluster.config.cluster_config.get_region", return_value="WHATEVER_REGION") + + @pytest.fixture() def boto3_stubber_path(): return "pcluster.aws.common.boto3" @@ -185,6 +199,126 @@ def test_cluster_name_validator_slurm_accounting(cluster_name, scheduling, shoul assert_failure_messages(actual_failures, expected_message) +@pytest.mark.parametrize( + "description, custom_settings, deny_list, settings_level, expected_message", + [ + ( + "No error when custom settings are not in the deny_list", + [{"Allowed1": "Value1"}, {"Allowed2": "Value2"}], + SLURM_SETTINGS_DENY_LIST["SlurmConf"], # keep the deny-list lowercase + CustomSlurmSettingLevel.SLURM_CONF, + "", + ), + ( + "Fails 
when custom settings at SlurmConf level are in the deny_list, invalid parameters are reported", + [{"SlurmctldParameters": "SubPar1,Subpar2=1"}, {"CommunicationParameters": "SubPar1"}], + SLURM_SETTINGS_DENY_LIST["SlurmConf"]["Global"], # keep the deny-list lowercase + CustomSlurmSettingLevel.SLURM_CONF, + "Using the following custom Slurm settings at SlurmConf level is not allowed: " + "CommunicationParameters,SlurmctldParameters", + ), + ( + "No error when custom settings are not in the deny_list", + [{"Allowed1": "Value1", "Allowed2": "Value2"}], + ["denied1", "denied2"], # keep the deny-list lowercase + CustomSlurmSettingLevel.QUEUE, + "", + ), + ( + "Fails when custom settings are in the deny_list, invalid parameters are reported", + [{"Denied1": "Value1", "Denied2": "Value2"}], + ["denied1", "denied2"], + CustomSlurmSettingLevel.QUEUE, + "Using the following custom Slurm settings at Queue level is not allowed: Denied1,Denied2", + ), + ( + "No error when custom settings are not in the deny_list", + [{"Allowed1": "Value1", "Allowed2": "Value2"}], + ["denied1", "denied2"], + CustomSlurmSettingLevel.COMPUTE_RESOURCE, + "", + ), + ( + "Fails when custom settings are in the deny_list, invalid parameters are reported", + [{"Denied1": "Value1", "Denied2": "Value2"}], + ["denied1", "denied2"], + CustomSlurmSettingLevel.COMPUTE_RESOURCE, + "Using the following custom Slurm settings at ComputeResource level is not allowed: Denied1,Denied2", + ), + ( + "Case doesn't affect the result and duplicates are avoided, but only the first occurrence is reported", + [{"Denied1": "Value1", "Denied2": "Value2", "dEnIeD1": "Value1", "deNIeD2": "Value2"}], + ["denied1", "denied2"], + CustomSlurmSettingLevel.COMPUTE_RESOURCE, + "Using the following custom Slurm settings at ComputeResource level is not allowed: Denied1,Denied2", + ), + ], +) +def test_custom_slurm_settings_validator(description, custom_settings, deny_list, settings_level, expected_message): + actual_failures = CustomSlurmSettingsValidator().execute(custom_settings, deny_list, settings_level) + assert_failure_messages(actual_failures, expected_message) + + +@pytest.mark.parametrize( + "custom_slurm_settings, expected_message", + [ + # Generic case without custom Slurm nodes + ([{"Param1": "Value1"}, {"Param2": "Value2"}], ""), + # Generic case with custom Slurm nodes + ([{"NodeName": "test-node[1-100]", "CPUs": "16"}], ""), + # Generic case with custom Slurm nodes with bad name + ( + [{"NodeName": "test-st-node[1-100]", "CPUs": "16"}], + "Substrings '-st-' and '-dy-' in node names are reserved for nodes managed by ParallelCluster. " + "Please rename the following custom Slurm nodes: test-st-node[1-100]", + ), + # Generic case with custom Slurm nodes with bad name + ( + [{"NodeName": "test-dy-node[1-100]", "CPUs": "16"}], + "Substrings '-st-' and '-dy-' in node names are reserved for nodes managed by ParallelCluster. " + "Please rename the following custom Slurm nodes: test-dy-node[1-100]", + ), + # Generic case with multiple custom Slurm nodelists with bad node name + ( + [{"NodeName": "test-st-node[1-100]", "CPUs": "16"}, {"NodeName": "test-dy-node[1-100]", "CPUs": "16"}], + "Substrings '-st-' and '-dy-' in node names are reserved for nodes managed by ParallelCluster. 
" + "Please rename the following custom Slurm nodes: test-dy-node[1-100], test-st-node[1-100]", + ), + # Unrealistic corner case with custom Slurm nodes with names defined multiple times + ( + [{"NodeName": "test-dy-node[1-100]", "CPUs": "16", "nodename": "test-node[1-100]"}], + "Substrings '-st-' and '-dy-' in node names are reserved for nodes managed by ParallelCluster. " + "Please rename the following custom Slurm nodes: test-dy-node[1-100]", + ), + ], +) +def test_custom_slurm_node_names_validator(custom_slurm_settings, expected_message): + actual_failures = CustomSlurmNodeNamesValidator().execute(custom_slurm_settings) + assert_failure_messages(actual_failures, expected_message) + + +@pytest.mark.parametrize( + "custom_slurm_settings, custom_slurm_settings_include_file, expected_message", + [ + ([{"Param1": "Value1"}, {"Param2": "Value2"}], "", ""), + ([], "s3://test", ""), + ( + [{"Param1": "Value1"}, {"Param2": "Value2"}], + "s3://test", + "CustomSlurmsettings and CustomSlurmSettingsIncludeFile cannot be used together under SlurmSettings.", + ), + ], +) +def test_custom_slurm_settings_include_file_only_validator( + custom_slurm_settings, custom_slurm_settings_include_file, expected_message +): + actual_failures = CustomSlurmSettingsIncludeFileOnlyValidator().execute( + custom_slurm_settings, + custom_slurm_settings_include_file, + ) + assert_failure_messages(actual_failures, expected_message) + + @pytest.mark.parametrize( "region, expected_message", [ @@ -1171,7 +1305,7 @@ def test_shared_storage_mount_dir_validator(mount_dir, expected_message): (False, "alinux2", "t2.micro", None, None, None), # doesn't fail because DCV is disabled (True, "ubuntu1804", "m6g.xlarge", None, None, None), (True, "alinux2", "m6g.xlarge", None, None, None), - (True, "rhel8", "m6g.xlarge", None, None, "Please double check the os configuration"), + (True, "rhel8", "m6g.xlarge", None, None, None), (True, "ubuntu2004", "m6g.xlarge", None, None, "Please double check the os configuration"), ], ) @@ -1808,11 +1942,11 @@ def test_efs_id_validator( @pytest.mark.parametrize( - "queues, new_storage_count, failure_level, expected_message", + "queue_parameters_array, new_storage_count, failure_level, expected_message", [ ( [ - SlurmQueue( + dict( name="queue1", compute_resources=[], networking=SlurmQueueNetworking( @@ -1826,14 +1960,14 @@ def test_efs_id_validator( ), ( [ - SlurmQueue( + dict( name="queue1", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="queue2", compute_resources=[], networking=SlurmQueueNetworking( @@ -1847,14 +1981,14 @@ def test_efs_id_validator( ), ( [ - SlurmQueue( + dict( name="queue1", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="queue2", compute_resources=[], networking=SlurmQueueNetworking( @@ -1868,14 +2002,14 @@ def test_efs_id_validator( ), ( [ - SlurmQueue( + dict( name="queue1", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="queue2", compute_resources=[], networking=SlurmQueueNetworking( @@ -1892,14 +2026,14 @@ def test_efs_id_validator( ), ( [ - SlurmQueue( + dict( name="queue1", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="queue2", compute_resources=[], networking=SlurmQueueNetworking( @@ -1913,14 +2047,14 @@ def test_efs_id_validator( ), ( [ - SlurmQueue( + dict( name="queue1", compute_resources=[], 
networking=SlurmQueueNetworking( subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="queue2", compute_resources=[], networking=SlurmQueueNetworking( @@ -1934,14 +2068,14 @@ def test_efs_id_validator( ), ( [ - SlurmQueue( + dict( name="queue1", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="queue2", compute_resources=[], networking=SlurmQueueNetworking( @@ -1955,14 +2089,14 @@ def test_efs_id_validator( ), ( [ - SlurmQueue( + dict( name="queue1", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="queue2", compute_resources=[], networking=SlurmQueueNetworking( @@ -1979,14 +2113,14 @@ def test_efs_id_validator( ), ( [ - SlurmQueue( + dict( name="queue1", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="queue2", compute_resources=[], networking=SlurmQueueNetworking( @@ -2003,25 +2137,29 @@ def test_efs_id_validator( ), ], ) -def test_new_storage_multiple_subnets_validator(queues, new_storage_count, failure_level, expected_message): +@pytest.mark.usefixtures("get_region") +def test_new_storage_multiple_subnets_validator( + queue_parameters_array, new_storage_count, failure_level, expected_message +): + queues = [SlurmQueue(**queue_parameters) for queue_parameters in queue_parameters_array] actual_failures = ManagedFsxMultiAzValidator().execute(queues, new_storage_count) assert_failure_messages(actual_failures, expected_message) assert_failure_level(actual_failures, failure_level) @pytest.mark.parametrize( - "queues, subnet_az_mappings, fsx_az_list, failure_level, expected_messages", + "queue_parameters_array, subnet_az_mappings, fsx_az_list, failure_level, expected_messages", [ ( [ - SlurmQueue( + dict( name="different-az-queue", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-2"], ), ), - SlurmQueue( + dict( name="single-az-same-subnet-queue", compute_resources=[], networking=SlurmQueueNetworking( @@ -2040,14 +2178,14 @@ def test_new_storage_multiple_subnets_validator(queues, new_storage_count, failu ), ( [ - SlurmQueue( + dict( name="same-az-same-subnet-queue", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="same-az-other-subnet-queue", compute_resources=[], networking=SlurmQueueNetworking( @@ -2062,14 +2200,14 @@ def test_new_storage_multiple_subnets_validator(queues, new_storage_count, failu ), ( [ - SlurmQueue( + dict( name="one-az-match-queue", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="full-az-match-queue", compute_resources=[], networking=SlurmQueueNetworking( @@ -2084,14 +2222,14 @@ def test_new_storage_multiple_subnets_validator(queues, new_storage_count, failu ), ( [ - SlurmQueue( + dict( name="multi-az-queue-match", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1", "subnet-2"], ), ), - SlurmQueue( + dict( name="multi-az-queue-match", compute_resources=[], networking=SlurmQueueNetworking( @@ -2109,7 +2247,7 @@ def test_new_storage_multiple_subnets_validator(queues, new_storage_count, failu ), ( [ - SlurmQueue( + dict( name="multi-az-queue-mismatch", compute_resources=[], networking=SlurmQueueNetworking( @@ -2128,7 +2266,7 @@ def test_new_storage_multiple_subnets_validator(queues, new_storage_count, failu ), ( [ - SlurmQueue( + dict( name="multi-az-queue-partial-match", compute_resources=[], 
networking=SchedulerPluginQueueNetworking( @@ -2147,7 +2285,7 @@ def test_new_storage_multiple_subnets_validator(queues, new_storage_count, failu ), ( [ - SlurmQueue( + dict( name="multi-az-queue-match", compute_resources=[], networking=SchedulerPluginQueueNetworking( @@ -2162,14 +2300,14 @@ def test_new_storage_multiple_subnets_validator(queues, new_storage_count, failu ), ( [ - SlurmQueue( + dict( name="different-az-queue-1", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="different-az-queue-2", compute_resources=[], networking=SlurmQueueNetworking( @@ -2192,22 +2330,23 @@ def test_new_storage_multiple_subnets_validator(queues, new_storage_count, failu ], ) def test_unmanaged_fsx_multiple_az_validator( - mocker, queues, subnet_az_mappings, fsx_az_list, failure_level, expected_messages + mocker, queue_parameters_array, subnet_az_mappings, fsx_az_list, failure_level, expected_messages ): mock_aws_api(mocker) mocker.patch("pcluster.aws.ec2.Ec2Client.get_subnets_az_mapping", side_effect=subnet_az_mappings) + queues = [SlurmQueue(**queue_parameters) for queue_parameters in queue_parameters_array] actual_failures = UnmanagedFsxMultiAzValidator().execute(queues, fsx_az_list) assert_failure_messages(actual_failures, expected_messages) assert_failure_level(actual_failures, failure_level) @pytest.mark.parametrize( - "storage, is_managed, availability_zone, volume_description", + "storage_parameters, is_managed, availability_zone, volume_description", [ ( - SharedEbs( + dict( mount_dir="mount-dir", name="volume-name", kms_key_id="xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", @@ -2222,7 +2361,7 @@ def test_unmanaged_fsx_multiple_az_validator( {"AvailabilityZone": "us-east-1a"}, ), ( - SharedEbs( + dict( mount_dir="mount-dir", name="volume-name", volume_id="volume-id", @@ -2235,13 +2374,14 @@ def test_unmanaged_fsx_multiple_az_validator( ) def test_shared_ebs_properties( mocker, - storage, + storage_parameters, is_managed, availability_zone, volume_description, ): os.environ["AWS_DEFAULT_REGION"] = "us-east-1" mocker.patch("pcluster.aws.ec2.Ec2Client.describe_volume", return_value=volume_description) + storage = SharedEbs(**storage_parameters) assert_that(storage.is_managed == is_managed).is_true() assert_that(storage.availability_zone == availability_zone).is_true() @@ -2254,20 +2394,20 @@ def __init__(self, name: str, availability_zone: str): @pytest.mark.parametrize( - "head_node_az, ebs_volumes, queues, subnet_az_mappings, failure_level, expected_messages", + "head_node_az, ebs_volumes, queue_parameters_array, subnet_az_mappings, failure_level, expected_messages", [ ( "us-east-1a", [DummySharedEbs("vol-1", "us-east-1a")], [ - SlurmQueue( + dict( name="different-az-queue-1", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-2"], ), ), - SlurmQueue( + dict( name="different-az-queue-2", compute_resources=[], networking=SlurmQueueNetworking( @@ -2287,14 +2427,14 @@ def __init__(self, name: str, availability_zone: str): "us-east-1b", [DummySharedEbs("vol-1", "us-east-1a")], [ - SlurmQueue( + dict( name="same-az-queue-1", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="same-az-queue-2", compute_resources=[], networking=SlurmQueueNetworking( @@ -2317,14 +2457,14 @@ def __init__(self, name: str, availability_zone: str): DummySharedEbs("vol-3", "us-east-1c"), ], [ - SlurmQueue( + dict( name="queue-1", compute_resources=[], networking=SlurmQueueNetworking( 
subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="queue-2", compute_resources=[], networking=SlurmQueueNetworking( @@ -2348,21 +2488,21 @@ def __init__(self, name: str, availability_zone: str): "us-east-1a", [DummySharedEbs("vol-1", "us-east-1a")], [ - SlurmQueue( + dict( name="queue-1", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1", "subnet-2"], ), ), - SlurmQueue( + dict( name="queue-2", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="queue-3", compute_resources=[], networking=SlurmQueueNetworking( @@ -2388,7 +2528,7 @@ def test_multi_az_shared_ebs_validator( mocker, head_node_az, ebs_volumes, - queues, + queue_parameters_array, subnet_az_mappings, failure_level, expected_messages, @@ -2396,6 +2536,7 @@ def test_multi_az_shared_ebs_validator( mock_aws_api(mocker) mocker.patch("pcluster.aws.ec2.Ec2Client.get_subnets_az_mapping", side_effect=subnet_az_mappings) + queues = [SlurmQueue(**queue_parameters) for queue_parameters in queue_parameters_array] actual_failures = MultiAzEbsVolumeValidator().execute(head_node_az, ebs_volumes, queues) assert_failure_messages(actual_failures, expected_messages) assert_failure_level(actual_failures, failure_level) @@ -2423,19 +2564,19 @@ def test_ec2_volume_validator(mocker): @pytest.mark.parametrize( - "head_node_az, queues, subnet_az_mappings, failure_level, expected_messages", + "head_node_az, queue_parameters_array, subnet_az_mappings, failure_level, expected_messages", [ ( "us-east-1a", [ - SlurmQueue( + dict( name="same-az-queue-1", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1"], ), ), - SlurmQueue( + dict( name="different-az-queue-2", compute_resources=[], networking=SlurmQueueNetworking( @@ -2454,14 +2595,14 @@ def test_ec2_volume_validator(mocker): ( "us-east-1a", [ - SlurmQueue( + dict( name="multi-az-queue-1", compute_resources=[], networking=SlurmQueueNetworking( subnet_ids=["subnet-1", "subnet-2"], ), ), - SlurmQueue( + dict( name="different-az-queue-2", compute_resources=[], networking=SlurmQueueNetworking( @@ -2479,10 +2620,11 @@ def test_ec2_volume_validator(mocker): ), ], ) +@pytest.mark.usefixtures("get_region") def test_multi_az_root_volume_validator( mocker, head_node_az, - queues, + queue_parameters_array, subnet_az_mappings, failure_level, expected_messages, @@ -2490,6 +2632,7 @@ def test_multi_az_root_volume_validator( mock_aws_api(mocker) mocker.patch("pcluster.aws.ec2.Ec2Client.get_subnets_az_mapping", side_effect=subnet_az_mappings) + queues = [SlurmQueue(**queue_parameters) for queue_parameters in queue_parameters_array] actual_failures = MultiAzRootVolumeValidator().execute(head_node_az, queues) assert_failure_messages(actual_failures, expected_messages) assert_failure_level(actual_failures, failure_level) @@ -2512,12 +2655,13 @@ def test_are_subnets_covered_by_cidrs(mocker, ip_ranges, subnet_cidrs, covered): ).is_equal_to(covered) +@pytest.mark.usefixtures("get_region") class TestDictLaunchTemplateBuilder: @pytest.mark.parametrize( - "root_volume, image_os, expected_response", + "root_volume_parameters, image_os, region, expected_response", [ pytest.param( - RootVolume( + dict( size=10, encrypted=False, volume_type="mockVolumeType", @@ -2526,6 +2670,7 @@ class TestDictLaunchTemplateBuilder: delete_on_termination=False, ), "centos7", + "WHATEVER-NON-US-ISO-REGION", [ {"DeviceName": "/dev/xvdba", "VirtualName": "ephemeral0"}, {"DeviceName": "/dev/xvdbb", "VirtualName": "ephemeral1"}, @@ 
-2566,7 +2711,7 @@ class TestDictLaunchTemplateBuilder: id="test with all root volume fields populated", ), pytest.param( - RootVolume( + dict( encrypted=True, volume_type="mockVolumeType", iops=15, @@ -2574,6 +2719,7 @@ class TestDictLaunchTemplateBuilder: delete_on_termination=True, ), "alinux2", + "WHATEVER-NON-US-ISO-REGION", [ {"DeviceName": "/dev/xvdba", "VirtualName": "ephemeral0"}, {"DeviceName": "/dev/xvdbb", "VirtualName": "ephemeral1"}, @@ -2612,9 +2758,60 @@ class TestDictLaunchTemplateBuilder: ], id="test with missing volume size", ), + pytest.param( + dict( + encrypted=True, + volume_type="mockVolumeType", + iops=15, + throughput=20, + delete_on_termination=True, + ), + "alinux2", + "us-isoWHATEVER", + [ + {"DeviceName": "/dev/xvdba", "VirtualName": "ephemeral0"}, + {"DeviceName": "/dev/xvdbb", "VirtualName": "ephemeral1"}, + {"DeviceName": "/dev/xvdbc", "VirtualName": "ephemeral2"}, + {"DeviceName": "/dev/xvdbd", "VirtualName": "ephemeral3"}, + {"DeviceName": "/dev/xvdbe", "VirtualName": "ephemeral4"}, + {"DeviceName": "/dev/xvdbf", "VirtualName": "ephemeral5"}, + {"DeviceName": "/dev/xvdbg", "VirtualName": "ephemeral6"}, + {"DeviceName": "/dev/xvdbh", "VirtualName": "ephemeral7"}, + {"DeviceName": "/dev/xvdbi", "VirtualName": "ephemeral8"}, + {"DeviceName": "/dev/xvdbj", "VirtualName": "ephemeral9"}, + {"DeviceName": "/dev/xvdbk", "VirtualName": "ephemeral10"}, + {"DeviceName": "/dev/xvdbl", "VirtualName": "ephemeral11"}, + {"DeviceName": "/dev/xvdbm", "VirtualName": "ephemeral12"}, + {"DeviceName": "/dev/xvdbn", "VirtualName": "ephemeral13"}, + {"DeviceName": "/dev/xvdbo", "VirtualName": "ephemeral14"}, + {"DeviceName": "/dev/xvdbp", "VirtualName": "ephemeral15"}, + {"DeviceName": "/dev/xvdbq", "VirtualName": "ephemeral16"}, + {"DeviceName": "/dev/xvdbr", "VirtualName": "ephemeral17"}, + {"DeviceName": "/dev/xvdbs", "VirtualName": "ephemeral18"}, + {"DeviceName": "/dev/xvdbt", "VirtualName": "ephemeral19"}, + {"DeviceName": "/dev/xvdbu", "VirtualName": "ephemeral20"}, + {"DeviceName": "/dev/xvdbv", "VirtualName": "ephemeral21"}, + {"DeviceName": "/dev/xvdbw", "VirtualName": "ephemeral22"}, + {"DeviceName": "/dev/xvdbx", "VirtualName": "ephemeral23"}, + { + "DeviceName": "/dev/xvda", + "Ebs": { + "Encrypted": True, + "VolumeType": "mockVolumeType", + "VolumeSize": 35, + "Iops": 15, + "Throughput": 20, + "DeleteOnTermination": True, + }, + }, + ], + id="test with missing volume size in US isolated regions", + ), ], ) - def test_get_block_device_mappings(self, root_volume, image_os, expected_response): + def test_get_block_device_mappings(self, mocker, root_volume_parameters, image_os, region, expected_response): + mocker.patch("pcluster.config.cluster_config.get_region", return_value=region) + root_volume = RootVolume(**root_volume_parameters) assert_that(DictLaunchTemplateBuilder().get_block_device_mappings(root_volume, image_os)).is_equal_to( expected_response ) @@ -2661,10 +2858,10 @@ def test_get_instance_market_options(self, queue, compute_resource, expected_res ) @pytest.mark.parametrize( - "queue, compute_resource, expected_response", + "queue_parameters, compute_resource, expected_response", [ pytest.param( - SlurmQueue( + dict( name="queue1", capacity_reservation_target=CapacityReservationTarget( capacity_reservation_resource_group_arn="queue_cr_rg_arn", @@ -2687,7 +2884,7 @@ def test_get_instance_market_options(self, queue, compute_resource, expected_res id="test with queue and compute resource capacity reservation", ), pytest.param( - SlurmQueue( + dict( 
name="queue1", capacity_reservation_target=CapacityReservationTarget( capacity_reservation_id="queue_cr_id", @@ -2707,7 +2904,7 @@ def test_get_instance_market_options(self, queue, compute_resource, expected_res id="test with only queue capacity reservation", ), pytest.param( - SlurmQueue( + dict( name="queue1", compute_resources=[], networking=None, @@ -2721,7 +2918,142 @@ def test_get_instance_market_options(self, queue, compute_resource, expected_res ), ], ) - def test_get_capacity_reservation(self, queue, compute_resource, expected_response): + def test_get_capacity_reservation(self, queue_parameters, compute_resource, expected_response): + queue = SlurmQueue(**queue_parameters) assert_that(DictLaunchTemplateBuilder().get_capacity_reservation(queue, compute_resource)).is_equal_to( expected_response ) + + +@pytest.mark.parametrize( + "encryption_settings, expected_error_message", + [ + ( + [ + ("queue1", True), + ("queue2", False), + ], + "The Encryption parameter of the root volume of the queue queue2 is not consistent with the " + "value set for the queue queue1, and may cause a problem in case of Service Control Policies " + "(SCPs) enforcing encryption.", + ), + ( + [ + ("queue1", False), + ("queue2", True), + ], + "The Encryption parameter of the root volume of the queue queue2 is not consistent with the " + "value set for the queue queue1, and may cause a problem in case of Service Control Policies " + "(SCPs) enforcing encryption.", + ), + ([("queue1", True), ("queue2", True)], None), + ([("queue1", False), ("queue2", False)], None), + ], +) +def test_root_volume_encryption_consistency_validator( + encryption_settings, + expected_error_message, +): + actual_failures = RootVolumeEncryptionConsistencyValidator().execute(encryption_settings) + + if expected_error_message: + assert_failure_messages(actual_failures, [expected_error_message]) + assert_failure_level(actual_failures, FailureLevel.WARNING) + else: + assert_that(actual_failures).is_empty() + + +@pytest.mark.parametrize( + "num_cards, assign_public_ip, public_ip_subnets, expected_error_messages", + [ + pytest.param( + 1, + True, + [ + {"SubnetId": "subnet_1", "MapPublicIpOnLaunch": True}, + {"SubnetId": "subnet_2", "MapPublicIpOnLaunch": False}, + ], + None, + id="Test with single nic queue with assigned public ip and subnet with public ip", + ), + pytest.param( + 2, + False, + [ + {"SubnetId": "subnet_1", "MapPublicIpOnLaunch": False}, + {"SubnetId": "subnet_2", "MapPublicIpOnLaunch": False}, + ], + None, + id="Test with multi nic queue with neither assigned public ip nor subnet with public ip", + ), + pytest.param( + 2, + True, + [ + {"SubnetId": "subnet_1", "MapPublicIpOnLaunch": False}, + {"SubnetId": "subnet_2", "MapPublicIpOnLaunch": False}, + ], + [ + "The queue queue_1 contains an instance type with multiple network interfaces however the " + "AssignPublicIp value is set to true. AWS public IPs can only be assigned to instances " + "launched with a single network interface." + ], + id="Test with multi nic queue with assigned public ip and no subnet with public ip", + ), + pytest.param( + 2, + False, + [ + {"SubnetId": "subnet_1", "MapPublicIpOnLaunch": True}, + {"SubnetId": "subnet_2", "MapPublicIpOnLaunch": False}, + ], + [ + "The queue queue_1 contains an instance type with multiple network interfaces however the subnets " + "['subnet_1'] is configured to automatically assign public IPs. AWS public IPs can only be assigned " + "to instances launched with a single network interface." 
+ ], + id="Test with multi nic queue with no assigned public ip and subnet with public ip", + ), + pytest.param( + 2, + True, + [ + {"SubnetId": "subnet_1", "MapPublicIpOnLaunch": True}, + {"SubnetId": "subnet_2", "MapPublicIpOnLaunch": False}, + ], + [ + "The queue queue_1 contains an instance type with multiple network interfaces however the " + "AssignPublicIp value is set to true. AWS public IPs can only be assigned to instances " + "launched with a single network interface.", + "The queue queue_1 contains an instance type with multiple network interfaces however the subnets " + "['subnet_1'] is configured to automatically assign public IPs. AWS public IPs can only be assigned " + "to instances launched with a single network interface.", + ], + id="Test with multi nic queue with assigned public ip and subnet with public ip", + ), + ], +) +@pytest.mark.usefixtures("get_region") +def test_multi_network_interfaces_instances_validator( + aws_api_mock, num_cards, assign_public_ip, public_ip_subnets, expected_error_messages +): + aws_api_mock.ec2.get_instance_type_info.return_value = InstanceTypeInfo( + {"NetworkInfo": {"MaximumNetworkCards": num_cards}} + ) + aws_api_mock.ec2.describe_subnets.return_value = public_ip_subnets + + queues = [ + SlurmQueue( + name="queue_1", + compute_resources=[SlurmComputeResource(name="compute_resource_1", instance_type="instance_type")], + networking=SlurmQueueNetworking(subnet_ids=["subnet_1", "subnet_2"], assign_public_ip=assign_public_ip), + ), + ] + + actual_failures = MultiNetworkInterfacesInstancesValidator().execute(queues) + + if expected_error_messages: + assert_failure_messages(actual_failures, expected_error_messages) + assert_failure_level(actual_failures, FailureLevel.ERROR) + else: + assert_that(actual_failures).is_empty() diff --git a/cli/tests/pcluster/validators/test_directory_service_validators.py b/cli/tests/pcluster/validators/test_directory_service_validators.py index 3bdf83522b..a76339a684 100644 --- a/cli/tests/pcluster/validators/test_directory_service_validators.py +++ b/cli/tests/pcluster/validators/test_directory_service_validators.py @@ -9,6 +9,7 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. import pytest +from assertpy import fail from pcluster.aws.common import AWSClientError from pcluster.validators.common import FailureLevel @@ -115,44 +116,160 @@ def test_ldap_tls_reqcert_validator(ldap_tls_reqcert, expected_message): @pytest.mark.parametrize( - "password_secret_arn, error_from_secrets_manager, expected_message, expected_failure_level", + "password_secret_arn, region, aws_service, error_from_aws_service, expected_message, expected_failure_level", [ - ( + pytest.param( "arn:PARTITION:secretsmanager:REGION:ACCOUNT:secret:NOT_ACCESSIBLE_SECRET", + "WHATEVER-NOT-us-isob-east-1", + "secretsmanager", "ResourceNotFoundExceptionSecrets", "The secret arn:PARTITION:secretsmanager:REGION:ACCOUNT:secret:NOT_ACCESSIBLE_SECRET does not exist.", FailureLevel.ERROR, + id="PasswordSecretArn as a Secret in Secrets Manager that does not exist, " + "in regions other than us-isob-east-1", ), - ( + pytest.param( "arn:PARTITION:secretsmanager:REGION:ACCOUNT:secret:ANY_SECRET", + "WHATEVER-NOT-us-isob-east-1", + "secretsmanager", "AccessDeniedException", "Cannot validate secret arn:PARTITION:secretsmanager:REGION:ACCOUNT:secret:ANY_SECRET " "due to lack of permissions. 
Please refer to ParallelCluster official documentation for more information.", FailureLevel.WARNING, + id="PasswordSecretArn as a Secret in Secrets Manager that is not accessible due to lack of permissions, " + "in regions other than us-isob-east-1", ), - ( + pytest.param( "arn:PARTITION:secretsmanager:REGION:ACCOUNT:secret:NOT_ACCESSIBLE_SECRET", + "WHATEVER-NOT-us-isob-east-1", + "secretsmanager", "ANOTHER_ERROR", "Cannot validate secret arn:PARTITION:secretsmanager:REGION:ACCOUNT:secret:NOT_ACCESSIBLE_SECRET. " "Please refer to ParallelCluster official documentation for more information.", FailureLevel.WARNING, + id="PasswordSecretArn as a Secret in Secrets Manager that is not accessible due to unexpected exception, " + "in regions other than us-isob-east-1", ), - ( + pytest.param( + "arn:PARTITION:secretsmanager:REGION:ACCOUNT:UNSUPPORTED_RESOURCE", + "WHATEVER-NOT-us-isob-east-1", + "secretsmanager", + None, + "The secret arn:PARTITION:secretsmanager:REGION:ACCOUNT:UNSUPPORTED_RESOURCE is not supported " + "in region WHATEVER-NOT-us-isob-east-1.", + FailureLevel.ERROR, + id="PasswordSecretArn as an unsupported resource of Secrets Manager, in regions other than us-isob-east-1", + ), + pytest.param( "arn:PARTITION:secretsmanager:REGION:ACCOUNT:secret:ACCESSIBLE_SECRET", + "WHATEVER-NOT-us-isob-east-1", + "secretsmanager", None, None, None, + id="PasswordSecretArn as a Secret in Secrets Manager that is accessible, " + "in regions other than us-isob-east-1", + ), + pytest.param( + "arn:PARTITION:secretsmanager:REGION:ACCOUNT:secret:WHATEVER_SECRET", + "us-isob-east-1", + "secretsmanager", + None, + "The secret arn:PARTITION:secretsmanager:REGION:ACCOUNT:secret:WHATEVER_SECRET is not supported " + "in region us-isob-east-1.", + FailureLevel.ERROR, + id="PasswordSecretArn as a Secret in Secrets Manager, in us-isob-east-1", + ), + pytest.param( + "arn:PARTITION:ssm:REGION:ACCOUNT:parameter/NOT_ACCESSIBLE_SECRET", + "us-isob-east-1", + "ssm", + "ParameterNotFound", + "The secret arn:PARTITION:ssm:REGION:ACCOUNT:parameter/NOT_ACCESSIBLE_SECRET does not exist.", + FailureLevel.ERROR, + id="PasswordSecretArn as a Parameter in SSM that does not exist, in us-isob-east-1", + ), + pytest.param( + "arn:PARTITION:ssm:REGION:ACCOUNT:parameter/ANY_SECRET", + "us-isob-east-1", + "ssm", + "AccessDeniedException", + "Cannot validate secret arn:PARTITION:ssm:REGION:ACCOUNT:parameter/ANY_SECRET " + "due to lack of permissions. Please refer to ParallelCluster official documentation for more information.", + FailureLevel.WARNING, + id="PasswordSecretArn as a Parameter in SSM that is not accessible due to lack of permissions, " + "in us-isob-east-1", + ), + pytest.param( + "arn:PARTITION:ssm:REGION:ACCOUNT:parameter/NOT_ACCESSIBLE_SECRET", + "us-isob-east-1", + "ssm", + "ANOTHER_ERROR", + "Cannot validate secret arn:PARTITION:ssm:REGION:ACCOUNT:parameter/NOT_ACCESSIBLE_SECRET. 
" + "Please refer to ParallelCluster official documentation for more information.", + FailureLevel.WARNING, + id="PasswordSecretArn as a Parameter in SSM that is not accessible due to unexpected exception, " + "in us-isob-east-1", + ), + pytest.param( + "arn:PARTITION:ssm:REGION:ACCOUNT:parameter/ACCESSIBLE_SECRET", + "us-isob-east-1", + "ssm", + None, + None, + None, + id="PasswordSecretArn as a Parameter in SSM that is accessible, in us-isob-east-1", + ), + pytest.param( + "arn:PARTITION:ssm:REGION:ACCOUNT:UNSUPPORTED_RESOURCE", + "us-isob-east-1", + "ssm", + None, + "The secret arn:PARTITION:ssm:REGION:ACCOUNT:UNSUPPORTED_RESOURCE is not supported.", + FailureLevel.ERROR, + id="PasswordSecretArn as an unsupported resource of SSM, in us-isob-east-1", + ), + pytest.param( + "arn:PARTITION:UNSUPPORTED_SERVICE:REGION:ACCOUNT:RESOURCE", + "us-isob-east-1", + "UNSUPPORTED_SERVICE", + None, + "The secret arn:PARTITION:UNSUPPORTED_SERVICE:REGION:ACCOUNT:RESOURCE is not supported.", + FailureLevel.ERROR, + id="PasswordSecretArn as a resource of an unsupported service, in us-isob-east-1", + ), + pytest.param( + "arn:PARTITION:ssm:REGION:ACCOUNT:parameter/WHATEVER_SECRET", + "WHATEVER-NOT-us-isob-east-1", + "UNSUPPORTED_SERVICE", + None, + "The secret arn:PARTITION:ssm:REGION:ACCOUNT:parameter/WHATEVER_SECRET is not supported " + "in region WHATEVER-NOT-us-isob-east-1.", + FailureLevel.ERROR, + id="PasswordSecretArn as a Parameter in SSM, in regions other than us-isob-east-1", ), ], ) def test_password_secret_arn_validator( - password_secret_arn, error_from_secrets_manager, expected_message, expected_failure_level, aws_api_mock + password_secret_arn, + region, + aws_service, + error_from_aws_service, + expected_message, + expected_failure_level, + aws_api_mock, ): - if error_from_secrets_manager: - aws_api_mock.secretsmanager.describe_secret.side_effect = AWSClientError( - function_name="A_FUNCTION_NAME", error_code=error_from_secrets_manager, message="AN_ERROR_MESSAGE" + if error_from_aws_service: + if aws_service == "secretsmanager": + aws_api_mocked_call = aws_api_mock.secretsmanager.describe_secret + elif aws_service == "ssm": + aws_api_mocked_call = aws_api_mock.ssm.get_parameter + else: + fail(f"Unsupported aws_service: {aws_service}") + aws_api_mocked_call.side_effect = AWSClientError( + function_name="A_FUNCTION_NAME", error_code=str(error_from_aws_service), message="AN_ERROR_MESSAGE" ) - actual_failures = PasswordSecretArnValidator().execute(password_secret_arn=password_secret_arn) + actual_failures = PasswordSecretArnValidator().execute(password_secret_arn=password_secret_arn, region=region) assert_failure_messages(actual_failures, expected_message) assert_failure_level(actual_failures, expected_failure_level) diff --git a/cli/tests/pcluster/validators/test_feature_validators.py b/cli/tests/pcluster/validators/test_feature_validators.py index 41b01202ff..08400cb043 100644 --- a/cli/tests/pcluster/validators/test_feature_validators.py +++ b/cli/tests/pcluster/validators/test_feature_validators.py @@ -41,11 +41,13 @@ (Feature.FSX_OPENZFS, "us-iso-west-1", "FSx OpenZfs is not supported in region 'us-iso-west-1'"), (Feature.FSX_OPENZFS, "us-isob-east-1", "FSx OpenZfs is not supported in region 'us-isob-east-1'"), (Feature.FSX_OPENZFS, "us-isoWHATEVER", "FSx OpenZfs is not supported in region 'us-isoWHATEVER'"), + (Feature.SLURM_DATABASE, "us-isoWHATEVER", "SLURM Database is not supported in region 'us-isoWHATEVER'"), (Feature.BATCH, "WHATEVER-ELSE", None), (Feature.DCV, "WHATEVER-ELSE", 
None), (Feature.FSX_LUSTRE, "WHATEVER-ELSE", None), (Feature.FSX_ONTAP, "WHATEVER-ELSE", None), (Feature.FSX_OPENZFS, "WHATEVER-ELSE", None), + (Feature.SLURM_DATABASE, "WHATEVER-ELSE", None), ], ) def test_feature_region_validator(feature, region, expected_message): diff --git a/cli/tests/pcluster/validators/test_monitoring_validators.py b/cli/tests/pcluster/validators/test_monitoring_validators.py index 086032c47a..159b106245 100644 --- a/cli/tests/pcluster/validators/test_monitoring_validators.py +++ b/cli/tests/pcluster/validators/test_monitoring_validators.py @@ -11,7 +11,7 @@ import pytest from pcluster.config.cluster_config import CloudWatchLogs, LogRotation, Logs -from pcluster.validators.monitoring_validators import LogRotationValidator +from pcluster.validators.monitoring_validators import DetailedMonitoringValidator, LogRotationValidator from tests.pcluster.validators.utils import assert_failure_messages @@ -33,3 +33,21 @@ def test_compute_console_logging_validator(logs, expected_message): actual_failures = LogRotationValidator().execute(logs) assert_failure_messages(actual_failures, expected_message) + + +@pytest.mark.parametrize( + "is_detailed_monitoring_enabled, expected_message", + [ + (False, None), + ( + True, + "Detailed Monitoring is enabled for EC2 instances in your compute fleet. The Amazon EC2 console will " + "display monitoring graphs with a 1-minute period for these instances. Note that this will increase " + "the cost. If you want to avoid this and use basic monitoring instead, please set " + "`Monitoring / DetailedMonitoring` to false.", + ), + ], +) +def test_detailed_monitoring_validator(is_detailed_monitoring_enabled, expected_message): + actual_failures = DetailedMonitoringValidator().execute(is_detailed_monitoring_enabled) + assert_failure_messages(actual_failures, expected_message) diff --git a/cli/tests/pcluster/validators/test_s3_validators.py b/cli/tests/pcluster/validators/test_s3_validators.py index 4d5d63ece2..2b06d4561a 100644 --- a/cli/tests/pcluster/validators/test_s3_validators.py +++ b/cli/tests/pcluster/validators/test_s3_validators.py @@ -43,10 +43,10 @@ None, ), ( - "https://test/cookbook.tgz", + "https://test/cookbook1.tgz", None, False, - "The url 'https://test/cookbook.tgz' causes ConnectionError", + "The url 'https://test/cookbook1.tgz' causes ConnectionError", ConnectionError(), None, ), diff --git a/cli/tests/pcluster/validators/test_tags_validators.py b/cli/tests/pcluster/validators/test_tags_validators.py new file mode 100644 index 0000000000..2b5e1e165c --- /dev/null +++ b/cli/tests/pcluster/validators/test_tags_validators.py @@ -0,0 +1,145 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+import pytest + +from pcluster.config.cluster_config import Tag +from pcluster.validators.tags_validators import ComputeResourceTagsValidator +from tests.pcluster.validators.utils import assert_failure_messages + + +@pytest.mark.parametrize( + "cluster_tags, queue_tags, compute_resource_tags, expected_message", + [ + ( + [Tag("key1", "value1"), Tag("key2", "value2"), Tag("key3", "value3")], + [Tag("key1", "value1"), Tag("key2", "value2"), Tag("key3", "value3")], + None, + "The following Tag keys are defined in both under `Tags` and `SlurmQueue/Tags`: ['key1', 'key2', 'key3'] " + "and will be overridden by the value set in `SlurmQueue/Tags` for ComputeResource 'dummy_compute_resource' " + "in queue 'dummy_queue'.", + ), + ( + [Tag("key1", "value1"), Tag("key2", "value2"), Tag("key3", "value3")], + [Tag("key1", "value1"), Tag("key2", "value2"), Tag("key3", "value3")], + [Tag("key1", "value1"), Tag("key2", "value2"), Tag("key3", "value3")], + "The following Tag keys are defined under `Tags`, `SlurmQueue/Tags` and `SlurmQueue/ComputeResources/Tags`:" + " ['key1', 'key2', 'key3'] " + "and will be overridden by the value set in `SlurmQueue/ComputeResources/Tags` for ComputeResource " + "'dummy_compute_resource' " + "in queue 'dummy_queue'.", + ), + ( + [Tag("key1", "value1"), Tag("clusterkey2", "value2"), Tag("clusterkey3", "value3")], + [Tag("key1", "value1"), Tag("queuekey2", "value2"), Tag("queuekey3", "value3")], + [Tag("key1", "value1"), Tag("computekey2", "value2"), Tag("computekey3", "value3")], + "The following Tag keys are defined under `Tags`, `SlurmQueue/Tags` and `SlurmQueue/ComputeResources/Tags`:" + " ['key1'] " + "and will be overridden by the value set in `SlurmQueue/ComputeResources/Tags` for ComputeResource " + "'dummy_compute_resource' " + "in queue 'dummy_queue'.", + ), + ( + [Tag("key1", "value1"), Tag("key2", "value2"), Tag("key3", "value3")], + [Tag("key4", "value1"), Tag("key5", "value2"), Tag("key6", "value3")], + None, + None, + ), + ( + [Tag("key1", "value1"), Tag("key2", "value2")], + [Tag("key1", "value2"), Tag("key3", "value2")], + None, + "The following Tag keys are defined in both under `Tags` and `SlurmQueue/Tags`: ['key1'] and will be " + "overridden by the value set in `SlurmQueue/Tags` for ComputeResource 'dummy_compute_resource' in queue " + "'dummy_queue'.", + ), + ( + [Tag("key1", "value1"), Tag("key2", "value2"), Tag("key3", "value3")], + None, + None, + None, + ), + ( + None, + [Tag("key1", "value1"), Tag("key2", "value2"), Tag("key3", "value3")], + None, + None, + ), + ( + None, + None, + None, + None, + ), + ( + [Tag("key1", "value1")], + [Tag("key2", "value2")], + [Tag("key3", "value3")], + None, + ), + ( + None, + [Tag("key1", "value1"), Tag("key2", "value2")], + [Tag("key1", "value1"), Tag("key3", "value3")], + "The following Tag keys are defined in both under `SlurmQueue/Tags` and `SlurmQueue/ComputeResources/Tags`:" + " ['key1'] and will be overridden by the value set in `SlurmQueue/ComputeResources/Tags` for " + "ComputeResource 'dummy_compute_resource' in queue 'dummy_queue'.", + ), + ( + [Tag("key1", "value1"), Tag("key2", "value2")], + None, + [Tag("key1", "value1"), Tag("key3", "value3")], + "The following Tag keys are defined in both under `Tags` and `SlurmQueue/ComputeResources/Tags`:" + " ['key1'] and will be overridden by the value set in `SlurmQueue/ComputeResources/Tags` for " + "ComputeResource 'dummy_compute_resource' in queue 'dummy_queue'.", + ), + ( + [Tag(f"key{i}", f"value{i}") for i in range(0, 41)], + None, + None, + "The 
number of tags (41) associated with ComputeResource 'dummy_compute_resource' in queue 'dummy_queue' " + "has exceeded the limit of 40.", + ), + ( + [Tag(f"key{i}", f"value{i}") for i in range(0, 20)], + [Tag(f"key{i}", f"value{i}") for i in range(20, 41)], + None, + "The number of tags (41) associated with ComputeResource 'dummy_compute_resource' in queue 'dummy_queue' " + "has exceeded the limit of 40.", + ), + ( + [Tag(f"key{i}", f"value{i}") for i in range(0, 20)], + None, + [Tag(f"key{i}", f"value{i}") for i in range(20, 41)], + "The number of tags (41) associated with ComputeResource 'dummy_compute_resource' in queue 'dummy_queue' " + "has exceeded the limit of 40.", + ), + ( + [Tag(f"key{i}", f"value{i}") for i in range(0, 10)], + [Tag(f"key{i}", f"value{i}") for i in range(10, 20)], + [Tag(f"key{i}", f"value{i}") for i in range(20, 41)], + "The number of tags (41) associated with ComputeResource 'dummy_compute_resource' in queue 'dummy_queue' " + "has exceeded the limit of 40.", + ), + ( + [Tag(f"key{i}", f"value{i}") for i in range(0, 10)], + [Tag(f"key{i}", f"value{i}") for i in range(10, 40)], + [Tag("key0", "value0")], + "The following Tag keys are defined in both under `Tags` and `SlurmQueue/ComputeResources/Tags`: ['key0'] " + "and will be overridden by the value set in `SlurmQueue/ComputeResources/Tags` for ComputeResource " + "'dummy_compute_resource' in queue 'dummy_queue'.", + ), + ], +) +def test_compute_resource_tags_validator(cluster_tags, queue_tags, compute_resource_tags, expected_message): + actual_failures = ComputeResourceTagsValidator().execute( + "dummy_queue", "dummy_compute_resource", cluster_tags, queue_tags, compute_resource_tags + ) + assert_failure_messages(actual_failures, expected_message) diff --git a/cli/tox.ini b/cli/tox.ini index 3fb69a7861..c5f9ead79e 100644 --- a/cli/tox.ini +++ b/cli/tox.ini @@ -18,7 +18,6 @@ allowlist_externals = deps = -rtests/requirements.txt pytest-travis-fold - cov: codecov extras = awslambda commands = @@ -26,7 +25,6 @@ commands = nocov: pytest -n auto -l -v --basetemp={envtmpdir} --html=report.html --ignore=src tests/ cov: python setup.py clean --all build_ext --force --inplace cov: pytest -n auto -l -v --basetemp={envtmpdir} --html=report.html --cov=src --cov-report=xml --cov-append tests/ - cov: codecov -e TOXENV # Section used to define common variables used by multiple testenvs. [vars] diff --git a/cloudformation/ad/ad-integration.yaml b/cloudformation/ad/ad-integration.yaml index 9517f88eba..b3536d4348 100644 --- a/cloudformation/ad/ad-integration.yaml +++ b/cloudformation/ad/ad-integration.yaml @@ -1,3 +1,6 @@ + +Description: AWS ParallelCluster ActiveDirectory Database + Parameters: DomainName: Description: AD Domain Name. 
diff --git a/cloudformation/custom_resource/cluster-1-click.yaml b/cloudformation/custom_resource/cluster-1-click.yaml
new file mode 100644
index 0000000000..b22c3ba789
--- /dev/null
+++ b/cloudformation/custom_resource/cluster-1-click.yaml
@@ -0,0 +1,86 @@
+AWSTemplateFormatVersion: '2010-09-09'
+Description: AWS ParallelCluster CloudFormation Cluster
+
+Parameters:
+  KeyName:
+    Description: KeyPair to login to the head node
+    Type: AWS::EC2::KeyPair::KeyName
+    AllowedPattern: ".+"  # Required
+
+  AvailabilityZone:
+    Description: Availability zone where instances will be launched
+    Type: AWS::EC2::AvailabilityZone::Name
+    AllowedPattern: ".+"  # Required
+
+Mappings:
+  ParallelCluster:
+    Constants:
+      Version: 3.6.0
+      Bucket: ''  # For debug purposes only
+
+Conditions:
+  UseCustomBucket: !Not [!Equals [!FindInMap [ParallelCluster, Constants, Bucket], '']]
+
+Resources:
+  PclusterClusterProvider:
+    Type: AWS::CloudFormation::Stack
+    Properties:
+      Parameters:
+        CustomBucket: !If [ UseCustomBucket, !FindInMap [ParallelCluster, Constants, Bucket], !Ref AWS::NoValue ]
+      TemplateURL: !Sub
+        - https://${Bucket}.s3.${AWS::Region}.${AWS::URLSuffix}/parallelcluster/${Version}/templates/custom_resource/cluster.yaml
+        - { Version: !FindInMap [ParallelCluster, Constants, Version ],
+            Bucket: !If [ UseCustomBucket, !FindInMap [ParallelCluster, Constants, Bucket], !Sub "${AWS::Region}-aws-parallelcluster" ]
+          }
+
+  PclusterVpc:
+    Type: AWS::CloudFormation::Stack
+    Properties:
+      Parameters:
+        PublicCIDR: 10.0.0.0/24
+        PrivateCIDR: 10.0.16.0/20
+        AvailabilityZone: !Ref AvailabilityZone
+      TemplateURL: !Sub
+        - https://${Bucket}.s3.${AWS::Region}.${AWS::URLSuffix}/parallelcluster/${Version}/templates/networking/public-private-${Version}.cfn.json
+        - { Version: !FindInMap [ParallelCluster, Constants, Version],
+            Bucket: !If [ UseCustomBucket, !FindInMap [ParallelCluster, Constants, Bucket], !Sub "${AWS::Region}-aws-parallelcluster" ]
+          }
+
+  PclusterCluster:
+    Type: Custom::PclusterCluster
+    Properties:
+      ServiceToken: !GetAtt [ PclusterClusterProvider, Outputs.ServiceToken ]
+      ClusterName: !Sub 'c-${AWS::StackName}'
+      ClusterConfiguration:
+        DevSettings: !If
+          - UseCustomBucket
+          -
+            AmiSearchFilters:
+              Owner: self
+          - !Ref AWS::NoValue
+        Image:
+          Os: alinux2
+        HeadNode:
+          InstanceType: t2.medium
+          Networking:
+            SubnetId: !GetAtt [ PclusterVpc, Outputs.PublicSubnetId ]
+          Ssh:
+            KeyName: !Ref KeyName
+        Scheduling:
+          Scheduler: slurm
+          SlurmQueues:
+          - Name: queue0
+            ComputeResources:
+            - Name: queue0-cr0
+              InstanceType: t2.micro
+            Networking:
+              SubnetIds:
+              - !GetAtt [ PclusterVpc, Outputs.PrivateSubnetId ]
+
+Outputs:
+  HeadNodeIp:
+    Description: The Public IP address of the HeadNode
+    Value: !GetAtt [ PclusterCluster, headNode.publicIpAddress ]
+  ValidationMessages:
+    Description: Any warnings from cluster create or update operations.
+ Value: !GetAtt PclusterCluster.validationMessages diff --git a/cloudformation/custom_resource/cluster.yaml b/cloudformation/custom_resource/cluster.yaml new file mode 100644 index 0000000000..104237b1c2 --- /dev/null +++ b/cloudformation/custom_resource/cluster.yaml @@ -0,0 +1,306 @@ +AWSTemplateFormatVersion: '2010-09-09' +Transform: AWS::Serverless-2016-10-31 +Description: AWS ParallelCluster Cluster Custom Resource Provider + +Parameters: + + CustomLambdaRole: + Description: Custom role to use for PC Lambda + Type: String + Default: '' + + AdditionalIamPolicies: + Description: Comma-delimited list of additional IAM Policies to add to the cluster (only used if CustomLambdaRole isn't provided). + Type: CommaDelimitedList + Default: '' + + CustomBucket: + Description: (Debug only) bucket to retrieve S3 artifacts for internal resources. + Type: String + Default: '' + + +Mappings: + ParallelCluster: + Constants: + Version: 3.6.0 # major.minor.patch+alpha/beta_identifier + +Conditions: + CustomRoleCondition: !Not [!Equals [!Ref CustomLambdaRole, '']] + UsePCPolicies: !Not [!Condition CustomRoleCondition ] + UseAdditionalIamPolicies: !Not [!Equals [!Join ['', !Ref AdditionalIamPolicies ], '']] + UseCustomBucket: !Not [!Equals [!Ref CustomBucket, '']] + +Resources: + PclusterLayer: + Type: AWS::Lambda::LayerVersion + Properties: + LayerName: !Sub + - PCLayer-${StackIdSuffix} + - { StackIdSuffix: !Select [2, !Split ['/', !Ref 'AWS::StackId']] } + Description: Library which contains aws-parallelcluster python package and dependencies + Content: + S3Bucket: !If [ UseCustomBucket, !Ref CustomBucket, !Sub "${AWS::Region}-aws-parallelcluster" ] + S3Key: !Sub + - parallelcluster/${Version}/layers/aws-parallelcluster/lambda-layer.zip + - { Version: !FindInMap [ParallelCluster, Constants, Version] } + CompatibleRuntimes: + - python3.9 + + PclusterPolicies: + Condition: UsePCPolicies + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: !Sub + - https://${Bucket}.s3.${Region}.amazonaws.com/parallelcluster/${Version}/templates/policies/policies.yaml + - { Version: !FindInMap [ParallelCluster, Constants, Version ], + Bucket: !If [UseCustomBucket, !Ref CustomBucket, !Sub "${AWS::Region}-aws-parallelcluster" ], + Region: !Ref AWS::Region } + TimeoutInMinutes: 10 + Parameters: + EnableIamAdminAccess: true + + PclusterCfnFunctionLogGroup: + Type: AWS::Logs::LogGroup + DeletionPolicy: Retain + Properties: + RetentionInDays: 90 + LogGroupName: !Sub /aws/lambda/${PclusterCfnFunction} + + EventsPolicy: + Type: AWS::IAM::ManagedPolicy + Properties: + PolicyDocument: + Version: "2012-10-17" + Statement: + - Sid: EventsPolicy + Effect: Allow + Action: + - events:PutRule + - events:DeleteRule + - events:PutTargets + - events:RemoveTargets + Resource: !Sub arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:rule/* + + PclusterLambdaRole: + Condition: UsePCPolicies + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Effect: Allow + Action: sts:AssumeRole + Principal: + Service: lambda.amazonaws.com + ManagedPolicyArns: !Split + - "," + - !Sub + - ${LambdaExecutionPolicy},${ClusterPolicy},${DefaultAdminPolicy},${EventsPolicy}${AdditionalIamPolicies} + - { LambdaExecutionPolicy: !Sub "arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole", + ClusterPolicy: !GetAtt [ PclusterPolicies, Outputs.ParallelClusterClusterPolicy ], + DefaultAdminPolicy: !GetAtt [ PclusterPolicies, Outputs.DefaultParallelClusterIamAdminPolicy ], + EventsPolicy: !Ref 
EventsPolicy, + AdditionalIamPolicies: !If [UseAdditionalIamPolicies, !Sub [",${AdditionalPolicies}", AdditionalPolicies: !Join [',', !Ref AdditionalIamPolicies ]], ''] } + + PclusterCfnFunction: + Type: AWS::Lambda::Function + Properties: + Tags: + - Key: "parallelcluster:version" + Value: !FindInMap [ParallelCluster, Constants, Version] + - Key: "parallelcluster:custom_resource" + Value: "cluster" + FunctionName: !Sub + - pcluster-cfn-${StackIdSuffix} + - { StackIdSuffix: !Select [2, !Split ['/', !Ref 'AWS::StackId']] } + TracingConfig: + Mode: Active + MemorySize: 2048 + Timeout: 60 + Code: + ZipFile: | + import datetime + import json + import logging + import os + import random + import re + import string + import sys + logger = logging.getLogger() + logger.setLevel(logging.INFO) + + import pcluster.api.controllers.cluster_operations_controller + import pcluster.api.errors + import pcluster.utils + from pcluster.api import encoder + from pcluster.cli.exceptions import APIOperationException, ParameterException + from pcluster.api.errors import exception_message, NotFoundException + import pcluster.lib as pc + + crhelper_path = "/opt/python/pcluster/resources/custom_resources/custom_resources_code" + sys.path.insert(0, crhelper_path) + from crhelper import CfnResource + helper = CfnResource() + + def drop_keys(_dict, keys): + return {k: v for k, v in _dict.items() if k not in keys} + + def flatten(obj, ret={}, path=""): + """flatten a nested map using dot-notation for keys.""" + if isinstance(obj, list): # convert list to dictionary for flattening + return flatten({str(i): v for i, v in enumerate(obj)}, ret, path) + for k, v in obj.items(): + if isinstance(v, (dict, list)): # recurse on complex objects + flatten(v, ret, f"{path}{k}.") + else: # otherwise add with prefix + ret[path + str(k)] = v + return ret + + def update_response(data): + logger.info(data) + # Avoid limit on response object size, user has provided these, so drop them in the response + extra_keys = {"clusterConfiguration", "scheduler", "tags"} + # create / delete responses have cluster information nested in "cluster" key, + # flatten that portion while keeping other keys to propagate warnings. + if "cluster" in data: + helper.Data.update(flatten(drop_keys(data["cluster"], extra_keys))) + + validation_messages = json.dumps(data.get("validationMessages", [])) + validation_messages = "TRUNCATED:" + validation_messages[:2048] if len(validation_messages) > 2048 else validation_messages + helper.Data["validationMessages"] = validation_messages + else: # without "cluster" in the keys, this is a cluster object. 
+                helper.Data.update(flatten(drop_keys(data, extra_keys)))
+
+          def serialize(val):
+              return pcluster.utils.to_iso_timestr(val) if isinstance(val, datetime.date) else val
+
+          def create_or_update(event):
+              properties = event["ResourceProperties"]
+              request_type = event["RequestType"].upper()
+              helper.Data["validationMessages"] = "[]"  # default value
+
+              if properties.get("DeletionPolicy", "Delete") not in {"Retain", "Delete"}:
+                  raise ValueError("DeletionPolicy must be one of [\"Retain\", \"Delete\"].")
+              if request_type == "CREATE" and "ClusterName" not in properties:
+                  raise ValueError("Couldn't find a ClusterName in the properties.")
+              elif request_type == "UPDATE" and event["PhysicalResourceId"] != properties.get("ClusterName"):
+                  raise ValueError("Cannot update the ClusterName in the properties.")
+
+              cluster_name = properties["ClusterName"]
+              logger.info(f"{event['RequestType'].upper()}: {cluster_name}")
+              physical_resource_id = cluster_name
+
+              try:
+                  meta_keys = {"ServiceToken", "DeletionPolicy"}
+                  kwargs = {**{pcluster.utils.to_snake_case(k): serialize(v) for k, v in drop_keys(properties, meta_keys).items()}, "wait": False}
+                  func = {"CREATE": pc.create_cluster, "UPDATE": pc.update_cluster}[request_type]
+                  update_response(func(**kwargs))
+              except (APIOperationException, ParameterException, TypeError) as e:
+                  logger.info(str(e))
+                  raise ValueError(str(e))
+              except Exception as e:
+                  message = pcluster.api.errors.exception_message(e)
+                  # StatusReason is truncated, so skip changeset in output, still logged below
+                  block_list = {"change_set"}
+                  message_data = drop_keys(message.to_dict(), block_list)
+                  logger.info(message_data)
+
+                  # sort more critical errors last
+                  if "configuration_validation_errors" in message_data and message_data["configuration_validation_errors"]:
+                      order = {k: i for i, k in enumerate(["INFO", "WARNING", "ERROR"])}
+                      message_data["configuration_validation_errors"].sort(key=lambda e: order[e["level"]])
+
+                  str_msg = encoder.JSONEncoder().encode(message_data)
+                  if not re.search(r"No changes found", str_msg):
+                      logger.info(encoder.JSONEncoder().encode(message))
+                      raise ValueError(str_msg)
+                  logger.info(f"No changes found to update: {cluster_name}")
+
+              return physical_resource_id
+
+          @helper.create
+          def create(event, context):
+              return create_or_update(event)
+
+          @helper.update
+          def update(event, context):
+              return create_or_update(event)
+
+          @helper.delete
+          def delete(event, context):
+              properties = event["ResourceProperties"]
+              cluster_name = properties.get("ClusterName")
+
+              deletion_policy = properties.get("DeletionPolicy", "Delete")
+              if deletion_policy not in {"Retain", "Delete"}:
+                  raise ValueError("DeletionPolicy must be one of [\"Retain\", \"Delete\"].")
+              if deletion_policy == "Retain":
+                  return cluster_name
+
+              logger.info(f"Deleting: {cluster_name}")
+              try:
+                  update_response(pc.delete_cluster(cluster_name=cluster_name))
+              except (ParameterException, NotFoundException):  # cluster deleted or invalid name -- ignore here.
+ pass + except Exception as e: + message = exception_message(e) + raise ValueError(encoder.JSONEncoder().encode(message)) + + # Polling functionality for async CUD operations + + def poll(event): + log_group = os.getenv("AWS_LAMBDA_LOG_GROUP_NAME") + cluster_name = event["ResourceProperties"].get("ClusterName") + try: + cluster = pc.describe_cluster(cluster_name=cluster_name) + status = cluster.get("clusterStatus") + + if status in {"CREATE_COMPLETE", "UPDATE_COMPLETE"}: + update_response(cluster) + return cluster_name + elif status in {"CREATE_FAILED", "UPDATE_FAILED", "DELETE_FAILED"}: + reasons = ",".join(f["failureCode"] for f in cluster.get("failures", [])) + raise ValueError(f"{cluster_name}: {reasons} (LogGroup: {log_group})") + + # If create fails and we try to roll-back (e.g. delete), + # gracefully handle missing cluster. on the delete pathway, the + # only invalid parameter can be the name + except (ParameterException, NotFoundException): + if event["RequestType"].upper() == "DELETE": + # Returning a value here signifies that the delete is completed and we can stop polling + # not returning a value here causes cfn resource helper to keep polling. + return cluster_name + raise ValueError(f"{cluster_name} failed {event['RequestType'].upper()}. See LogGroup: {log_group}") + + @helper.poll_create + def poll_create(event, context): + return poll(event) + + @helper.poll_update + def poll_update(event, context): + return poll(event) + + @helper.poll_delete + def poll_delete(event, context): + return poll(event) + + def handler(event, context): + helper(event, context) + + Handler: index.handler + Runtime: python3.9 + Role: !If [CustomRoleCondition, !Ref CustomLambdaRole, !GetAtt PclusterLambdaRole.Arn] + Layers: + - !Ref PclusterLayer + +Outputs: + ServiceToken: + Description: Lambda for managing PCluster Resources + Value: !GetAtt PclusterCfnFunction.Arn + LogGroupArn: + Description: ARN of LogGroup for Lambda logging + Value: !GetAtt PclusterCfnFunctionLogGroup.Arn + LambaLayerArn: + Description: ARN for the ParallelCluster Lambda Layer + Value: !Ref PclusterLayer diff --git a/cloudformation/database/serverless-database.yaml b/cloudformation/database/serverless-database.yaml index dcbea96bf5..7e8b08eb38 100644 --- a/cloudformation/database/serverless-database.yaml +++ b/cloudformation/database/serverless-database.yaml @@ -1,6 +1,5 @@ AWSTemplateFormatVersion: 2010-09-09 -Description: >- - This template is provided as part of a tutorial on how to enable Slurm Accounting using Parallel Cluster. 
+Description: AWS ParallelCluster Slurm Accounting Database Metadata: AWS::CloudFormation::Interface: ParameterGroups: diff --git a/cloudformation/networking/public-private.cfn.json b/cloudformation/networking/public-private.cfn.json index c1015e4a95..308e234315 100644 --- a/cloudformation/networking/public-private.cfn.json +++ b/cloudformation/networking/public-private.cfn.json @@ -30,7 +30,7 @@ ] } }, - "Description": "Public/Private Network for AWS ParallelCluster", + "Description": "AWS ParallelCluster Public/Private Network", "Outputs": { "VpcId": { "Value": { diff --git a/cloudformation/networking/public.cfn.json b/cloudformation/networking/public.cfn.json index fd1f60e863..1813596979 100644 --- a/cloudformation/networking/public.cfn.json +++ b/cloudformation/networking/public.cfn.json @@ -30,7 +30,7 @@ ] } }, - "Description": "Public Network for AWS ParallelCluster", + "Description": "AWS ParallelCluster Public Network", "Outputs": { "VpcId": { "Value": { diff --git a/cloudformation/policies/parallelcluster-policies.yaml b/cloudformation/policies/parallelcluster-policies.yaml index 0f5cc6bb44..e71641a250 100644 --- a/cloudformation/policies/parallelcluster-policies.yaml +++ b/cloudformation/policies/parallelcluster-policies.yaml @@ -1,5 +1,5 @@ AWSTemplateFormatVersion: '2010-09-09' -Description: 'Template for the ParallelCluster Policies' +Description: 'AWS ParallelCluster Policies' Parameters: Region: @@ -394,16 +394,37 @@ Resources: Effect: Allow Sid: Route53HostedZones - Action: - - cloudformation:* - Resource: '*' + - cloudformation:CreateStack + Resource: !Sub + - arn:*:cloudformation:${RequestedRegion}:${AWS::AccountId}:stack/* + - RequestedRegion: !If [IsMultiRegion, '*', !Ref Region] Effect: Allow - Condition: !If - - IsMultiRegion - - !Ref AWS::NoValue - - StringEquals: - aws:RequestedRegion: - - !Ref Region - Sid: CloudFormation + Condition: + ForAnyValue:StringEquals: + aws:TagKeys: ["parallelcluster:cluster-name"] + Sid: CloudFormationCreate + - Action: + - cloudformation:UpdateStack + Resource: !Sub + - arn:*:cloudformation:${RequestedRegion}:${AWS::AccountId}:stack/* + - RequestedRegion: !If [IsMultiRegion, '*', !Ref Region] + Effect: Allow + Condition: + ForAnyValue:StringLike: + aws:ResourceTag/parallelcluster:cluster-name: "*" + Sid: CloudFormationUpdate + - Action: + - cloudformation:DeleteStack + - cloudformation:DescribeStacks + - cloudformation:DescribeStackEvents + - cloudformation:DescribeStackResources + - cloudformation:GetTemplate + - cloudformation:ListStacks + Resource: !Sub + - arn:*:cloudformation:${RequestedRegion}:${AWS::AccountId}:stack/* + - RequestedRegion: !If [IsMultiRegion, '*', !Ref Region] + Effect: Allow + Sid: CloudFormationReadAndDelete - Action: - cloudwatch:PutDashboard - cloudwatch:ListDashboards @@ -527,6 +548,9 @@ Resources: - logs:CreateLogGroup - logs:TagResource - logs:UntagResource + - logs:DescribeMetricFilters + - logs:PutMetricFilter + - logs:deleteMetricFilter Resource: '*' Effect: Allow Condition: !If diff --git a/cloudformation/tests/conftest.py b/cloudformation/tests/conftest.py index f8da54b492..d778017031 100644 --- a/cloudformation/tests/conftest.py +++ b/cloudformation/tests/conftest.py @@ -1,10 +1,55 @@ """Additional pytest configuration.""" +import os import random import string import boto3 import pytest +PUBPRIV_TEMPLATE = "../networking/public-private.cfn.json" + + +def pytest_addoption(parser): + """Add the pytest parameter options with defaults from environment.""" + for arg in ["bucket", "private_subnet_id", 
"public_subnet_id", "service_token"]: + parser.addoption(f"--{arg}", action="store", default=os.environ.get(arg.upper(), "")) + + +def random_str(): + """Generate a random string.""" + alnum = string.ascii_uppercase + string.ascii_lowercase + string.digits + start = random.choice(string.ascii_uppercase + string.ascii_lowercase) + return start + "".join(random.choice(alnum) for _ in range(8)) + + +def cfn_stack_generator(path, name, parameters=None, capabilities=None): + """Create a stack, wait for completion and yield it.""" + cfn = boto3.client("cloudformation") + with open(path, encoding="utf-8") as templ: + template = templ.read() + + parameters = parameters or {} + + # Create networking using CloudFormation, block on completion + cfn.create_stack( + StackName=name, + TemplateBody=template, + Capabilities=capabilities or ["CAPABILITY_IAM"], + Parameters=[{"ParameterKey": k, "ParameterValue": v} for k, v in parameters.items()], + ) + cfn.get_waiter("stack_create_complete").wait(StackName=name) + + try: + outputs = cfn.describe_stacks(StackName=name)["Stacks"][0]["Outputs"] + yield {o["OutputKey"]: o["OutputValue"] for o in outputs} + + # Delete the stack through CFN and wait for delete to complete + cfn.delete_stack(StackName=name) + cfn.get_waiter("stack_delete_complete").wait(StackName=name) + except Exception as exc: + cfn.delete_stack(StackName=name) + raise exc + def pytest_collection_modifyitems(items, config): """Augment the tests to add unmarked marker to tests that aren't marked.""" @@ -16,8 +61,31 @@ def pytest_collection_modifyitems(items, config): @pytest.fixture(name="random_stack_name") def random_stack_name_fixture(): """Provide a short random id that can be used in a aack name.""" - alnum = string.ascii_uppercase + string.ascii_lowercase + string.digits - return "".join(random.choice(alnum) for _ in range(8)) + return random_str() + + +@pytest.fixture(scope="session") +def service_token(pytestconfig): + """Bucket returned from pytest arguments for retrieving artifacts.""" + return pytestconfig.getoption("service_token") + + +@pytest.fixture(scope="session", name="bucket") +def bucket_fixture(pytestconfig): + """Bucket returned from pytest arguments for retrieving artifacts.""" + return pytestconfig.getoption("bucket") + + +@pytest.fixture(scope="session", name="private_subnet_id") +def private_subnet_id_fixture(pytestconfig): + """public_subnet_id returned from pytest arguments for HeadNode.""" + return pytestconfig.getoption("private_subnet_id") + + +@pytest.fixture(scope="session", name="public_subnet_id") +def public_subnet_id_fixture(pytestconfig): + """private_subnet_id returned from pytest argumenets for Compute Nodes.""" + return pytestconfig.getoption("public_subnet_id") @pytest.fixture(scope="session", name="cfn") @@ -25,3 +93,18 @@ def cfn_fixture(): """Create a CloudFormation Boto3 client.""" client = boto3.client("cloudformation") return client + + +@pytest.fixture(scope="module", name="default_vpc") +def default_vpc_fixture(private_subnet_id, public_subnet_id): + """Create our default VPC networking and return the stack name.""" + if private_subnet_id != "" and public_subnet_id != "": + yield {"PublicSubnetId": public_subnet_id, "PrivateSubnetId": private_subnet_id} + return + + ec2 = boto3.client("ec2") + azs = ec2.describe_availability_zones()["AvailabilityZones"] + stack_name = random_str() + parameters = {"AvailabilityZone": azs[0]["ZoneName"]} + + yield from cfn_stack_generator(PUBPRIV_TEMPLATE, stack_name, parameters) diff --git 
a/cloudformation/tests/test_policies.py b/cloudformation/tests/test_policies.py index 59d12bb8c8..d1ef094e3c 100644 --- a/cloudformation/tests/test_policies.py +++ b/cloudformation/tests/test_policies.py @@ -3,7 +3,6 @@ import botocore import pytest from assertpy import assert_that -from cfn_flip import load_yaml TEMPLATE = "../policies/parallelcluster-policies.yaml" @@ -49,54 +48,3 @@ def test_policies(cfn, random_stack_name, parameters): cfn.get_waiter("stack_delete_complete").wait(StackName=stack_name) status = cfn.describe_stacks(StackName=stack_id)["Stacks"][0]["StackStatus"] assert_that(status).is_equal_to("DELETE_COMPLETE") - - -def test_match_api(): - """Validate minimal changes with the API's yaml.""" - source_path = "../../api/infrastructure/parallelcluster-api.yaml" - policies_path = "../policies/parallelcluster-policies.yaml" - - with open(source_path, encoding="utf-8") as source_file: - source = load_yaml(source_file.read()) - - with open(policies_path, encoding="utf-8") as policies_file: - policies = load_yaml(policies_file.read()) - - # These params match except for the description - for key in ["Region", "EnableFSxS3Access", "FsxS3Buckets", "PermissionsBoundaryPolicy"]: - drop_keys = {"Description"} - dest_dict = {k: v for k, v in policies["Parameters"][key].items() if k not in drop_keys} - source_dict = {k: v for k, v in source["Parameters"][key].items() if k not in drop_keys} - assert_that(dest_dict).is_equal_to(source_dict) - - for key in policies["Resources"].keys(): - drop_keys = {"Condition"} - - source_key = { - "ParallelClusterFSxS3AccessPolicy": "FSxS3AccessPolicy", - "ParallelClusterLambdaRole": "ParallelClusterUserRole", - }.get(key, key) - - source_dict = {k: v for k, v in source["Resources"][source_key].items() if k not in drop_keys} - dest_dict = {k: v for k, v in policies["Resources"][key].items() if k not in drop_keys} - - if key == "ParallelClusterLambdaRole": - - def remove_batch_if(arn): - return arn if ("Ref" in arn or "Fn::Sub" in arn) else arn["Fn::If"][1] - - dest_dict["Properties"]["ManagedPolicyArns"] = list( - map(remove_batch_if, dest_dict["Properties"]["ManagedPolicyArns"]) - ) - - # Rename UserRole to LambdaRole, ignore policy name mismatch - if key == "ParallelClusterFSxS3AccessPolicy": - source_dict["Properties"]["Roles"][0]["Ref"] = "ParallelClusterLambdaRole" - del source_dict["Properties"]["PolicyName"] - del dest_dict["Properties"]["PolicyName"] - - # Rename UserRole to LambdaRole - if key == "DefaultParallelClusterIamAdminPolicy": - source_dict["Properties"]["Roles"][0]["Ref"] = "ParallelClusterLambdaRole" - - assert_that(dest_dict).is_equal_to(source_dict) diff --git a/tests/iam_policies/cluster-roles.cfn.yaml b/tests/iam_policies/cluster-roles.cfn.yaml index cb80ed3b2e..a460396ece 100644 --- a/tests/iam_policies/cluster-roles.cfn.yaml +++ b/tests/iam_policies/cluster-roles.cfn.yaml @@ -79,6 +79,7 @@ Resources: - dynamodb:GetItem - dynamodb:UpdateItem - dynamodb:BatchWriteItem + - dynamodb:BatchGetItem Resource: !Sub arn:${AWS::Partition}:dynamodb:${AWS::Region}:${AWS::AccountId}:table/parallelcluster-* Effect: Allow - Action: ec2:TerminateInstances @@ -117,6 +118,11 @@ Resources: Resource: - !Sub arn:${AWS::Partition}:ec2:${AWS::Region}:${AWS::AccountId}:instance/* - !Sub arn:${AWS::Partition}:ec2:${AWS::Region}:${AWS::AccountId}:volume/* + - Action: + - ec2:GetConsoleOutput + Effect: Allow + Resource: + - !Sub arn:${AWS::Partition}:ec2:${AWS::Region}:${AWS::AccountId}:instance/* - Action: - cloudformation:DescribeStackResource - 
cloudformation:SignalResource diff --git a/tests/iam_policies/user-role.cfn.yaml b/tests/iam_policies/user-role.cfn.yaml index fca3e21e36..9b473bf386 100644 --- a/tests/iam_policies/user-role.cfn.yaml +++ b/tests/iam_policies/user-role.cfn.yaml @@ -486,6 +486,9 @@ Resources: - logs:CreateLogGroup - logs:TagResource - logs:UntagResource + - logs:DescribeMetricFilters + - logs:PutMetricFilter + - logs:deleteMetricFilter Resource: '*' Effect: Allow Condition: !If @@ -501,6 +504,14 @@ Resources: Resource: '*' Effect: Allow Sid: ResourceGroupRead + - Action: "secretsmanager:GetSecretValue" + Resource: !Sub arn:${AWS::Partition}:secretsmanager:${Region}:${AWS::AccountId}:secret:* + Effect: Allow + Sid: DirectoryServicePasswordReadFromSecretsManager + - Action: "ssm:GetParameter" + Resource: !Sub arn:${AWS::Partition}:ssm:${Region}:${AWS::AccountId}:parameter/* + Effect: Allow + Sid: DirectoryServicePasswordReadFromSsm ### IMAGE ACTIONS POLICIES @@ -1044,6 +1055,12 @@ Resources: - cloudwatch:DeleteAlarms - cloudwatch:DescribeAlarms Resource: "*" +# - Effect: Allow # TODO: Refactor it, comment it out now to workaround exceeds quota for PolicySize: 6144 +# Action: +# - logs:DescribeMetricFilters +# - logs:PutMetricFilter +# - logs:deleteMetricFilter +# Resource: "*" Outputs: ParallelClusterUserRole: diff --git a/tests/integration-tests/.gitignore b/tests/integration-tests/.gitignore new file mode 100644 index 0000000000..b6645c4f17 --- /dev/null +++ b/tests/integration-tests/.gitignore @@ -0,0 +1 @@ +/my-test-runner*.sh diff --git a/tests/integration-tests/README.md b/tests/integration-tests/README.md index 3a03d9ab6c..c34a5f2464 100644 --- a/tests/integration-tests/README.md +++ b/tests/integration-tests/README.md @@ -42,7 +42,7 @@ usage: test_runner.py [-h] --key-name KEY_NAME --key-path KEY_PATH [-n PARALLELI [--custom-cookbook-url CUSTOM_COOKBOOK_URL] [--createami-custom-cookbook-url CREATEAMI_CUSTOM_COOKBOOK_URL] [--createami-custom-node-url CREATEAMI_CUSTOM_NODE_URL] [--custom-awsbatchcli-url CUSTOM_AWSBATCHCLI_URL] [--pre-install PRE_INSTALL] [--post-install POST_INSTALL] [--instance-types-data INSTANCE_TYPES_DATA] [--custom-ami CUSTOM_AMI] [--pcluster-git-ref PCLUSTER_GIT_REF] [--cookbook-git-ref COOKBOOK_GIT_REF] [--node-git-ref NODE_GIT_REF] [--ami-owner AMI_OWNER] [--benchmarks] [--benchmarks-target-capacity BENCHMARKS_TARGET_CAPACITY] [--benchmarks-max-time BENCHMARKS_MAX_TIME] - [--api-definition-s3-uri API_DEFINITION_S3_URI] [--api-infrastructure-s3-uri API_INFRASTRUCTURE_S3_URI] [--public-ecr-image-uri PUBLIC_ECR_IMAGE_URI] [--api-uri API_URI] [--vpc-stack VPC_STACK] [--cluster CLUSTER] + [--api-definition-s3-uri API_DEFINITION_S3_URI] [--api-infrastructure-s3-uri API_INFRASTRUCTURE_S3_URI] [--api-uri API_URI] [--policies-uri POLICIES_URI] [--vpc-stack VPC_STACK] [--cluster CLUSTER] [--lambda-layer-source LAMBDA_LAYER_SOURCE] [--no-delete] [--delete-logs-on-success] [--stackname-suffix STACKNAME_SUFFIX] [--dry-run] [--directory-stack-name DIRECTORY_STACK_NAME] [--ldaps-nlb-stack-name LDAPS_NLB_STACK_NAME] [--external-shared-storage-stack-name SHARED_STORAGE_STACK_NAME] Run integration tests suite. @@ -126,14 +126,22 @@ AMI selection parameters: Benchmarks: --benchmarks Run benchmarks tests. Benchmarks tests will be run together with functionality tests. 
(default: False)
 
+CloudFormation / Custom Resource options:
+  --cluster-custom-resource-service-token CLUSTER_CUSTOM_RESOURCE_SERVICE_TOKEN
+                        ServiceToken (ARN) Cluster CloudFormation custom resource provider (default: None)
+  --resource-bucket RESOURCE_BUCKET
+                        Name of bucket to use to retrieve standard hosted resources like CloudFormation templates. (default: None)
+  --lambda-layer-source LAMBDA_LAYER_SOURCE
+                        S3 URI of lambda layer to copy instead of building. (default: None)
+
 API options:
   --api-definition-s3-uri API_DEFINITION_S3_URI
-                        URI of the Docker image for the Lambda of the ParallelCluster API (default: None)
+                        URI of the OpenAPI spec of the ParallelCluster API (default: None)
   --api-infrastructure-s3-uri API_INFRASTRUCTURE_S3_URI
                         URI of the CloudFormation template for the ParallelCluster API (default: None)
-  --public-ecr-image-uri PUBLIC_ECR_IMAGE_URI
-                        S3 URI of the ParallelCluster API spec (default: None)
   --api-uri API_URI     URI of an existing ParallelCluster API (default: None)
+  --policies-uri POLICIES_URI
+                        Use an existing policies URI instead of uploading one. (default: None)
 
 Debugging/Development options:
   --vpc-stack VPC_STACK
diff --git a/tests/integration-tests/benchmarks/common/metrics_reporter.py b/tests/integration-tests/benchmarks/common/metrics_reporter.py
index b725a4dc31..992ec0981e 100644
--- a/tests/integration-tests/benchmarks/common/metrics_reporter.py
+++ b/tests/integration-tests/benchmarks/common/metrics_reporter.py
@@ -17,7 +17,7 @@
 import boto3
 from retrying import RetryError, retry
 from time_utils import seconds
-from utils import _describe_cluster_instances
+from utils import describe_cluster_instances
 
 METRIC_WIDGET_TEMPLATE = """
 {{
@@ -93,7 +93,7 @@ def _watch_compute_nodes_allocation():
             Namespace="ParallelCluster/benchmarking/{cluster_name}".format(cluster_name=cluster_name),
             MetricData=[{"MetricName": "ComputeNodesCount", "Value": compute_nodes, "Unit": "Count"}],
         )
-        ec2_instances_count = len(_describe_cluster_instances(cluster_name, region, filter_by_node_type="Compute"))
+        ec2_instances_count = len(describe_cluster_instances(cluster_name, region, filter_by_node_type="Compute"))
         logging.info("Publishing EC2 compute metric: count={0}".format(ec2_instances_count))
         cw_client.put_metric_data(
             Namespace="ParallelCluster/benchmarking/{cluster_name}".format(cluster_name=cluster_name),
diff --git a/tests/integration-tests/cfn_stacks_factory.py b/tests/integration-tests/cfn_stacks_factory.py
index 31297e9a67..98a32adbde 100644
--- a/tests/integration-tests/cfn_stacks_factory.py
+++ b/tests/integration-tests/cfn_stacks_factory.py
@@ -226,6 +226,42 @@ def delete_stack(self, name, region):
                 "Couldn't find stack with name {0} in region {1}. 
Skipping deletion.".format(name, region) ) + @retry( + stop_max_attempt_number=10, + wait_fixed=5000, + retry_on_exception=lambda exception: isinstance(exception, ClientError), + ) + def update_stack(self, name, region, parameters, stack_is_under_test=False): + """Update a created cfn stack.""" + with aws_credential_provider(region, self.__credentials): + internal_id = self.__get_stack_internal_id(name, region) + if internal_id in self.__created_stacks: + logging.info("Updating stack {0} in region {1}".format(name, region)) + try: + stack = self.__created_stacks[internal_id] + cfn_client = boto3.client("cloudformation", region_name=stack.region) + cfn_client.update_stack(StackName=stack.name, UsePreviousTemplate=True, Parameters=parameters) + final_status = self.__wait_for_stack_update(stack.cfn_stack_id, cfn_client) + self.__assert_stack_status( + final_status, + {"UPDATE_COMPLETE", "UPDATE_COMPLETE_CLEANUP_IN_PROGRESS"}, + stack_name=stack.cfn_stack_id, + region=region, + stack_is_under_test=stack_is_under_test, + ) + # Update the stack data while still in the credential context + stack.init_stack_data() + except Exception as e: + logging.error( + "Update of stack {0} in region {1} failed with exception: {2}".format(name, region, e) + ) + raise + logging.info("Stack {0} updated successfully in region {1}".format(name, region)) + else: + logging.warning( + "Couldn't find stack with name {0} in region {1}. Skipping update.".format(name, region) + ) + def delete_all_stacks(self): """Destroy all created stacks.""" logging.debug("Destroying all cfn stacks") @@ -255,13 +291,22 @@ def __wait_for_stack_creation(self, name, cfn_client): def __wait_for_stack_deletion(self, name, cfn_client): return self.__get_stack_status(name, cfn_client) + @retry( + retry_on_result=lambda result: result == "UPDATE_IN_PROGRESS", + wait_fixed=5000, + retry_on_exception=lambda exception: isinstance(exception, ClientError) and "Rate exceeded" in str(exception), + ) + def __wait_for_stack_update(self, name, cfn_client): + return self.__get_stack_status(name, cfn_client) + @staticmethod def __get_stack_status(name, cfn_client): return cfn_client.describe_stacks(StackName=name).get("Stacks")[0].get("StackStatus") @staticmethod def __assert_stack_status(status, expected_status, region, stack_name=None, stack_is_under_test=False): - if status != expected_status: + expected_status = {expected_status} if not isinstance(expected_status, set) else expected_status + if status not in expected_status: message = ( f"Stack status {status} for {stack_name} differs " f"from the expected status of {expected_status} in region {region}" diff --git a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py index abafe5a6de..3c8d52fe08 100644 --- a/tests/integration-tests/clusters_factory.py +++ b/tests/integration-tests/clusters_factory.py @@ -84,6 +84,8 @@ def update(self, config_file, raise_on_error=True, log_error=True, **kwargs): # update the cluster logging.info("Updating cluster %s with config %s", self.name, config_file) command = ["pcluster", "update-cluster", "--cluster-configuration", config_file, "--cluster-name", self.name] + # This changes the default behavior of the update-cluster command and makes it wait for the cluster update to + # finish before returning. 
if kwargs.pop("wait", True): command.append("--wait") for k, val in kwargs.items(): @@ -479,6 +481,8 @@ def _build_command(cluster, kwargs): cluster.name, ] + # This changes the default behavior of the create-cluster command and makes it wait for the cluster creation to + # finish before returning. wait = kwargs.pop("wait", True) if wait: command.append("--wait") @@ -505,8 +509,8 @@ def destroy_cluster(self, name, test_passed): logging.info("Destroying cluster {0}".format(name)) if name in self.__created_clusters: delete_logs = test_passed and self._delete_logs_on_success and self.__created_clusters[name].create_complete + cluster = self.__created_clusters[name] try: - cluster = self.__created_clusters[name] cluster.delete(delete_logs=delete_logs) except Exception as e: logging.error( diff --git a/tests/integration-tests/configs/.gitignore b/tests/integration-tests/configs/.gitignore new file mode 100644 index 0000000000..76f801c0b4 --- /dev/null +++ b/tests/integration-tests/configs/.gitignore @@ -0,0 +1 @@ +/my-test*.yaml diff --git a/tests/integration-tests/configs/common.jinja2 b/tests/integration-tests/configs/common.jinja2 index 4c1b71619c..923619b216 100644 --- a/tests/integration-tests/configs/common.jinja2 +++ b/tests/integration-tests/configs/common.jinja2 @@ -25,11 +25,3 @@ "{{ instance_key }}" {%- endif -%} {%- endmacro -%} - -{%- set OSS_COMMERCIAL_X86_NO_RHEL8 = ["alinux2", "centos7", "ubuntu1804", "ubuntu2004"] -%} -{%- set OSS_COMMERCIAL_ARM_NO_RHEL8 = ["alinux2", "ubuntu1804", "ubuntu2004"] -%} -{%- set OSS_CHINA_X86_NO_RHEL8 = ["alinux2", "ubuntu1804", "ubuntu2004"] -%} -{%- set OSS_CHINA_ARM_NO_RHEL8 = ["alinux2", "ubuntu1804", "ubuntu2004"] -%} -{%- set OSS_GOVCLOUD_X86_NO_RHEL8 = ["alinux2", "ubuntu1804", "ubuntu2004"] -%} -{%- set OSS_GOVCLOUD_ARM_NO_RHEL8 = ["alinux2", "ubuntu1804", "ubuntu2004"] -%} -{%- set OSS_ONE_PER_DISTRO_NO_RHEL8 = ["centos7", "alinux2", "ubuntu2004"] -%} diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index 0f50eeb668..aa76b45da8 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -3,17 +3,13 @@ ad_integration: dimensions: - regions: [ "ap-southeast-1" ] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] + oss: ["alinux2", "rhel8"] schedulers: ["slurm"] benchmarks: - mpi_variants: ["openmpi"] - num_instances: [100] + num_instances: [5] osu_benchmarks: - collective: ["osu_allreduce", "osu_alltoall"] - - regions: ["ap-southeast-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["ubuntu1804"] - schedulers: ["slurm"] + collective: ["osu_alltoall"] - regions: ["eu-west-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["ubuntu2004", "centos7"] @@ -26,25 +22,12 @@ arm_pl: instances: {{ common.INSTANCES_DEFAULT_ARM }} oss: {{ common.OSS_COMMERCIAL_ARM }} schedulers: ["slurm"] -{%- if SCHEDULER_PLUGIN_TESTS is not defined or SCHEDULER_PLUGIN_TESTS == true %} -scheduler_plugin: - test_scheduler_plugin.py::test_scheduler_plugin_integration: - dimensions: - - regions: ["ca-central-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["plugin"] - - regions: ["ap-northeast-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM }} - schedulers: ["plugin"] -{%- endif %} cfn-init: test_cfn_init.py::test_replace_compute_on_failure: dimensions: - regions: ["af-south-1"] instances: {{ 
common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_ONE_PER_DISTRO_NO_RHEL8 }} + oss: ["rhel8"] schedulers: ["slurm"] test_cfn_init.py::test_install_args_quotes: dimensions: @@ -62,7 +45,7 @@ cli_commands: cloudwatch_logging: test_cloudwatch_logging.py::test_cloudwatch_logging: dimensions: - # 1) run the test for all of the schedulers with alinux2 + # 1) run the test for all the schedulers with alinux2 - regions: ["cn-northwest-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] @@ -71,15 +54,10 @@ cloudwatch_logging: instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] schedulers: ["awsbatch"] - # 2) run the test for all x86 OSes with slurm - - regions: ["ap-east-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86_NO_RHEL8 }} - schedulers: ["slurm"] - # 3) run the test for all ARM OSes on an ARM instance + # 2) run the test for all OSes with slurm - regions: ["ap-east-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM_NO_RHEL8 }} + oss: ["ubuntu2004", "rhel8", "centos7"] schedulers: ["slurm"] test_compute_console_output_logging.py::test_console_output_with_monitoring_disabled: dimensions: @@ -90,19 +68,15 @@ cloudwatch_logging: test_compute_console_output_logging.py::test_custom_action_error: dimensions: - regions: ["ap-east-1"] - oss: {{ common.OSS_COMMERCIAL_X86_NO_RHEL8 }} + oss: ["rhel8"] instances: {{ common.INSTANCES_DEFAULT_X86 }} schedulers: ["slurm"] - - regions: ["us-west-2"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM_NO_RHEL8 }} - schedulers: ["slurm"] configure: test_pcluster_configure.py::test_pcluster_configure: dimensions: - regions: ["af-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_ONE_PER_DISTRO }} + oss: ["ubuntu2004"] schedulers: ["slurm"] - regions: ["ap-southeast-2"] instances: {{ common.INSTANCES_DEFAULT_ARM }} @@ -135,7 +109,7 @@ configure: dimensions: - regions: ["us-east-1"] instances: {{ common.INSTANCES_EFA_UNSUPPORTED_X86 }} - oss: ["alinux2"] + oss: ["rhel8"] schedulers: ["slurm"] create: @@ -167,14 +141,14 @@ create: dimensions: - regions: ["ap-northeast-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - schedulers: ["slurm", "awsbatch"] + schedulers: ["slurm"] oss: ["alinux2"] test_create.py::test_cluster_creation_with_invalid_ebs: dimensions: - regions: ["ap-south-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} schedulers: ["slurm"] - oss: ["ubuntu1804", "ubuntu2004"] + oss: ["ubuntu2004"] createami: test_createami.py::test_invalid_config: dimensions: @@ -185,18 +159,20 @@ createami: dimensions: - regions: ["eu-west-3"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_ONE_PER_DISTRO_NO_RHEL8 }} + oss: ["ubuntu2004", "alinux2", "ubuntu1804"] test_createami.py::test_kernel4_build_image_run_cluster: dimensions: - regions: ["eu-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - schedulers: ["awsbatch", "slurm"] + schedulers: ["slurm"] oss: ["alinux2"] test_createami.py::test_build_image_custom_components: + # Test arn custom component with combination (eu-west-1, m6g.xlarge, alinux2) + # Test script custom component with combination (ap-southeast-2, c5.xlarge, ubuntu2004) dimensions: - regions: ["eu-north-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["rhel8"] + oss: ["ubuntu2004"] - regions: ["eu-west-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} oss: ["alinux2"] @@ -205,41 +181,32 @@ createami: - regions: ["ca-central-1"] instances: {{ 
common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] -dashboard_and_alarms: - test_dashboard_and_alarms.py::test_dashboard_and_alarms: +monitoring: + test_monitoring.py::test_monitoring: dimensions: - regions: ["ap-northeast-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["centos7"] schedulers: ["slurm"] + test_structured_log_events.py::test_custom_compute_action_failure: + dimensions: + - regions: ["af-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu2004"] + schedulers: ["slurm"] dcv: test_dcv.py::test_dcv_configuration: dimensions: # DCV on GPU enabled instance - regions: ["use1-az6"] # do not move, unless capacity reservation is moved as well instances: ["g4dn.2xlarge"] - oss: {{common.OSS_COMMERCIAL_X86_NO_RHEL8 }} - schedulers: ["slurm"] - # DCV on ARM - - regions: ["sa-east-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: ["alinux2", "ubuntu1804"] + oss: ["ubuntu2004"] schedulers: ["slurm"] # DCV on ARM + GPU - regions: ["use1-az6"] # do not move, unless capacity reservation is moved as well instances: ["g5g.2xlarge"] - oss: ["alinux2", "ubuntu1804"] + oss: ["centos7"] schedulers: ["slurm"] - # DCV on Batch - - regions: ["use1-az6"] # do not move, unless capacity reservation is moved as well - instances: ["g4dn.2xlarge"] - oss: ["alinux2"] - schedulers: ["awsbatch"] - # DCV on Batch + ARM - - regions: ["us-east-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: ["alinux2"] - schedulers: ["awsbatch"] # DCV in cn regions and non GPU enabled instance - regions: ["cn-northwest-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} @@ -259,17 +226,13 @@ dcv: disable_hyperthreading: test_disable_hyperthreading.py::test_hit_disable_hyperthreading: dimensions: - - regions: ["us-west-1"] - instances: ["m4.xlarge"] - oss: ["alinux2", "centos7"] - schedulers: ["slurm"] - regions: ["us-west-1"] instances: ["c5.xlarge"] - oss: ["ubuntu1804", "ubuntu2004"] + oss: ["ubuntu2004"] schedulers: ["slurm"] benchmarks: - mpi_variants: ["openmpi", "intelmpi"] - num_instances: [20] # Change the head node instance type if you'd test more than 30 instances + num_instances: [5] # Change the head node instance type if you'd test more than 30 instances slots_per_instance: 2 partition: "ht-disabled" osu_benchmarks: @@ -279,7 +242,7 @@ dns: dimensions: - regions: ["af-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86_NO_RHEL8 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} schedulers: ["slurm"] test_dns.py::test_existing_hosted_zone: dimensions: @@ -292,7 +255,7 @@ efa: dimensions: - regions: ["sa-east-1"] instances: ["c5n.9xlarge"] - oss: ["alinux2"] + oss: ["rhel8"] schedulers: ["slurm"] - regions: ["use1-az6"] # do not move, unless capacity reservation is moved as well instances: ["p4d.24xlarge"] @@ -300,11 +263,11 @@ efa: schedulers: ["slurm"] - regions: ["use1-az6"] # do not move, unless capacity reservation is moved as well instances: ["c6gn.16xlarge"] - oss: ["alinux2", "ubuntu1804", "ubuntu2004"] + oss: ["centos7"] schedulers: ["slurm"] - regions: ["use2-az2"] # do not move, unless instance type support is moved as well instances: [{{ common.instance("instance_type_1") }}] - oss: [ "alinux2", "centos7", "ubuntu2004" ] + oss: ["ubuntu2004"] schedulers: [ "slurm" ] iam: test_iam.py::test_iam_policies: @@ -328,8 +291,8 @@ iam: dimensions: - regions: ["eu-central-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["slurm", "awsbatch"] + oss: ["rhel8"] + schedulers: ["slurm"] 
test_iam.py::test_iam_resource_prefix: dimensions: - regions: [ "eu-north-1" ] @@ -354,8 +317,8 @@ networking: dimensions: - regions: ["me-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["slurm", "awsbatch"] + oss: ["rhel8"] + schedulers: ["slurm"] test_networking.py::test_public_network_topology: dimensions: - regions: ["af-south-1", "us-gov-east-1", "cn-northwest-1"] @@ -368,11 +331,11 @@ networking: # S3 bucket belonging to the same region and S3 VPC Endpoints only work within the region. - regions: ["us-east-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86_NO_RHEL8 }} + oss: ["rhel8"] schedulers: ["slurm"] - regions: ["us-east-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM_NO_RHEL8 }} + oss: ["centos7"] schedulers: ["slurm"] - regions: ["cn-north-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} @@ -387,27 +350,19 @@ networking: - regions: ["ap-northeast-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] - schedulers: ["slurm", "awsbatch"] + schedulers: ["slurm"] test_security_groups.py::test_additional_sg_and_ssh_from: dimensions: - regions: ["eu-north-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["centos7"] schedulers: ["slurm"] - - regions: ["eu-north-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["awsbatch"] test_security_groups.py::test_overwrite_sg: dimensions: - regions: ["eu-north-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["centos7"] schedulers: ["slurm"] - - regions: ["eu-north-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["awsbatch"] test_placement_group.py::test_placement_group: dimensions: - regions: ["eu-central-1"] @@ -423,22 +378,22 @@ scaling: dimensions: - regions: ["ap-east-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86_NO_RHEL8 }} + oss: ["rhel8"] schedulers: ["slurm"] - regions: ["eu-north-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM }} + oss: ["ubuntu2004"] schedulers: ["slurm"] test_mpi.py::test_mpi_ssh: dimensions: - regions: ["eu-north-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} + oss: ["centos7"] schedulers: ["slurm"] schedulers: test_awsbatch.py::test_awsbatch: dimensions: - - regions: ["eu-north-1", "us-gov-west-1", "cn-north-1"] + - regions: ["us-gov-west-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] schedulers: ["awsbatch"] @@ -448,7 +403,7 @@ schedulers: schedulers: ["awsbatch"] test_awsbatch.py::test_awsbatch_defaults: dimensions: - - regions: ["eu-north-1", "us-gov-west-1", "cn-north-1"] + - regions: ["cn-north-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] schedulers: ["awsbatch"] @@ -456,36 +411,20 @@ schedulers: dimensions: - regions: ["eu-central-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} + oss: ["ubuntu2004"] schedulers: ["slurm"] -{%- if SCHEDULER_PLUGIN_TESTS is not defined or SCHEDULER_PLUGIN_TESTS == true %} - - regions: ["eu-central-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_ONE_PER_DISTRO_NO_RHEL8 }} - schedulers: ["slurm_plugin"] -{%- endif %} test_slurm.py::test_slurm_pmix: # TODO: include in main test_slurm to reduce number of created clusters dimensions: - regions: ["ap-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86_NO_RHEL8 
}} + oss: ["centos7"] schedulers: ["slurm"] - regions: ["ap-northeast-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM_NO_RHEL8 }} + oss: ["rhel8"] schedulers: ["slurm"] test_slurm.py::test_slurm_scaling: dimensions: - - regions: ["us-west-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_ONE_PER_DISTRO_NO_RHEL8 }} - schedulers: ["slurm"] -{%- if SCHEDULER_PLUGIN_TESTS is not defined or SCHEDULER_PLUGIN_TESTS == true %} - - regions: ["me-south-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["slurm_plugin"] -{%- endif %} - regions: ["use2-az2"] # do not move, unless instance type support is moved as well instances: [{{ common.instance("instance_type_1") }}] oss: [ "alinux2" ] @@ -496,30 +435,17 @@ schedulers: instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] schedulers: ["slurm"] -{%- if SCHEDULER_PLUGIN_TESTS is not defined or SCHEDULER_PLUGIN_TESTS == true %} - - regions: ["ca-central-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["slurm_plugin"] -{%- endif %} test_slurm.py::test_slurm_protected_mode: dimensions: - regions: ["ca-central-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] + oss: ["rhel8"] schedulers: ["slurm"] -# Temporarily disable test_protected_mode for the Slurm scheduler plugin -#{%- if SCHEDULER_PLUGIN_TESTS is not defined or SCHEDULER_PLUGIN_TESTS == true %} -# - regions: ["ca-central-1"] -# instances: {{ common.INSTANCES_DEFAULT_X86 }} -# oss: ["alinux2"] -# schedulers: ["slurm_plugin"] -#{%- endif %} test_slurm.py::test_slurm_protected_mode_on_cluster_create: dimensions: - regions: ["ap-east-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] + oss: ["centos7"] schedulers: ["slurm"] test_slurm.py::test_fast_capacity_failover: dimensions: @@ -531,13 +457,13 @@ schedulers: dimensions: - regions: ["ap-east-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] + oss: ["ubuntu1804"] schedulers: ["slurm"] test_slurm.py::test_slurm_memory_based_scheduling: dimensions: - regions: ["ap-east-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] + oss: ["ubuntu2004"] schedulers: ["slurm"] test_slurm.py::test_scontrol_reboot: dimensions: @@ -565,15 +491,15 @@ schedulers: schedulers: ["slurm"] test_slurm_accounting.py::test_slurm_accounting: dimensions: - - regions: ["us-east-1", "ap-south-1"] + - regions: ["ap-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2", "ubuntu2004"] + oss: ["ubuntu2004"] schedulers: ["slurm"] test_slurm_accounting.py::test_slurm_accounting_disabled_to_enabled_update: dimensions: - regions: ["us-west-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["centos7", "ubuntu1804"] + oss: ["centos7"] schedulers: ["slurm"] test_slurm.py::test_slurm_reconfigure_race_condition: dimensions: @@ -594,7 +520,7 @@ storage: dimensions: - regions: ["eu-west-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["rhel8"] + oss: ["alinux2"] schedulers: ["slurm"] - regions: ["eu-north-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} @@ -606,17 +532,17 @@ storage: dimensions: - regions: ["eu-west-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86_NO_RHEL8 }} + oss: ["rhel8"] schedulers: ["slurm"] benchmarks: - - mpi_variants: ["openmpi", "intelmpi"] - num_instances: [20] # Change the head node instance type if you'd test more than 30 instances + - mpi_variants: ["openmpi"] + num_instances: [5] 
# Change the head node instance type if you'd test more than 30 instances slots_per_instance: 2 osu_benchmarks: - collective: ["osu_allreduce", "osu_alltoall"] + collective: ["osu_alltoall"] - regions: ["eu-west-2"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM_NO_RHEL8 }} + oss: ["ubuntu2004"] schedulers: ["slurm"] - regions: ["cn-north-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} @@ -630,7 +556,7 @@ storage: dimensions: - regions: ["eu-west-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86_NO_RHEL8 }} + oss: ["centos7"] schedulers: ["slurm"] test_fsx_lustre.py::test_fsx_lustre_configuration_options: dimensions: @@ -640,13 +566,9 @@ storage: schedulers: ["slurm"] test_fsx_lustre.py::test_fsx_lustre_backup: dimensions: - - regions: ["eu-south-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_ONE_PER_DISTRO_NO_RHEL8 }} - schedulers: ["slurm"] - regions: ["ap-southeast-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM_NO_RHEL8 }} + oss: ["ubuntu1804"] schedulers: ["slurm"] # EFS tests can be done in any region. test_efs.py::test_efs_compute_az: @@ -654,7 +576,7 @@ storage: - regions: ["ca-central-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] - schedulers: ["slurm", "awsbatch"] + schedulers: ["slurm"] test_efs.py::test_efs_same_az: dimensions: - regions: ["ca-central-1"] @@ -665,23 +587,23 @@ storage: # We should consider this when assigning dimensions to each test. test_efs.py::test_multiple_efs: dimensions: - - regions: ["ca-central-1", "cn-northwest-1"] + - regions: ["cn-northwest-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_BATCH }} + oss: ["alinux2"] schedulers: ["awsbatch"] - regions: [ "ca-central-1" ] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM_NO_RHEL8 }} + oss: ["ubuntu2004"] schedulers: [ "slurm" ] benchmarks: - - mpi_variants: ["openmpi", "intelmpi"] - num_instances: [20] # Change the head node instance type if you'd test more than 30 instances + - mpi_variants: ["intelmpi"] + num_instances: [5] # Change the head node instance type if you'd test more than 30 instances slots_per_instance: 2 osu_benchmarks: - collective: ["osu_allreduce", "osu_alltoall"] + collective: ["osu_alltoall"] - regions: ["us-gov-east-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_GOVCLOUD_X86_NO_RHEL8 }} + oss: ["rhel8"] schedulers: ["slurm"] test_raid.py::test_raid_fault_tolerance_mode: dimensions: @@ -693,39 +615,29 @@ storage: dimensions: - regions: ["ap-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86_NO_RHEL8 }} + oss: ["ubuntu2004", "centos7"] schedulers: ["slurm"] - regions: ["us-gov-east-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_GOVCLOUD_X86_NO_RHEL8 }} + oss: ["rhel8"] schedulers: ["slurm"] - - regions: ["ap-south-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_BATCH }} - schedulers: ["awsbatch"] test_ebs.py::test_ebs_multiple: dimensions: - - regions: ["us-east-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["awsbatch"] - regions: ["us-east-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["ubuntu1804"] schedulers: ["slurm"] test_ebs.py::test_ebs_single: dimensions: - {%- for region, oss in [("us-east-2", common.OSS_COMMERCIAL_X86_NO_RHEL8), ("cn-northwest-1", common.OSS_CHINA_X86_NO_RHEL8), ("us-gov-west-1", 
common.OSS_GOVCLOUD_X86_NO_RHEL8)] %} - - regions: ["{{ region }}"] + - regions: ["us-gov-east-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ oss }} + oss: ["alinux2"] schedulers: ["slurm"] - {%- endfor %} test_ebs.py::test_ebs_snapshot: dimensions: - regions: ["us-east-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] + oss: ["rhel8"] schedulers: ["slurm"] - regions: ["cn-northwest-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} @@ -735,7 +647,7 @@ storage: dimensions: - regions: ["us-east-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["centos7"] + oss: ["rhel8"] schedulers: ["slurm"] test_deletion_policy.py::test_retain_on_deletion: dimensions: @@ -746,7 +658,7 @@ storage: # Ephemeral test requires instance type with instance store test_ephemeral.py::test_head_node_stop: dimensions: - - regions: ["us-east-1"] + - regions: ["use1-az4"] instances: ["m5d.xlarge", "h1.2xlarge"] oss: ["alinux2"] schedulers: ["slurm"] @@ -767,47 +679,46 @@ update: dimensions: - regions: ["eu-central-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86_NO_RHEL8 }} + oss: ["ubuntu2004"] test_update.py::test_update_compute_ami: dimensions: - regions: ["eu-west-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] + oss: ["rhel8"] test_update.py::test_update_instance_list: dimensions: - regions: ["ap-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] + oss: ["rhel8"] schedulers: ["slurm"] test_update.py::test_queue_parameters_update: dimensions: - regions: ["ap-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] + oss: ["rhel8"] schedulers: ["slurm"] test_update.py::test_dynamic_file_systems_update: dimensions: - regions: ["eu-west-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86_NO_RHEL8 }} + oss: ["ubuntu2004"] schedulers: ["slurm"] - regions: ["ap-northeast-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM_NO_RHEL8 }} + oss: ["centos7"] schedulers: ["slurm"] test_update.py::test_multi_az_create_and_update: dimensions: - regions: [ "eu-west-2" ] schedulers: [ "slurm" ] oss: ["alinux2"] -# Temporarily disabling p4d tests -# multiple_nics: -# test_multiple_nics.py::test_multiple_nics: -# dimensions: -# - regions: ["use1-az6"] # do not move, unless capacity reservation is moved as well -# instances: ["p4d.24xlarge"] -# oss: {{ common.OSS_COMMERCIAL_X86 }} -# schedulers: ["slurm"] +multiple_nics: + test_multiple_nics.py::test_multiple_nics: + dimensions: + - regions: ["use1-az6"] + instances: ["c6in.32xlarge"] + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] resource_bucket: test_resource_bucket.py::test_resource_bucket: dimensions: @@ -820,9 +731,5 @@ log_rotation: dimensions: - regions: ["ap-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["slurm"] - - regions: ["ap-northeast-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: ["alinux2", "ubuntu1804"] + oss: ["ubuntu2004"] schedulers: ["slurm"] diff --git a/tests/integration-tests/configs/custom_resource.yaml b/tests/integration-tests/configs/custom_resource.yaml new file mode 100644 index 0000000000..dd51feb316 --- /dev/null +++ b/tests/integration-tests/configs/custom_resource.yaml @@ -0,0 +1,32 @@ +{%- import 'common.jinja2' as common -%} +--- +test-suites: + custom_resource: + test_cluster_custom_resource.py::test_cluster_create: + dimensions: + - oss: ["alinux2"] + 
regions: ["us-east-2"] + test_cluster_custom_resource.py::test_cluster_create_invalid: + dimensions: + - oss: ["alinux2"] + regions: ["us-east-2"] + test_cluster_custom_resource.py::test_cluster_update: + dimensions: + - oss: ["alinux2"] + regions: ["us-east-2"] + test_cluster_custom_resource.py::test_cluster_update_invalid: + dimensions: + - oss: ["alinux2"] + regions: ["us-east-2"] + test_cluster_custom_resource.py::test_cluster_delete_out_of_band: + dimensions: + - oss: ["alinux2"] + regions: ["us-east-2"] + test_cluster_custom_resource.py::test_cluster_delete_retain: + dimensions: + - oss: ["alinux2"] + regions: ["us-east-2"] + test_cluster_custom_resource.py::test_cluster_create_with_custom_policies: + dimensions: + - oss: ["alinux2"] + regions: ["us-east-2"] diff --git a/tests/integration-tests/configs/develop.yaml b/tests/integration-tests/configs/develop.yaml index 5233800bff..94af74adbd 100644 --- a/tests/integration-tests/configs/develop.yaml +++ b/tests/integration-tests/configs/develop.yaml @@ -23,15 +23,15 @@ test-suites: schedulers: ["slurm"] - regions: ["use1-az6"] # do not move, unless capacity reservation is moved as well instances: ["c6gn.16xlarge"] - oss: ["alinux2", "ubuntu1804", "ubuntu2004"] + oss: ["rhel8"] schedulers: ["slurm"] - regions: ["use2-az2"] # do not move, unless instance type support is moved as well instances: ["hpc6id.32xlarge"] - oss: ["alinux2"] + oss: ["centos7"] schedulers: [ "slurm" ] - regions: ["use2-az2"] # do not move, unless instance type support is moved as well instances: [{{ common.instance("instance_type_1") }}] - oss: [ "alinux2", "centos7", "ubuntu2004" ] + oss: ["ubuntu2004"] schedulers: [ "slurm" ] test_fabric.py::test_fabric: dimensions: @@ -64,51 +64,39 @@ test-suites: scaling: # FixMe: MPI tests configs are duplications of the test in common.yaml. # The duplications are necessary because the scaling section here overwrites the scaling section in common.yaml. 
- test_mpi.py::test_mpi: + test_mpi.py::test_mpi: # TODO: move outside of the scaling dir dimensions: - regions: ["ap-east-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} + oss: ["rhel8"] schedulers: ["slurm"] - regions: ["ca-central-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM }} + oss: ["ubuntu2004"] schedulers: ["slurm"] test_mpi.py::test_mpi_ssh: dimensions: - regions: ["eu-north-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} + oss: ["centos7"] schedulers: ["slurm"] test_scaling.py::test_multiple_jobs_submission: dimensions: - - regions: {{ common.REGIONS_COMMERCIAL }} - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86_NO_RHEL8 }} - schedulers: ["slurm"] - - regions: {{ common.REGIONS_CHINA }} - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_CHINA_X86_NO_RHEL8 }} - schedulers: ["slurm"] - - regions: {{ common.REGIONS_GOVCLOUD }} - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_GOVCLOUD_X86_NO_RHEL8 }} - schedulers: ["slurm"] - - regions: ["us-west-2"] + - regions: ["af-south-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM_NO_RHEL8 }} - schedulers: {{ common.SCHEDULERS_TRAD }} + oss: ["ubuntu2004"] + schedulers: ["slurm"] - regions: ["cn-north-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_CHINA_ARM_NO_RHEL8 }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] schedulers: ["slurm"] - regions: ["us-gov-east-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_GOVCLOUD_ARM_NO_RHEL8 }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["rhel8"] schedulers: ["slurm"] trainium: test_trainium.py::test_trainium: dimensions: - regions: ["usw2-az4"] # do not move, unless instance type support is moved as well schedulers: ["slurm"] - oss: ["alinux2", "ubuntu1804", "ubuntu2004"] + oss: ["ubuntu2004"] diff --git a/tests/integration-tests/configs/installer.yaml b/tests/integration-tests/configs/installer.yaml new file mode 100644 index 0000000000..c15be760aa --- /dev/null +++ b/tests/integration-tests/configs/installer.yaml @@ -0,0 +1,10 @@ +{%- import 'common.jinja2' as common with context -%} +--- +test-suites: + cli_commands: + test_cli_commands.py::test_slurm_cli_commands: + dimensions: + - regions: [ "ap-northeast-2" ] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: [ "ubuntu1804" ] + schedulers: [ "slurm" ] diff --git a/tests/integration-tests/configs/isolated_regions.yaml b/tests/integration-tests/configs/isolated_regions.yaml new file mode 100644 index 0000000000..c351ef2a6d --- /dev/null +++ b/tests/integration-tests/configs/isolated_regions.yaml @@ -0,0 +1,547 @@ +{%- import 'common.jinja2' as common with context -%} +{%- set REGIONS = ["us-isob-east-1"] -%} +{%- set INSTANCES = ["c5.xlarge"] -%} +{%- set OSS = ["alinux2"] -%} +{%- set SCHEDULERS = ["slurm"] -%} +--- +test-suites: +# We must fix the integ test logic to make this integration test to work in us-isob-east-1 +# ad_integration: +# test_ad_integration.py::test_ad_integration: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} + cfn-init: + test_cfn_init.py::test_replace_compute_on_failure: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_cfn_init.py::test_install_args_quotes: + 
dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + cli_commands: + test_cli_commands.py::test_slurm_cli_commands: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + cloudwatch_logging: + test_cloudwatch_logging.py::test_cloudwatch_logging: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_compute_console_output_logging.py::test_console_output_with_monitoring_disabled: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_compute_console_output_logging.py::test_custom_action_error: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + configure: + test_pcluster_configure.py::test_pcluster_configure: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} +# This test cannot be executed in US isolated regions +# because its logic is us-east-1 specific. +# test_pcluster_configure.py::test_pcluster_configure_avoid_bad_subnets: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} + test_pcluster_configure.py::test_region_without_t2micro: + dimensions: + - regions: {{ REGIONS }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_pcluster_configure.py::test_efa_and_placement_group: + dimensions: + - regions: {{ REGIONS }} + instances: ["c5n.9xlarge"] + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_pcluster_configure.py::test_efa_unsupported: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + create: +# This test cannot be executed in US isolated regions +# because it relies on having AMIs for different OS in the region +# and this is not the case in these regions. +# test_create.py::test_create_wrong_os: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} +# This test cannot be executed in US isolated regions +# because it relies on having AMIs for different pcluster versions in the region +# and this is not the case in these regions. 
+# test_create.py::test_create_wrong_pcluster_version: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} + test_create.py::test_create_imds_secured: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_create.py::test_cluster_creation_with_problematic_preinstall_script: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_create.py::test_cluster_creation_timeout: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_create.py::test_cluster_creation_with_invalid_ebs: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + monitoring: + test_monitoring.py::test_monitoring: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + disable_hyperthreading: + test_disable_hyperthreading.py::test_hit_disable_hyperthreading: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + benchmarks: + - mpi_variants: ["openmpi", "intelmpi"] + num_instances: [20] # Change the head node instance type if you'd test more than 30 instances + slots_per_instance: 2 + partition: "ht-disabled" + osu_benchmarks: + collective: ["osu_allreduce", "osu_alltoall"] + dns: + test_dns.py::test_hit_no_cluster_dns_mpi: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_dns.py::test_existing_hosted_zone: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + efa: + test_efa.py::test_efa: + dimensions: + - regions: {{ REGIONS }} + instances: ["c5n.18xlarge"] + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} +# This test cannot be executed in US isolated regions +# because its logic relies on downloading an external +# dependency from GitHub, which is not possible +# from within an air-gaped region. +# test_fabric.py::test_fabric: +# dimensions: +# - regions: {{ REGIONS }} +# instances: [ "p4d.24xlarge" ] +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} + iam: + test_iam.py::test_iam_policies: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_iam.py::test_iam_roles: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} +# This test cannot be executed in US isolated regions +# because it makes use of build-image, which is not supported in these regions. +# test_iam_image.py::test_iam_roles: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} + test_iam.py::test_s3_read_write_resource: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_iam.py::test_iam_resource_prefix: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} +# This test cannot be executed in US isolated regions +# because it requires Centos7 clusters, but we currently support +# only Amazon Linux 2 in US isolated regions. 
+# intel_hpc: +# test_intel_hpc.py::test_intel_hpc: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} + networking: +# This test is considered out of scope for isolated regions +# We will evaluate to re-include it after release 3.5.1 +# test_cluster_networking.py::test_cluster_in_private_subnet: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} + test_cluster_networking.py::test_existing_eip: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_networking.py::test_public_network_topology: + dimensions: + - regions: {{ REGIONS }} + test_networking.py::test_public_private_network_topology: + dimensions: + - regions: {{ REGIONS }} + test_cluster_networking.py::test_cluster_in_no_internet_subnet: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_multi_cidr.py::test_multi_cidr: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_security_groups.py::test_additional_sg_and_ssh_from: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_security_groups.py::test_overwrite_sg: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_placement_group.py::test_placement_group: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} +# This test cannot be executed in US isolated regions +# because it relies on a CloudFormation stack using resources +# that are not supported by CloudFormation in ADC, +# i.e. CapacityReservation and ResourceGroup. +# test_on_demand_capacity_reservation.py::test_on_demand_capacity_reservation: +# dimensions: +# - regions: {{ REGIONS }} +# oss: {{ OSS }} +# This test cannot be executed in US isolated regions +# because ParallelCluster API is not supported in these regions. 
+# pcluster_api: +# test_api.py::test_cluster_slurm: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} + resource_bucket: + test_resource_bucket.py::test_resource_bucket: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + scaling: + test_mpi.py::test_mpi: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_scaling.py::test_multiple_jobs_submission: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_mpi.py::test_mpi_ssh: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + schedulers: + test_slurm.py::test_slurm: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_slurm.py::test_slurm_pmix: # TODO: include in main test_slurm to reduce number of created clusters + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_slurm.py::test_slurm_scaling: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_slurm.py::test_error_handling: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_slurm.py::test_slurm_protected_mode: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_slurm.py::test_slurm_protected_mode_on_cluster_create: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_slurm.py::test_fast_capacity_failover: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_slurm.py::test_slurm_config_update: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_slurm.py::test_slurm_memory_based_scheduling: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_slurm.py::test_scontrol_reboot: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_slurm.py::test_scontrol_reboot_ec2_health_checks: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_slurm.py::test_slurm_overrides: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_slurm.py::test_scontrol_update_nodelist_sorting: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} +# These tests cannot be executed in US isolated regions +# because Slurm Accounting requires Secrets Manager, +# which is not supported in these regions. 
+# test_slurm_accounting.py::test_slurm_accounting: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} +# test_slurm_accounting.py::test_slurm_accounting_disabled_to_enabled_update: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} + test_slurm.py::test_slurm_reconfigure_race_condition: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} +# This test cannot be executed in US isolated regions +# because Spot instances are not supported in these regions. +# spot: +# test_spot.py::test_spot_default: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} + storage: + test_efs.py::test_efs_compute_az: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_efs.py::test_efs_same_az: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_efs.py::test_multiple_efs: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + benchmarks: + - mpi_variants: ["openmpi", "intelmpi"] + num_instances: [20] # Change the head node instance type if you'd test more than 30 instances + slots_per_instance: 2 + osu_benchmarks: + collective: ["osu_allreduce", "osu_alltoall"] + test_raid.py::test_raid_fault_tolerance_mode: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_raid.py::test_raid_performance_mode: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_ebs.py::test_ebs_multiple: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_ebs.py::test_ebs_single: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_ebs.py::test_ebs_snapshot: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_ebs.py::test_ebs_existing: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_deletion_policy.py::test_retain_on_deletion: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_ephemeral.py::test_head_node_stop: + dimensions: + - regions: {{ REGIONS }} + instances: ["m5d.xlarge"] + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + tags: + test_tag_propagation.py::test_tag_propagation: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + update: + test_update.py::test_update_slurm: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} +# This test cannot be executed in US isolated regions +# because it relies on having different ParallelCluster AMIs in the region +# and this is not the case in these regions. 
+# test_update.py::test_update_compute_ami: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} + test_update.py::test_update_instance_list: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} + test_update.py::test_queue_parameters_update: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} +# We must fix the integ test logic to make this integration test to work in us-isob-east-1 +# test_update.py::test_dynamic_file_systems_update: +# dimensions: +# - regions: {{ REGIONS }} +# instances: {{ INSTANCES }} +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} +# This test cannot be executed in US isolated regions +# because it relies on a CloudFormation stack using resources +# that are not supported by CloudFormation in ADC, +# i.e. CapacityReservation and ResourceGroup. +# test_update.py::test_multi_az_create_and_update: +# dimensions: +# - regions: {{ REGIONS }} +# oss: {{ OSS }} +# schedulers: {{ SCHEDULERS }} diff --git a/tests/integration-tests/configs/new_os.yaml b/tests/integration-tests/configs/new_os.yaml index d9dc8180b0..412be6556a 100644 --- a/tests/integration-tests/configs/new_os.yaml +++ b/tests/integration-tests/configs/new_os.yaml @@ -1,131 +1,344 @@ -{%- import 'common.jinja2' as common -%} -{%- set REGION = ["##PLACEHOLDER##"] -%} +{%- import 'common.jinja2' as common with context -%} {%- set NEW_OS = ["##PLACEHOLDER##"] -%} --- test-suites: - scaling: - test_mpi.py::test_mpi: + pcluster_api: + test_api.py::test_cluster_slurm: dimensions: - - regions: {{ REGION }} + - regions: [ "sa-east-1" ] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: {{ NEW_OS }} schedulers: [ "slurm" ] - - regions: {{ REGION }} - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ NEW_OS }} - schedulers: [ "slurm" ] - schedulers: - test_slurm.py::test_slurm: + ad_integration: + test_ad_integration.py::test_ad_integration: dimensions: - - regions: {{ REGION }} + - regions: ["ap-southeast-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: {{ NEW_OS }} - schedulers: [ "slurm" ] - - regions: {{ REGION }} + schedulers: ["slurm"] + benchmarks: + - mpi_variants: [ "openmpi", "intelmpi" ] + num_instances: [ 4 ] + slots_per_instance: 2 + osu_benchmarks: + collective: [ "osu_alltoall" ] + arm_pl: + test_arm_pl.py::test_arm_pl: + dimensions: + - regions: ["ap-southeast-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ NEW_OS }} - schedulers: [ "slurm" ] - test_awsbatch.py::test_awsbatch: + oss: ["ubuntu2004"] + schedulers: ["slurm"] + cfn-init: + test_cfn_init.py::test_replace_compute_on_failure: dimensions: - - regions: {{ REGION }} + - regions: ["af-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ NEW_OS }} - schedulers: [ "slurm" ] - - regions: {{ REGION }} - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ NEW_OS }} + oss: ["rhel8"] schedulers: [ "slurm" ] - storage: - test_fsx_lustre.py::test_fsx_lustre: + cli_commands: + test_cli_commands.py::test_slurm_cli_commands: dimensions: - - regions: {{ REGION }} + - regions: ["ap-northeast-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: {{ NEW_OS }} - schedulers: [ "slurm" ] - - regions: {{ REGION }} - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ NEW_OS }} - schedulers: [ "slurm" ] - test_efs.py::test_efs_compute_az: + schedulers: ["slurm"] + cloudwatch_logging: + test_cloudwatch_logging.py::test_cloudwatch_logging: dimensions: - - 
regions: {{ REGION }} + # 2) run the test for all x86 OSes with slurm + - regions: ["ap-east-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ NEW_OS }} - schedulers: [ "slurm" ] - - regions: {{ REGION }} + oss: ["ubuntu2004"] + schedulers: ["slurm"] + # 3) run the test for all ARM OSes on an ARM instance + - regions: ["ap-east-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ NEW_OS }} - schedulers: [ "slurm" ] - test_ebs.py::test_ebs_single: + oss: ["alinux2"] + schedulers: ["slurm"] + test_compute_console_output_logging.py::test_custom_action_error: dimensions: - - regions: {{ REGION }} + - regions: [ "ap-east-1" ] + oss: ["rhel8"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ NEW_OS }} - schedulers: [ "slurm" ] - - regions: {{ REGION }} - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ NEW_OS }} schedulers: [ "slurm" ] - test_ephemeral.py::test_head_node_stop: + configure: + test_pcluster_configure.py::test_pcluster_configure: dimensions: - - regions: {{ REGION }} + - regions: ["af-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ NEW_OS }} - schedulers: [ "slurm" ] - - regions: {{ REGION }} - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ NEW_OS }} - schedulers: [ "slurm" ] + oss: {{ OSS_ONE_PER_DISTRO }} + schedulers: ["slurm"] + createami: + test_createami.py::test_build_image: + dimensions: + - regions: ["eu-west-3"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2", "ubuntu2004", "centos7", "rhel8"] + test_createami.py::test_kernel4_build_image_run_cluster: + dimensions: + - regions: ["eu-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + schedulers: ["awsbatch", "slurm"] + oss: ["alinux2"] dcv: test_dcv.py::test_dcv_configuration: dimensions: # DCV on GPU enabled instance - - regions: {{ REGION }} - instances: ["g3.8xlarge"] - oss: {{ NEW_OS }} - schedulers: [ "slurm" ] - # DCV on non GPU enabled instance - - regions: {{ REGION }} + - regions: ["us-east-1"] + instances: ["g4dn.2xlarge"] + oss: ["ubuntu1804"] + schedulers: ["slurm"] + # DCV on ARM + GPU + - regions: ["us-east-1"] + instances: ["g5g.2xlarge"] + oss: ["alinux2"] + schedulers: ["slurm"] + disable_hyperthreading: + test_disable_hyperthreading.py::test_hit_disable_hyperthreading: + dimensions: + - regions: ["us-west-1"] + instances: ["m4.xlarge"] + oss: ["alinux2", "centos7", "rhel8"] + schedulers: ["slurm"] + benchmarks: + - mpi_variants: [ "openmpi", "intelmpi" ] + num_instances: [ 4 ] + slots_per_instance: 2 + osu_benchmarks: + collective: [ "osu_alltoall" ] + dns: + test_dns.py::test_hit_no_cluster_dns_mpi: + dimensions: + - regions: ["af-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ NEW_OS }} - schedulers: [ "slurm" ] - - regions: {{ REGION }} - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ NEW_OS }} - schedulers: [ "slurm" ] + oss: {{ OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] efa: test_efa.py::test_efa: dimensions: - - regions: {{ REGION }} + - regions: ["sa-east-1"] + instances: ["c5n.9xlarge"] + oss: ["alinux2", "rhel8"] + schedulers: ["slurm"] + - regions: ["use1-az6"] # do not move, unless capacity reservation is moved as well + instances: ["p4d.24xlarge"] + oss: ["rhel8"] + schedulers: ["slurm"] + - regions: ["us-east-1"] + instances: ["c6gn.16xlarge"] + oss: ["ubuntu2004", "rhel8"] + schedulers: ["slurm"] + intel_hpc: + test_intel_hpc.py::test_intel_hpc: + dimensions: + - regions: ["us-east-2"] + instances: ["c5.18xlarge"] + oss: ["centos7"] + schedulers: ["slurm"] + networking: + 
test_cluster_networking.py::test_cluster_in_private_subnet: + dimensions: + - regions: ["me-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ NEW_OS }} + oss: ["centos7", "rhel8"] + schedulers: ["slurm"] + test_cluster_networking.py::test_cluster_in_no_internet_subnet: + dimensions: + # The region needs to be the same of the Jenkins server since default pre/post install scripts are hosted in an + # S3 bucket belonging to the same region and S3 VPC Endpoints only work within the region. + - regions: ["us-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + test_multi_cidr.py::test_multi_cidr: + dimensions: + - regions: ["ap-northeast-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["rhel8"] + schedulers: ["slurm"] + test_on_demand_capacity_reservation.py::test_on_demand_capacity_reservation: + dimensions: + - regions: [ "us-west-2" ] + oss: [ "rhel8" ] + scaling: + test_scaling.py::test_multiple_jobs_submission: + dimensions: + - regions: {{ common.REGIONS_COMMERCIAL }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["rhel8"] + schedulers: [ "slurm" ] + - regions: {{ common.REGIONS_CHINA }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["rhel8"] + schedulers: [ "slurm" ] + - regions: {{ common.REGIONS_GOVCLOUD }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["rhel8"] schedulers: [ "slurm" ] - - regions: {{ REGION }} + - regions: [ "us-west-2" ] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ NEW_OS }} + oss: ["rhel8"] + schedulers: {{ common.SCHEDULERS_TRAD }} + - regions: [ "cn-north-1" ] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: ["rhel8"] schedulers: [ "slurm" ] - configure: - test_pcluster_configure.py::test_pcluster_configure: + - regions: [ "us-gov-east-1" ] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: ["rhel8"] + schedulers: [ "slurm" ] + test_mpi.py::test_mpi: dimensions: - - regions: {{ REGION }} + - regions: ["eu-north-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: {{ OSS_COMMERCIAL_ARM }} + schedulers: ["slurm"] + test_mpi.py::test_mpi_ssh: + dimensions: + - regions: ["eu-north-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ NEW_OS }} + oss: {{ OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + schedulers: + test_awsbatch.py::test_awsbatch: + dimensions: + - regions: ["eu-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["awsbatch"] + test_slurm.py::test_slurm: + dimensions: + - regions: ["eu-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + test_slurm.py::test_slurm_pmix: + dimensions: + - regions: ["ap-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + - regions: ["ap-southeast-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: {{ OSS_COMMERCIAL_ARM }} + schedulers: ["slurm"] + test_slurm.py::test_slurm_scaling: + dimensions: + - regions: ["us-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ OSS_ONE_PER_DISTRO }} + schedulers: ["slurm"] + storage: + # Commercial regions that can't test FSx: ap-northeast-1, ap-southeast-1, ap-southeast-2, eu-central-1, eu-north-1, eu-west-1, eu-west-2, us-east-1, us-east-2, us-west-1, us-west-2 + test_fsx_lustre.py::test_fsx_lustre: + dimensions: + - regions: ["eu-west-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7", "rhel8"] + schedulers: ["slurm"] + - regions: 
["eu-north-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: ["ubuntu2004", "rhel8"] + schedulers: ["slurm"] + # The checks performed in test_multiple_fsx is the same as test_fsx_lustre. + # We should consider this when assigning dimensions to each test. + test_fsx_lustre.py::test_multiple_fsx: + dimensions: + - regions: ["eu-west-2"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: {{ OSS_COMMERCIAL_ARM }} + schedulers: ["slurm"] + benchmarks: + - mpi_variants: [ "openmpi", "intelmpi" ] + num_instances: [ 4 ] + slots_per_instance: 2 + osu_benchmarks: + collective: [ "osu_alltoall" ] + test_fsx_lustre.py::test_fsx_lustre_configuration_options: + dimensions: + - regions: ["us-east-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2", "rhel8"] + schedulers: ["slurm"] + test_fsx_lustre.py::test_fsx_lustre_backup: + dimensions: + - regions: [ "eu-south-1" ] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["rhel8"] schedulers: [ "slurm" ] - - regions: {{ REGION }} + test_efs.py::test_multiple_efs: + dimensions: + - regions: [ "ca-central-1" ] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ NEW_OS }} + oss: {{ OSS_COMMERCIAL_ARM }} schedulers: [ "slurm" ] - networking: - test_cluster_networking.py::test_cluster_in_private_subnet: + benchmarks: + - mpi_variants: [ "openmpi", "intelmpi" ] + num_instances: [ 4 ] + slots_per_instance: 2 + osu_benchmarks: + collective: [ "osu_alltoall" ] + test_raid.py::test_raid_performance_mode: dimensions: - - regions: {{ REGION }} + - regions: ["ap-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ NEW_OS }} - schedulers: [ "slurm" ] - # Useful for instances with multiple network interfaces - test_multi_cidr.py::test_multi_cidr: + oss: {{ OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + test_ebs.py::test_ebs_multiple: dimensions: - - regions: {{ REGION }} - instances: [ "p4d.24xlarge" ] - oss: {{ NEW_OS }} + - regions: ["me-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu1804", "rhel8"] + schedulers: ["slurm"] + test_ebs.py::test_ebs_existing: + dimensions: + - regions: ["eu-west-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7", "rhel8"] + schedulers: ["slurm"] + # Ephemeral test requires instance type with instance store + test_ephemeral.py::test_head_node_stop: + dimensions: + - regions: ["use1-az4"] + instances: ["m5d.xlarge", "h1.2xlarge"] + oss: ["alinux2", "rhel8"] + schedulers: ["slurm"] + update: + test_update.py::test_update_slurm: + dimensions: + - regions: ["eu-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu2004", "rhel8"] + test_update.py::test_update_compute_ami: + dimensions: + - regions: ["eu-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7", "rhel8"] + test_update.py::test_update_instance_list: + dimensions: + - regions: ["ap-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2", "rhel8"] + schedulers: ["slurm"] + test_update.py::test_queue_parameters_update: + dimensions: + - regions: ["ap-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2", "rhel8"] + schedulers: ["slurm"] + test_update.py::test_dynamic_file_systems_update: + dimensions: + - regions: ["eu-west-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu1804", "rhel8"] + schedulers: ["slurm"] + - regions: ["ap-northeast-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: ["ubuntu1804", "rhel8"] + schedulers: ["slurm"] + 
test_update.py::test_multi_az_create_and_update: + dimensions: + - regions: [ "eu-west-2" ] schedulers: [ "slurm" ] + oss: ["alinux2", "rhel8"] diff --git a/tests/integration-tests/configs/new_region.yaml b/tests/integration-tests/configs/new_region.yaml index 70200c12ac..87148df0de 100644 --- a/tests/integration-tests/configs/new_region.yaml +++ b/tests/integration-tests/configs/new_region.yaml @@ -62,8 +62,8 @@ test-suites: instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] schedulers: ["slurm"] - dashboard_and_alarms: - test_dashboard_and_alarms.py::test_dashboard_and_alarms: + monitoring: + test_monitoring.py::test_monitoring: dimensions: - regions: {{ NEW_REGION }} instances: {{ common.INSTANCES_DEFAULT_X86 }} diff --git a/tests/integration-tests/configs/p4d-efa.yaml b/tests/integration-tests/configs/p4d-efa.yaml deleted file mode 100644 index 3fc18b7384..0000000000 --- a/tests/integration-tests/configs/p4d-efa.yaml +++ /dev/null @@ -1,13 +0,0 @@ -{%- import 'common.jinja2' as common -%} -{%- set regions = ["use1-az6"] -%} # do not move, unless capacity reservation is moved as well -{%- set instances = ["p4d.24xlarge"] -%} - ---- -test-suites: - efa: - test_efa.py::test_efa: - dimensions: - - regions: {{ regions }} - instances: {{ instances }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["slurm"] diff --git a/tests/integration-tests/configs/p4d-multinics.yaml b/tests/integration-tests/configs/p4d-multinics.yaml deleted file mode 100644 index 549530a31a..0000000000 --- a/tests/integration-tests/configs/p4d-multinics.yaml +++ /dev/null @@ -1,13 +0,0 @@ -{%- import 'common.jinja2' as common -%} -{%- set regions = ["use1-az6"] -%} # do not move, unless capacity reservation is moved as well -{%- set instances = ["p4d.24xlarge"] -%} - ---- -test-suites: - multiple_nics: - test_multiple_nics.py::test_multiple_nics: - dimensions: - - regions: {{ regions }} - instances: {{ instances }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["slurm"] diff --git a/tests/integration-tests/configs/redhat8.yaml b/tests/integration-tests/configs/redhat8.yaml index ae6ae94aa6..3867e55f09 100644 --- a/tests/integration-tests/configs/redhat8.yaml +++ b/tests/integration-tests/configs/redhat8.yaml @@ -1,7 +1,4 @@ {%- import 'common.jinja2' as common with context -%} -{%- set OSS_COMMERCIAL_X86_RH8 = ["alinux2", "centos7", "ubuntu1804", "ubuntu2004", "rhel8"] -%} -{%- set OSS_COMMERCIAL_ARM_RH8 = ["alinux2", "ubuntu1804", "ubuntu2004", "rhel8"] -%} -{%- set OSS_ONE_PER_DISTRO_RH8 = ["centos7", "alinux2", "ubuntu1804", "rhel8"] -%} --- test-suites: @@ -10,14 +7,14 @@ test-suites: dimensions: - regions: [ "sa-east-1" ] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: [ "rhel8" ] + oss: {{ NEW_OS }} schedulers: [ "slurm" ] ad_integration: test_ad_integration.py::test_ad_integration: dimensions: - regions: ["ap-southeast-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["ubuntu1804"] + oss: {{ OSS_COMMERCIAL_X86 }} schedulers: ["slurm"] benchmarks: - mpi_variants: [ "openmpi", "intelmpi" ] @@ -30,21 +27,21 @@ test-suites: dimensions: - regions: ["ap-southeast-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM }} + oss: {{ NEW_OS }} schedulers: ["slurm"] cfn-init: test_cfn_init.py::test_replace_compute_on_failure: dimensions: - regions: ["af-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["rhel8"] + oss: {{ NEW_OS }} schedulers: [ "slurm" ] cli_commands: test_cli_commands.py::test_slurm_cli_commands: dimensions: - regions: 
["ap-northeast-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["ubuntu1804", "rhel8"] + oss: {{ NEW_OS }} schedulers: ["slurm"] cloudwatch_logging: test_cloudwatch_logging.py::test_cloudwatch_logging: @@ -52,17 +49,17 @@ test-suites: # 2) run the test for all x86 OSes with slurm - regions: ["ap-east-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} + oss: ["ubuntu2004"] schedulers: ["slurm"] # 3) run the test for all ARM OSes on an ARM instance - regions: ["ap-east-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM }} + oss: ["alinux2"] schedulers: ["slurm"] test_compute_console_output_logging.py::test_custom_action_error: dimensions: - regions: [ "ap-east-1" ] - oss: ["rhel8"] + oss: {{ NEW_OS }} instances: {{ common.INSTANCES_DEFAULT_X86 }} schedulers: [ "slurm" ] configure: @@ -70,39 +67,39 @@ test-suites: dimensions: - regions: ["af-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ OSS_ONE_PER_DISTRO_RH8 }} + oss: {{ OSS_ONE_PER_DISTRO }} schedulers: ["slurm"] createami: test_createami.py::test_build_image: dimensions: - regions: ["eu-west-3"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2", "ubuntu2004", "centos7", "rhel8"] + oss: {{ NEW_OS }} test_createami.py::test_kernel4_build_image_run_cluster: dimensions: - regions: ["eu-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} schedulers: ["awsbatch", "slurm"] - oss: ["alinux2"] + oss: {{ NEW_OS }} dcv: test_dcv.py::test_dcv_configuration: dimensions: # DCV on GPU enabled instance - regions: ["us-east-1"] instances: ["g4dn.2xlarge"] - oss: {{common.OSS_COMMERCIAL_X86}} + oss: {{ NEW_OS }} schedulers: ["slurm"] # DCV on ARM + GPU - regions: ["us-east-1"] instances: ["g5g.2xlarge"] - oss: ["alinux2", "ubuntu1804"] + oss: {{ NEW_OS }} schedulers: ["slurm"] disable_hyperthreading: test_disable_hyperthreading.py::test_hit_disable_hyperthreading: dimensions: - regions: ["us-west-1"] instances: ["m4.xlarge"] - oss: ["alinux2", "centos7", "rhel8"] + oss: {{ NEW_OS }} schedulers: ["slurm"] benchmarks: - mpi_variants: [ "openmpi", "intelmpi" ] @@ -115,36 +112,36 @@ test-suites: dimensions: - regions: ["af-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ OSS_COMMERCIAL_X86_RH8 }} + oss: {{ NEW_OS }} schedulers: ["slurm"] efa: test_efa.py::test_efa: dimensions: - regions: ["sa-east-1"] instances: ["c5n.9xlarge"] - oss: ["alinux2", "rhel8"] + oss: {{ NEW_OS }} schedulers: ["slurm"] - regions: ["use1-az6"] # do not move, unless capacity reservation is moved as well instances: ["p4d.24xlarge"] - oss: ["alinux2", "rhel8"] + oss: {{ NEW_OS }} schedulers: ["slurm"] - regions: ["us-east-1"] instances: ["c6gn.16xlarge"] - oss: ["ubuntu2004", "rhel8"] + oss: {{ NEW_OS }} schedulers: ["slurm"] intel_hpc: test_intel_hpc.py::test_intel_hpc: dimensions: - regions: ["us-east-2"] instances: ["c5.18xlarge"] - oss: ["centos7"] + oss: {{ NEW_OS }} schedulers: ["slurm"] networking: test_cluster_networking.py::test_cluster_in_private_subnet: dimensions: - regions: ["me-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["centos7", "rhel8"] + oss: {{ NEW_OS }} schedulers: ["slurm"] test_cluster_networking.py::test_cluster_in_no_internet_subnet: dimensions: @@ -152,56 +149,56 @@ test-suites: # S3 bucket belonging to the same region and S3 VPC Endpoints only work within the region. 
- regions: ["us-east-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ OSS_COMMERCIAL_X86_RH8 }} + oss: {{ NEW_OS }} schedulers: ["slurm"] test_multi_cidr.py::test_multi_cidr: dimensions: - regions: ["ap-northeast-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["rhel8"] + oss: {{ NEW_OS }} schedulers: ["slurm"] test_on_demand_capacity_reservation.py::test_on_demand_capacity_reservation: dimensions: - regions: [ "us-west-2" ] - oss: [ "alinux2", "rhel8"] + oss: {{ NEW_OS }} scaling: test_scaling.py::test_multiple_jobs_submission: dimensions: - regions: {{ common.REGIONS_COMMERCIAL }} instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["rhel8"] + oss: {{ NEW_OS }} schedulers: [ "slurm" ] - regions: {{ common.REGIONS_CHINA }} instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["rhel8"] + oss: {{ NEW_OS }} schedulers: [ "slurm" ] - regions: {{ common.REGIONS_GOVCLOUD }} instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["rhel8"] + oss: {{ NEW_OS }} schedulers: [ "slurm" ] - regions: [ "us-west-2" ] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: ["rhel8"] + oss: {{ NEW_OS }} schedulers: {{ common.SCHEDULERS_TRAD }} - regions: [ "cn-north-1" ] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: ["rhel8"] + oss: {{ NEW_OS }} schedulers: [ "slurm" ] - regions: [ "us-gov-east-1" ] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: ["rhel8"] + oss: {{ NEW_OS }} schedulers: [ "slurm" ] test_mpi.py::test_mpi: dimensions: - regions: ["eu-north-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ OSS_COMMERCIAL_ARM_RH8 }} + oss: {{ NEW_OS }} schedulers: ["slurm"] test_mpi.py::test_mpi_ssh: dimensions: - regions: ["eu-north-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ OSS_COMMERCIAL_X86_RH8 }} + oss: {{ OSS_COMMERCIAL_X86 }} schedulers: ["slurm"] schedulers: test_awsbatch.py::test_awsbatch: @@ -214,23 +211,23 @@ test-suites: dimensions: - regions: ["eu-central-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ OSS_COMMERCIAL_X86_RH8 }} + oss: {{ OSS_COMMERCIAL_X86 }} schedulers: ["slurm"] test_slurm.py::test_slurm_pmix: dimensions: - regions: ["ap-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ OSS_COMMERCIAL_X86_RH8 }} + oss: {{ OSS_COMMERCIAL_X86 }} schedulers: ["slurm"] - regions: ["ap-southeast-1"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ OSS_COMMERCIAL_ARM_RH8 }} + oss: {{ OSS_COMMERCIAL_ARM }} schedulers: ["slurm"] test_slurm.py::test_slurm_scaling: dimensions: - regions: ["us-west-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ OSS_ONE_PER_DISTRO_RH8 }} + oss: {{ OSS_ONE_PER_DISTRO }} schedulers: ["slurm"] storage: # Commercial regions that can't test FSx: ap-northeast-1, ap-southeast-1, ap-southeast-2, eu-central-1, eu-north-1, eu-west-1, eu-west-2, us-east-1, us-east-2, us-west-1, us-west-2 @@ -250,7 +247,7 @@ test-suites: dimensions: - regions: ["eu-west-2"] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ OSS_COMMERCIAL_ARM_RH8 }} + oss: {{ OSS_COMMERCIAL_ARM }} schedulers: ["slurm"] benchmarks: - mpi_variants: [ "openmpi", "intelmpi" ] @@ -274,7 +271,7 @@ test-suites: dimensions: - regions: [ "ca-central-1" ] instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ OSS_COMMERCIAL_ARM_RH8 }} + oss: {{ OSS_COMMERCIAL_ARM }} schedulers: [ "slurm" ] benchmarks: - mpi_variants: [ "openmpi", "intelmpi" ] @@ -286,7 +283,7 @@ test-suites: dimensions: - regions: ["ap-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ OSS_COMMERCIAL_X86_RH8 }} + oss: {{ OSS_COMMERCIAL_X86 
}} schedulers: ["slurm"] test_ebs.py::test_ebs_multiple: dimensions: @@ -296,14 +293,14 @@ test-suites: schedulers: ["slurm"] test_ebs.py::test_ebs_existing: dimensions: - - regions: ["me-south-1"] + - regions: ["eu-west-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["centos7", "rhel8"] schedulers: ["slurm"] # Ephemeral test requires instance type with instance store test_ephemeral.py::test_head_node_stop: dimensions: - - regions: ["us-east-1"] + - regions: ["use1-az4"] instances: ["m5d.xlarge", "h1.2xlarge"] oss: ["alinux2", "rhel8"] schedulers: ["slurm"] @@ -312,7 +309,7 @@ test-suites: dimensions: - regions: ["eu-central-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["ubuntu2204", "rhel8"] + oss: ["ubuntu2004", "rhel8"] test_update.py::test_update_compute_ami: dimensions: - regions: ["eu-west-1"] diff --git a/tests/integration-tests/configs/schedulers.yaml b/tests/integration-tests/configs/schedulers.yaml new file mode 100644 index 0000000000..6a6e2bb8d9 --- /dev/null +++ b/tests/integration-tests/configs/schedulers.yaml @@ -0,0 +1,212 @@ +{%- import 'common.jinja2' as common with context -%} +--- +test-suites: + cli_commands: + test_cli_commands.py::test_slurm_cli_commands: + dimensions: + - regions: ["ap-northeast-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu1804"] + schedulers: ["slurm"] + efa: + test_efa.py::test_efa: + dimensions: + - regions: ["sa-east-1"] + instances: ["c5n.9xlarge"] + oss: ["alinux2"] + schedulers: ["slurm"] + intel_hpc: + test_intel_hpc.py::test_intel_hpc: + dimensions: + - regions: ["us-east-2"] + instances: ["c5.18xlarge"] + oss: ["centos7"] + schedulers: ["slurm"] + scaling: + test_mpi.py::test_mpi: # TODO: move outside of the scaling dir + dimensions: + - regions: ["ap-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + - regions: ["eu-north-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: {{ common.OSS_COMMERCIAL_ARM }} + schedulers: ["slurm"] + test_mpi.py::test_mpi_ssh: + dimensions: + - regions: ["eu-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + schedulers: + test_awsbatch.py::test_awsbatch: + dimensions: + - regions: ["eu-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["awsbatch"] + - regions: ["ap-northeast-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: ["alinux2"] + schedulers: ["awsbatch"] + test_awsbatch.py::test_awsbatch_defaults: + dimensions: + - regions: ["eu-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["awsbatch"] + test_slurm.py::test_slurm: + dimensions: + - regions: ["eu-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + test_slurm.py::test_slurm_pmix: # TODO: include in main test_slurm to reduce number of created clusters + dimensions: + - regions: ["ap-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + - regions: ["ap-northeast-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: {{ common.OSS_COMMERCIAL_ARM }} + schedulers: ["slurm"] + test_slurm.py::test_slurm_scaling: + dimensions: + - regions: ["us-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_ONE_PER_DISTRO }} + schedulers: ["slurm"] + - regions: ["use2-az2"] # do not move, unless instance type support is 
moved as well + instances: [{{ common.instance("instance_type_1") }}] + oss: [ "alinux2" ] + schedulers: [ "slurm" ] + test_slurm.py::test_error_handling: + dimensions: + - regions: ["ca-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + test_slurm.py::test_slurm_protected_mode: + dimensions: + - regions: ["ca-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + test_slurm.py::test_slurm_protected_mode_on_cluster_create: + dimensions: + - regions: ["ap-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + test_slurm.py::test_fast_capacity_failover: + dimensions: + - regions: ["ap-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + test_slurm.py::test_slurm_config_update: + dimensions: + - regions: ["ap-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + test_slurm.py::test_slurm_memory_based_scheduling: + dimensions: + - regions: ["ap-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + test_slurm.py::test_scontrol_reboot: + dimensions: + - regions: ["us-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu2004"] + schedulers: ["slurm"] + test_slurm.py::test_scontrol_reboot_ec2_health_checks: + dimensions: + - regions: ["us-east-2"] + instances: ["t2.medium"] + oss: ["ubuntu2004"] + schedulers: ["slurm"] + test_slurm.py::test_slurm_overrides: + dimensions: + - regions: ["me-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu2004"] + schedulers: ["slurm"] + test_slurm.py::test_scontrol_update_nodelist_sorting: + dimensions: + - regions: ["ca-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + test_slurm_accounting.py::test_slurm_accounting: + dimensions: + - regions: ["us-east-1", "ap-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2", "ubuntu2004"] + schedulers: ["slurm"] + test_slurm_accounting.py::test_slurm_accounting_disabled_to_enabled_update: + dimensions: + - regions: ["us-west-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7", "ubuntu1804"] + schedulers: ["slurm"] + test_slurm.py::test_slurm_reconfigure_race_condition: + dimensions: + - regions: ["af-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: [ "slurm" ] + test_slurm.py::test_slurm_custom_config_parameters: + dimensions: + - regions: ["euw1-az1"] + instances: ["c5.xlarge"] + oss: ["alinux2"] + schedulers: ["slurm"] + update: + test_update.py::test_update_awsbatch: + dimensions: + - regions: ["eu-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + test_update.py::test_update_slurm: + dimensions: + - regions: ["eu-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + test_update.py::test_update_compute_ami: + dimensions: + - regions: ["eu-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + test_update.py::test_update_instance_list: + dimensions: + - regions: ["ap-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + test_update.py::test_queue_parameters_update: + dimensions: + - regions: ["ap-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + 
test_update.py::test_dynamic_file_systems_update: + dimensions: + - regions: ["eu-west-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + - regions: ["ap-northeast-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: {{ common.OSS_COMMERCIAL_ARM }} + schedulers: ["slurm"] + test_update.py::test_multi_az_create_and_update: + dimensions: + - regions: [ "eu-west-2" ] + schedulers: [ "slurm" ] + oss: ["alinux2"] + diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 189dbbae74..40737c3376 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -82,6 +82,7 @@ set_logger_formatter, to_pascal_case, ) +from xdist import get_xdist_worker_id from tests.common.osu_common import run_osu_benchmarks from tests.common.schedulers_common import get_scheduler_commands @@ -96,7 +97,7 @@ ) from tests.storage.snapshots_factory import EBSSnapshotsFactory -pytest_plugins = ["conftest_networking"] +pytest_plugins = ["conftest_networking", "conftest_resource_bucket"] def pytest_addoption(parser): @@ -132,10 +133,17 @@ def pytest_addoption(parser): parser.addoption("--post-install", help="url to post install script") parser.addoption("--vpc-stack", help="Name of an existing vpc stack.") parser.addoption("--cluster", help="Use an existing cluster instead of creating one.") - parser.addoption("--public-ecr-image-uri", help="S3 URI of the ParallelCluster API spec") + parser.addoption("--policies-uri", help="Use an existing policies URI instead of uploading one.") parser.addoption( - "--api-definition-s3-uri", help="URI of the Docker image for the Lambda of the ParallelCluster API" + "--cluster-custom-resource-service-token", + help="(Optional) ServiceToken (ARN) of the CloudFormation Cluster custom resource provider.", ) + parser.addoption( + "--resource-bucket", + help="(Optional) Name of bucket to use to look for standard resources like hosted CloudFormation templates.", + ) + parser.addoption("--lambda-layer-source", help="(Optional) S3 URI of lambda layer to copy rather than building.") + parser.addoption("--api-definition-s3-uri", help="URI of the OpenAPI spec of the ParallelCluster API") parser.addoption( "--api-infrastructure-s3-uri", help="URI of the CloudFormation template for the ParallelCluster API" ) @@ -157,10 +165,7 @@ def pytest_addoption(parser): help="use default IAM creds when running pcluster commands", action="store_true", ) - parser.addoption( - "--iam-user-role-stack-name", - help="Name of CFN stack providing IAM user roles.", - ) + parser.addoption("--iam-user-role-stack-name", help="Name of CFN stack providing IAM user roles.") parser.addoption( "--directory-stack-name", help="Name of CFN stack providing AD domain to be used for testing AD integration feature.", @@ -174,10 +179,7 @@ def pytest_addoption(parser): "--slurm-database-stack-name", help="Name of CFN stack providing database stack to be used for testing Slurm accounting feature.", ) - parser.addoption( - "--external-shared-storage-stack-name", - help="Name of existing external shared storage stack.", - ) + parser.addoption("--external-shared-storage-stack-name", help="Name of existing external shared storage stack.") def pytest_generate_tests(metafunc): @@ -414,9 +416,9 @@ def _cluster_factory(cluster_config, upper_case_cluster_name=False, custom_cli_c factory.destroy_all_clusters(test_passed=test_passed) -@pytest.fixture(scope="session") +@pytest.fixture(scope="class") def 
api_server_factory( - cfn_stacks_factory, request, public_ecr_image_uri, api_definition_s3_uri, api_infrastructure_s3_uri + cfn_stacks_factory, request, resource_bucket, policies_uri, api_definition_s3_uri, api_infrastructure_s3_uri ): """Creates a factory for deploying API servers on-demand to each region.""" api_servers = {} @@ -430,12 +432,14 @@ def _api_server_factory(server_region): ] if api_definition_s3_uri: params.append({"ParameterKey": "ApiDefinitionS3Uri", "ParameterValue": api_definition_s3_uri}) - if public_ecr_image_uri: - params.append({"ParameterKey": "PublicEcrImageUri", "ParameterValue": public_ecr_image_uri}) + if policies_uri: + params.append({"ParameterKey": "PoliciesTemplateUri", "ParameterValue": policies_uri}) + if resource_bucket: + params.append({"ParameterKey": "CustomBucket", "ParameterValue": resource_bucket}) template = ( api_infrastructure_s3_uri - or f"https://{server_region}-aws-parallelcluster.s3.{server_region}.amazonaws.com" + or f"https://{resource_bucket}.s3.{server_region}.amazonaws.com" f"{'.cn' if server_region.startswith('cn') else ''}" f"/parallelcluster/{get_installed_parallelcluster_version()}/api/parallelcluster-api.yaml" ) @@ -564,12 +568,7 @@ def pcluster_config_reader(test_datadir, vpc_stack, request, region, scheduler_p :return: a _config_renderer(**kwargs) function which gets as input a dictionary of values to replace in the template """ - def _config_renderer( - config_file="pcluster.config.yaml", - benchmarks=None, - output_file=None, - **kwargs, - ): + def _config_renderer(config_file="pcluster.config.yaml", benchmarks=None, output_file=None, **kwargs): config_file_path = test_datadir / config_file if not os.path.isfile(config_file_path): raise FileNotFoundError(f"Cluster config file not found in the expected dir {config_file_path}") @@ -782,11 +781,9 @@ def _get_default_template_values(vpc_stack: CfnVpcStack, request): ): default_values["scheduler"] = "plugin" default_values["imds_secured"] = default_values.get("scheduler") in SCHEDULERS_SUPPORTING_IMDS_SECURED - default_values["scheduler_prefix"] = { - "slurm": "Slurm", - "awsbatch": "AwsBatch", - "plugin": "Scheduler", - }.get(default_values.get("scheduler")) + default_values["scheduler_prefix"] = {"slurm": "Slurm", "awsbatch": "AwsBatch", "plugin": "Scheduler"}.get( + default_values.get("scheduler") + ) return default_values @@ -929,19 +926,19 @@ def read_template(template_path): logging.warning("Skipping deletion of IAM roles stack because --no-delete option is set") -@pytest.fixture(scope="session") -def public_ecr_image_uri(request): - return request.config.getoption("public_ecr_image_uri") - - @pytest.fixture(scope="session") def api_uri(request): return request.config.getoption("api_uri") -@pytest.fixture(scope="session") -def api_definition_s3_uri(request): - return request.config.getoption("api_definition_s3_uri") +@pytest.fixture(scope="class") +def api_definition_s3_uri(request, resource_bucket): + if request.config.getoption("api_definition_s3_uri"): + return request.config.getoption("api_definition_s3_uri") + return ( + f"s3://{resource_bucket}/parallelcluster/{get_installed_parallelcluster_version()}/" + f"api/ParallelCluster.openapi.yaml" + ) @pytest.fixture(scope="session") @@ -949,6 +946,16 @@ def api_infrastructure_s3_uri(request): return request.config.getoption("api_infrastructure_s3_uri") +@pytest.fixture(scope="session") +def cluster_custom_resource_service_token(request): + return request.config.getoption("cluster_custom_resource_service_token") + + 
+@pytest.fixture(scope="session") +def lambda_layer_source(request): + return request.config.getoption("lambda_layer_source") + + @pytest.fixture(scope="class") def s3_bucket_factory(request, region): """ @@ -1090,9 +1097,10 @@ def serial_execution_by_instance(request, instance): lock_file = f"{outdir}/{instance}.lock" lock = FileLock(lock_file=lock_file) logging.info("Acquiring lock file %s", lock.lock_file) - with lock.acquire(poll_interval=15, timeout=7200): + with lock.acquire(poll_interval=15, timeout=12000): + logging.info(f"The lock is acquired by worker ID {get_xdist_worker_id(request)}: {os.getpid()}") yield - logging.info("Releasing lock file %s", lock.lock_file) + logging.info(f"Releasing lock file {lock.lock_file} by {get_xdist_worker_id(request)}: {os.getpid()}") lock.release() else: logging.info("Ignoring serial execution for instance %s", instance) @@ -1180,20 +1188,20 @@ def odcr_stack(request, region, placement_group_stack, cfn_stacks_factory, vpc_s odcr_template.set_version() odcr_template.set_description("ODCR stack to test open, targeted, and PG ODCRs") public_subnet = vpc_stack.get_public_subnet() - public_subnets = vpc_stack.get_all_public_subnets().copy() - public_subnets.remove(public_subnet) - availability_zone = boto3.resource("ec2").Subnet(public_subnet).availability_zone - availability_zone_2 = boto3.resource("ec2").Subnet(public_subnets[0]).availability_zone + public_subnets = vpc_stack.get_all_public_subnets() + default_public_az = boto3.resource("ec2").Subnet(public_subnet).availability_zone + availability_zone_1 = boto3.resource("ec2").Subnet(public_subnets[0]).availability_zone + availability_zone_2 = boto3.resource("ec2").Subnet(public_subnets[1]).availability_zone open_odcr = ec2.CapacityReservation( "integTestsOpenOdcr", - AvailabilityZone=availability_zone, + AvailabilityZone=default_public_az, InstanceCount=4, InstancePlatform="Linux/UNIX", InstanceType="m5.2xlarge", ) target_odcr = ec2.CapacityReservation( "integTestsTargetOdcr", - AvailabilityZone=availability_zone, + AvailabilityZone=default_public_az, InstanceCount=4, InstancePlatform="Linux/UNIX", InstanceType="r5.xlarge", @@ -1202,7 +1210,7 @@ def odcr_stack(request, region, placement_group_stack, cfn_stacks_factory, vpc_s pg_name = placement_group_stack.cfn_resources["PlacementGroup"] pg_odcr = ec2.CapacityReservation( "integTestsPgOdcr", - AvailabilityZone=availability_zone, + AvailabilityZone=default_public_az, InstanceCount=2, InstancePlatform="Linux/UNIX", InstanceType="m5.xlarge", @@ -1250,7 +1258,7 @@ def odcr_stack(request, region, placement_group_stack, cfn_stacks_factory, vpc_s # odcr resources for MultiAZ integ-tests az1_odcr = ec2.CapacityReservation( "az1Odcr", - AvailabilityZone=availability_zone, + AvailabilityZone=availability_zone_1, InstanceCount=2, InstancePlatform="Linux/UNIX", InstanceType="t3.micro", @@ -1375,18 +1383,10 @@ def _copy_image(image_id, test_name): # Created tag for copied image to be filtered by cleanup ami pipeline client.create_tags( - Resources=[ - f"{copy_ami_id}", - ], + Resources=[f"{copy_ami_id}"], Tags=[ - { - "Key": "parallelcluster:image_id", - "Value": f"aws-parallelcluster-copied-image-{test_name}", - }, - { - "Key": "parallelcluster:build_status", - "Value": "available", - }, + {"Key": "parallelcluster:image_id", "Value": f"aws-parallelcluster-copied-image-{test_name}"}, + {"Key": "parallelcluster:build_status", "Value": "available"}, ], ) return copy_ami_id @@ -1456,10 +1456,7 @@ def _run_benchmarks(remote_command_executor, scheduler_commands, 
**kwargs): dimensions, ) for metric_data in metric_data_list: - cloudwatch_client.put_metric_data( - Namespace=metric_namespace, - MetricData=metric_data, - ) + cloudwatch_client.put_metric_data(Namespace=metric_namespace, MetricData=metric_data) logging.info("Finished benchmarks for %s", function_name) yield _run_benchmarks @@ -1475,9 +1472,7 @@ def scheduler_plugin_configuration(request, region, scheduler_plugin_definitions scheduler_definition_url = scheduler_plugin_definitions.get(scheduler, {}).get(region, {}) if scheduler_definition_url: logging.info( - "Adding scheduler plugin (%s) scheduler-definition-url to be (%s)", - scheduler, - scheduler_definition_url, + "Adding scheduler plugin (%s) scheduler-definition-url to be (%s)", scheduler, scheduler_definition_url ) scheduler_plugin["scheduler-definition-url"] = scheduler_definition_url @@ -1534,12 +1529,7 @@ def _fsx_factory(ports, ip_protocols, file_system_type, num=1, **kwargs): "FSxSecurityGroup", GroupDescription="SecurityGroup for testing existing FSx", SecurityGroupIngress=[ - ec2.SecurityGroupRule( - IpProtocol=ip_protocol, - FromPort=port, - ToPort=port, - CidrIp="0.0.0.0/0", - ) + ec2.SecurityGroupRule(IpProtocol=ip_protocol, FromPort=port, ToPort=port, CidrIp="0.0.0.0/0") for port in ports for ip_protocol in ip_protocols ], @@ -1593,9 +1583,7 @@ def _svm_factory(file_system_id, num_volumes=1): fsx_svm_template.set_description("Create Storage Virtual Machine stack") fsx_svm = StorageVirtualMachine( - title="StorageVirtualMachineFileSystemResource", - Name="fsx", - FileSystemId=file_system_id, + title="StorageVirtualMachineFileSystemResource", Name="fsx", FileSystemId=file_system_id ) fsx_svm_template.add_resource(fsx_svm) @@ -1733,9 +1721,7 @@ def create_mount_targets(efs_ids): vpc_id = vpc_stack.cfn_outputs["VpcId"] security_group = template.add_resource( ec2.SecurityGroup( - "SecurityGroupResource", - GroupDescription="custom security group for EFS mount targets", - VpcId=vpc_id, + "SecurityGroupResource", GroupDescription="custom security group for EFS mount targets", VpcId=vpc_id ) ) # Allow inbound connection though NFS port within the VPC diff --git a/tests/integration-tests/conftest_resource_bucket.py b/tests/integration-tests/conftest_resource_bucket.py new file mode 100644 index 0000000000..5bac9de246 --- /dev/null +++ b/tests/integration-tests/conftest_resource_bucket.py @@ -0,0 +1,143 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ + +import logging +import os +import re +import shutil +import subprocess +import sys +import tarfile +import tempfile +from pathlib import Path + +import boto3 +import pkg_resources +import pytest +import urllib3 +from framework.fixture_utils import xdist_session_fixture + +from tests.common.utils import get_installed_parallelcluster_version + +logger = logging.getLogger() +NODE_VERSION = "v16.19.0" # maintenance version compatible with alinux2's GLIBC + + +def install_pc(basepath, pc_version): + """Install ParallelCluster to a temporary directory""" + tempdir = Path(basepath) / "python" + root = Path(pkg_resources.resource_filename(__name__, "/../..")) + cli_dir = root / "cli" + try: + logger.info("installing ParallelCluster packages...") + subprocess.check_call([sys.executable, "-m", "pip", "install", f"{cli_dir}[awslambda]", "-t", tempdir]) + # The following are provided by the lambda runtime + shutil.rmtree(tempdir / "botocore") + shutil.rmtree(tempdir / "boto3") + except subprocess.CalledProcessError: + logger.info(f"Error while installing ParallelCluster {get_installed_parallelcluster_version()}") + sys.exit(-1) + + +def install_node(basepath, node_version): + """Install Node to a temporary directory""" + node_root = f"node-{node_version}-linux-x64" + node_file = f"{node_root}.tar.xz" + node_url = f"https://nodejs.org/dist/{node_version}/{node_file}" + logger.info(f"Node URL: {node_url}") + + logger.info(f"Retrieving Node {node_version}") + http = urllib3.PoolManager() + resp = http.request("GET", node_url) + + with tempfile.TemporaryDirectory() as nodetmp: + with open(f"{nodetmp}/{node_file}", "wb") as fout: + fout.write(resp.data) + fout.close() + + with tarfile.open(f"{nodetmp}/{node_file}", mode="r:xz") as tar: + tar.extractall(f"{nodetmp}/node_install") + + tempdir = Path(basepath) / "bin" + os.makedirs(tempdir, exist_ok=True) + shutil.copy(f"{nodetmp}/node_install/{node_root}/bin/node", tempdir / "node") + + +@pytest.fixture(scope="class", name="policies_uri") +def policies_uri_fixture(request, region, resource_bucket): + if request.config.getoption("policies_uri"): + yield request.config.getoption("policies_uri") + return + + yield ( + f"https://{resource_bucket}.s3.{region}.amazonaws.com{'.cn' if region.startswith('cn') else ''}" + f"/parallelcluster/{get_installed_parallelcluster_version()}/templates/policies/policies.yaml" + ) + + +def get_resource_map(): + version = get_installed_parallelcluster_version() + prefix = f"parallelcluster/{version}" + resources = { + "api/infrastructure/parallelcluster-api.yaml": f"{prefix}/api/parallelcluster-api.yaml", + "api/spec/openapi/ParallelCluster.openapi.yaml": f"{prefix}/api/ParallelCluster.openapi.yaml", + "cloudformation/custom_resource/cluster.yaml": f"{prefix}/templates/custom_resource/cluster.yaml", + "cloudformation/networking/public.cfn.json": f"{prefix}/templates/networking/public-{version}.cfn.json", + "cloudformation/networking/public-private.cfn.json": ( + f"{prefix}/templates/networking/public-private-{version}.cfn.json" + ), + "cloudformation/policies/parallelcluster-policies.yaml": f"{prefix}/templates/policies/policies.yaml", + } + return resources + + +@xdist_session_fixture() +def resource_bucket_shared(request, s3_bucket_factory_shared, lambda_layer_source): + root = Path(pkg_resources.resource_filename(__name__, "/../..")) + if request.config.getoption("resource_bucket"): + return # short-circuit this fixture if a resource-bucket is provided + + for region, s3_bucket in s3_bucket_factory_shared.items(): + 
logger.info(f"Uploading artifacts to: {s3_bucket}[{region}]") + for file, key in get_resource_map().items(): + logger.info(f" {root / file} -> {s3_bucket}/{key}") + boto3.resource("s3").Bucket(s3_bucket).upload_file(str(root / file), key) + + layer_key = ( + f"parallelcluster/{get_installed_parallelcluster_version()}/layers/aws-parallelcluster/lambda-layer.zip" + ) + if lambda_layer_source: + bucket, key = re.search(r"s3://([^/]*)/(.*)", lambda_layer_source).groups() + source = {"Bucket": bucket, "Key": key} + logger.info(f"Copying Lambda Layer from: s3://{bucket}/{key} -> s3://{s3_bucket}/{layer_key}") + boto3.resource("s3").Bucket(s3_bucket).copy(source, layer_key) + else: + with tempfile.TemporaryDirectory() as basepath: + install_pc(basepath, get_installed_parallelcluster_version()) + install_node(basepath, NODE_VERSION) + + with tempfile.NamedTemporaryFile(suffix=".zip") as zipfile: + zipfilename = Path(zipfile.name) + logger.info(f" {zipfilename} -> {s3_bucket}/{layer_key}") + shutil.make_archive(zipfilename.with_suffix(""), format="zip", root_dir=basepath) + boto3.resource("s3").Bucket(s3_bucket).upload_file(str(zipfilename), layer_key) + + logger.info(s3_bucket_factory_shared) + return s3_bucket_factory_shared + + +@pytest.fixture(scope="class") +def resource_bucket(request, region, resource_bucket_shared): + if request.config.getoption("resource_bucket"): + return request.config.getoption("resource_bucket") + return resource_bucket_shared[region] diff --git a/tests/integration-tests/resources/cluster_custom_resource.yaml b/tests/integration-tests/resources/cluster_custom_resource.yaml new file mode 100644 index 0000000000..f48ecb2ef0 --- /dev/null +++ b/tests/integration-tests/resources/cluster_custom_resource.yaml @@ -0,0 +1,93 @@ +AWSTemplateFormatVersion: '2010-09-09' +Description: AWS ParallelCluster CloudFormation Cluster + +Parameters: + ClusterName: + Description: Name of cluster. Note this must be different than the stack name. + Type: String + HeadNodeSubnet: + Description: Subnet for the HeadNode + Type: String + ComputeNodeSubnet: + Description: Subnet for the ComputeNode + Type: String + ComputeInstanceMax: + Description: Maximum number of compute instances + Type: Number + Default: 16 + ServiceToken: + Description: ARN of Lambda Function backing the Cluster Resource + Type: String + Os: + Description: Operating system for nodes + Type: String + Default: 'alinux2' + OnNodeConfigured: + Description: Script to run on HeadNode configured + Type: String + Default: '' + CustomBucketAccess: + Description: Name of a bucket to provide access to on the HeadNode + Type: String + Default: '' + DeletionPolicy: + Type: String + Default: Delete + AllowedValues: + - Delete + - Retain + Description: Enter Retain or Delete to define the operation when the stack is deleted. Default is to Delete. 
+ +Conditions: + OnNodeConfiguredCondition: !Not [!Equals [!Ref OnNodeConfigured, '']] + CustomBucketCondition: !Not [!Equals [!Ref CustomBucketAccess, '']] + +Resources: + PclusterCluster: + Type: Custom::PclusterCluster + Properties: + ServiceToken: !Ref ServiceToken + DeletionPolicy: !Ref DeletionPolicy + ClusterName: !Ref ClusterName + ClusterConfiguration: + DevSettings: + AmiSearchFilters: + Owner: self + Image: + Os: !Ref Os + HeadNode: + InstanceType: t2.small + Networking: + SubnetId: !Ref HeadNodeSubnet + CustomActions: !If + - OnNodeConfiguredCondition + - + OnNodeConfigured: + Script: !Ref OnNodeConfigured + - !Ref AWS::NoValue + Iam: !If + - CustomBucketCondition + - + S3Access: + - BucketName: !Ref CustomBucketAccess + EnableWriteAccess: false + - !Ref AWS::NoValue + Scheduling: + Scheduler: slurm + SlurmQueues: + - Name: queue0 + ComputeResources: + - Name: queue0-cr0 + InstanceType: t2.micro + MaxCount: !Ref ComputeInstanceMax + Networking: + SubnetIds: + - !Ref ComputeNodeSubnet + +Outputs: + HeadNodeIp: + Description: The Public IP address of the HeadNode + Value: !GetAtt [ PclusterCluster, headNode.publicIpAddress ] + ValidationMessages: + Description: Any warnings from cluster create or update operations. + Value: !GetAtt PclusterCluster.validationMessages diff --git a/tests/integration-tests/test_runner.py b/tests/integration-tests/test_runner.py index 486844310a..3a18f027d9 100644 --- a/tests/integration-tests/test_runner.py +++ b/tests/integration-tests/test_runner.py @@ -69,9 +69,9 @@ "vpc_stack": None, "api_uri": None, "cluster": None, + "policies_uri": None, "api_definition_s3_uri": None, "api_infrastructure_s3_uri": None, - "public_ecr_image_uri": None, "no_delete": False, "benchmarks": False, "benchmarks_target_capacity": 200, @@ -86,6 +89,9 @@ "ldaps_nlb_stack_name": None, "slurm_database_stack_name": None, "external_shared_storage_stack_name": None, + "cluster_custom_resource_service_token": None, + "resource_bucket": None, + "lambda_layer_source": None, } @@ -304,10 +307,28 @@ def _init_argparser(): type=int, ) + custom_resource_group = parser.add_argument_group("CloudFormation / Custom Resource options") + custom_resource_group.add_argument( + "--cluster-custom-resource-service-token", + help="ServiceToken (ARN) of the Cluster CloudFormation custom resource provider", + default=TEST_DEFAULTS.get("cluster_custom_resource_service_token"), + ) + + custom_resource_group.add_argument( + "--resource-bucket", + help="Name of bucket to use to retrieve standard hosted resources like CloudFormation templates.", + default=TEST_DEFAULTS.get("resource_bucket"), + ) + custom_resource_group.add_argument( + "--lambda-layer-source", + help="S3 URI of lambda layer to copy instead of building.", + default=TEST_DEFAULTS.get("lambda_layer_source"), + ) + api_group = parser.add_argument_group("API options") api_group.add_argument( "--api-definition-s3-uri", - help="URI of the Docker image for the Lambda of the ParallelCluster API", + help="URI of the OpenAPI spec of the ParallelCluster API", default=TEST_DEFAULTS.get("api_definition_s3_uri"), ) api_group.add_argument( @@ -316,14 +337,12 @@ default=TEST_DEFAULTS.get("api_definition_s3_uri"), ) api_group.add_argument( - "--public-ecr-image-uri", - help="S3 URI of the ParallelCluster API spec", - default=TEST_DEFAULTS.get("public_ecr_image_uri"), + "--api-uri", help="URI of an existing ParallelCluster API", default=TEST_DEFAULTS.get("api_uri") ) api_group.add_argument( - "--api-uri", - help="URI of an existing 
ParallelCluster API", - default=TEST_DEFAULTS.get("api_uri"), + "--policies-uri", + help="Use an existing policies URI instead of uploading one.", + default=TEST_DEFAULTS.get("policies_uri"), ) debug_group = parser.add_argument_group("Debugging/Development options") @@ -509,6 +528,7 @@ def _get_pytest_args(args, regions, log_file, out_dir): # noqa: C901 _set_ami_args(args, pytest_args) _set_custom_stack_args(args, pytest_args) _set_api_args(args, pytest_args) + _set_custom_resource_args(args, pytest_args) return pytest_args @@ -579,13 +599,19 @@ def _set_custom_stack_args(args, pytest_args): pytest_args.extend(["--external-shared-storage-stack-name", args.external_shared_storage_stack_name]) +def _set_custom_resource_args(args, pytest_args): + if args.cluster_custom_resource_service_token: + pytest_args.extend(["--cluster-custom-resource-service-token", args.cluster_custom_resource_service_token]) + if args.resource_bucket: + pytest_args.extend(["--resource-bucket", args.resource_bucket]) + if args.lambda_layer_source: + pytest_args.extend(["--lambda-layer-source", args.lambda_layer_source]) + + def _set_api_args(args, pytest_args): if args.api_definition_s3_uri: pytest_args.extend(["--api-definition-s3-uri", args.api_definition_s3_uri]) - if args.public_ecr_image_uri: - pytest_args.extend(["--public-ecr-image-uri", args.public_ecr_image_uri]) - if args.api_uri: pytest_args.extend(["--api-uri", args.api_uri]) diff --git a/tests/integration-tests/tests/cloudwatch_logging/test_cloudwatch_logging.py b/tests/integration-tests/tests/cloudwatch_logging/test_cloudwatch_logging.py index 41fd1a3189..0a5e2502b3 100644 --- a/tests/integration-tests/tests/cloudwatch_logging/test_cloudwatch_logging.py +++ b/tests/integration-tests/tests/cloudwatch_logging/test_cloudwatch_logging.py @@ -145,10 +145,14 @@ def _dump_cluster_log_state(self): @staticmethod def _base_os_to_platform(base_os): """Turn the name of a base OS into the platform.""" - # Special case: alinux2 is how the config file refers to amazon linux 2, but in the chef cookbook - # (and the cloudwatch log config produced by it) the platform is "amazon". 
+ # Special case: in the files of the cookbook regarding the cloudwatch agent under + # cookbooks/aws-parallelcluster-config/files/default/cloudwatch the configurations refer to: + # * "alinux2" as platform "amazon" + # * "rhel8" as platform "redhat" if base_os == "alinux2": return "amazon" + elif base_os == "rhel8": + return "redhat" else: return base_os.rstrip(string.digits) @@ -379,7 +383,11 @@ def _populate_compute_log_existence(self): """Figure out which of the relevant logs for the ComputeFleet nodes don't exist.""" if self.compute_nodes_count == 0: return - critical_compute_node_logs = ["/var/log/parallelcluster/computemgtd"] if self.scheduler == "slurm" else [] + critical_compute_node_logs = ( + ["/var/log/parallelcluster/computemgtd", "/var/log/parallelcluster/bootstrap_error_msg"] + if self.scheduler == "slurm" + else [] + ) for log_dict in self._relevant_logs.get(COMPUTE_NODE_ROLE_NAME): log_path = log_dict.get("file_path") if log_path in critical_compute_node_logs: diff --git a/tests/integration-tests/tests/common/assertions.py b/tests/integration-tests/tests/common/assertions.py index 9ac52c645d..538241b482 100644 --- a/tests/integration-tests/tests/common/assertions.py +++ b/tests/integration-tests/tests/common/assertions.py @@ -189,6 +189,18 @@ def assert_instance_has_desired_imds_v2_setting(instance, status): assert_that(imds_v2_status).is_equal_to(status) +def assert_instance_has_desired_tags(instance, tags: List[dict]): + instance_id = instance.get("InstanceId") + instance_tags = instance.get("Tags") + instance_name = [tag["Value"] for tag in instance_tags if tag["Key"] == "Name"] + instance_name_part = f" ({instance_name[0]})" if instance_name else "" + + logging.info(f"Instance {instance_id}{instance_name_part} has tags {instance_tags}") + + for tag in tags: + assert_that(instance_tags).contains(tag) + + def assert_aws_identity_access_is_correct(cluster, users_allow_list, remote_command_executor=None): logging.info("Asserting access to AWS caller identity is correct") @@ -197,7 +209,7 @@ for user, allowed in users_allow_list.items(): logging.info(f"Asserting access to AWS caller identity is {'allowed' if allowed else 'denied'} for user {user}") - command = f"sudo -u {user} aws sts get-caller-identity" + command = f"sudo -u {user} aws sts get-caller-identity --region {cluster.region}" result = remote_command_executor.run_remote_command(command, raise_on_error=False) logging.info(f"user={user} and result.failed={result.failed}") logging.info(f"user={user} and result.stdout={result.stdout}") diff --git a/tests/integration-tests/tests/common/schedulers_common.py b/tests/integration-tests/tests/common/schedulers_common.py index 040a855369..ab7d0e8256 100644 --- a/tests/integration-tests/tests/common/schedulers_common.py +++ b/tests/integration-tests/tests/common/schedulers_common.py @@ -214,24 +214,16 @@ def _job_status_retryer(): return _job_status_retryer() def get_job_exit_status(self, job_id): # noqa: D102 - result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)) - match = re.search(r"ExitCode=(.+?) ", result.stdout) - return match.group(1) + return self.get_job_info(job_id, field="ExitCode") def get_job_start_time(self, job_id): # noqa: D102 - result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)) - match = re.search(r"StartTime=(.+?) 
", result.stdout) - return match.group(1) + return self.get_job_info(job_id, field="StartTime") def get_job_submit_time(self, job_id): # noqa: D102 - result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)) - match = re.search(r"SubmitTime=(.+?) ", result.stdout) - return match.group(1) + return self.get_job_info(job_id, field="SubmitTime") def get_job_eligible_time(self, job_id): # noqa: D102 - result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)) - match = re.search(r"EligibleTime=(.+?) ", result.stdout) - return match.group(1) + return self.get_job_info(job_id, field="EligibleTime") def assert_job_submitted(self, sbatch_output): # noqa: D102 __tracebackhide__ = True @@ -412,6 +404,16 @@ def get_partitions(self): result = self._remote_command_executor.run_remote_command(check_partitions_cmd) return result.stdout.splitlines() + def get_partition_info(self, partition, field=None): + """Return partition details. If field is provided, only the field is returned.""" + result = self._remote_command_executor.run_remote_command( + "scontrol show partition {0}".format(partition) + ).stdout + if field is not None: + match = re.search(rf"(\s{field})=(\S*)", result) + return match.group(2) + return result + def get_job_info(self, job_id, field=None): """Return job details from slurm. If field is provided, only the field is returned""" result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)).stdout diff --git a/tests/integration-tests/tests/common/utils.py b/tests/integration-tests/tests/common/utils.py index 51de251ca1..d6921eb6c4 100644 --- a/tests/integration-tests/tests/common/utils.py +++ b/tests/integration-tests/tests/common/utils.py @@ -51,6 +51,8 @@ "centos7": {"name": "FPGA Developer AMI*", "owners": ["679593333241"]}, "ubuntu1804": {"name": "Deep Learning Base AMI (Ubuntu 18.04)*", "owners": ["amazon"]}, "ubuntu2004": {"name": "Deep Learning AMI GPU CUDA * (Ubuntu 20.04)*", "owners": ["amazon"]}, + # Simple redhat8 to be able to build in remarkable test + "rhel8": {"name": "RHEL-8.7*_HVM*", "owners": ["309956199498", "841258680906", "219670896067"]}, } OS_TO_KERNEL4_AMI_NAME_OWNER_MAP = { @@ -76,7 +78,15 @@ } -def retrieve_latest_ami(region, os, ami_type="official", architecture="x86_64", additional_filters=None, request=None): +def retrieve_latest_ami( + region, + os, + ami_type="official", + architecture="x86_64", + additional_filters=None, + request=None, + allow_private_ami=False, +): if additional_filters is None: additional_filters = [] try: @@ -90,6 +100,7 @@ and not request.config.getoption("pcluster_git_ref") and not request.config.getoption("cookbook_git_ref") and not request.config.getoption("node_git_ref") + and not allow_private_ami ): # If none of Git refs is provided, the test is running against released version. 
# Then retrieve public pcluster AMIs additional_filters.append({"Name": "is-public", "Values": ["true"]}) @@ -195,9 +206,21 @@ def get_installed_parallelcluster_base_version(): return pkg_resources.packaging.version.parse(get_installed_parallelcluster_version()).base_version +def get_aws_domain(region: str): + """Get AWS domain for the given region.""" + if region.startswith("cn-"): + return "amazonaws.com.cn" + elif region.startswith("us-iso-"): + return "c2s.ic.gov" + elif region.startswith("us-isob-"): + return "sc2s.sgov.gov" + else: + return "amazonaws.com" + + def get_sts_endpoint(region): """Get regionalized STS endpoint.""" - return "https://sts.{0}.{1}".format(region, "amazonaws.com.cn" if region.startswith("cn-") else "amazonaws.com") + return "https://sts.{0}.{1}".format(region, get_aws_domain(region)) def generate_random_string(): @@ -231,7 +254,9 @@ def reboot_head_node(cluster, remote_command_executor=None): logging.info(f"result.failed={result.failed}") logging.info(f"result.stdout={result.stdout}") wait_head_node_running(cluster) - time.sleep(120) # Wait time is required for the head node to complete the reboot + # Wait time is required for the head node to complete the reboot. + # We observed that headnode in US isolated regions may take more time to reboot. + time.sleep(240 if "us-iso" in cluster.region else 120) logging.info(f"Rebooted head node for cluster: {cluster.name}") @@ -308,3 +333,27 @@ def run_system_analyzer(cluster, scheduler_commands_factory, request, partition= preserve_mode=False, ) logging.info("Compute node system information correctly retrieved.") + + +@retry(stop_max_attempt_number=5, wait_fixed=seconds(3)) +def read_remote_file(remote_command_executor, file_path): + """Reads the content of a remote file.""" + logging.info(f"Retrieving remote file {file_path}") + result = remote_command_executor.run_remote_command(f"cat {file_path}") + assert_that(result.failed).is_false() + return result.stdout.strip() + + +@retry(stop_max_attempt_number=60, wait_fixed=seconds(180)) +def wait_process_completion(remote_command_executor, pid): + """Waits for a process with the given pid to terminate.""" + logging.info("Waiting for performance test to complete") + command = f""" + ps --pid {pid} > /dev/null + [ "$?" -ne 0 ] && echo "COMPLETE" || echo "RUNNING" + """ + result = remote_command_executor.run_remote_command(command) + if result.stdout == "RUNNING": + raise Exception("The process is still running") + else: + return result.stdout.strip() diff --git a/tests/integration-tests/tests/configure/test_pcluster_configure.py b/tests/integration-tests/tests/configure/test_pcluster_configure.py index 94854b895a..d61f9c538a 100644 --- a/tests/integration-tests/tests/configure/test_pcluster_configure.py +++ b/tests/integration-tests/tests/configure/test_pcluster_configure.py @@ -10,6 +10,7 @@ # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. 
import logging +from datetime import datetime from os import environ import boto3 @@ -58,13 +59,14 @@ def test_pcluster_configure( skip_if_unsupported_test_options_were_used(request) config_path = test_datadir / "config.yaml" - _create_and_test_standard_configuration(config_path, region, key_name, scheduler, os, instance, vpc_stack) + _create_and_test_standard_configuration(request, config_path, region, key_name, scheduler, os, instance, vpc_stack) inject_additional_config_settings(config_path, request, region) clusters_factory(config_path) def test_pcluster_configure_avoid_bad_subnets( + request, vpc_stack, subnet_in_use1_az3, pcluster_config_reader, @@ -94,12 +96,12 @@ def test_pcluster_configure_avoid_bad_subnets( ] ) stages = orchestrate_pcluster_configure_stages(prompts=bad_subnets_prompts, scheduler=scheduler) - assert_configure_workflow(region, stages, config_path) + assert_configure_workflow(request, region, stages, config_path) assert_config_contains_expected_values(key_name, scheduler, os, instance, region, None, None, config_path) def test_region_without_t2micro( - vpc_stack: CfnVpcStack, pcluster_config_reader, key_name, region, os, scheduler, test_datadir + request, vpc_stack: CfnVpcStack, pcluster_config_reader, key_name, region, os, scheduler, test_datadir ): """ Verify the default instance type (free tier) is retrieved dynamically according to region. @@ -112,7 +114,7 @@ def test_region_without_t2micro( + standard_vpc_subnet_prompts(vpc_stack) ) stages = orchestrate_pcluster_configure_stages(region_without_t2micro_prompts, scheduler) - assert_configure_workflow(region, stages, config_path) + assert_configure_workflow(request, region, stages, config_path) assert_config_contains_expected_values( key_name, scheduler, @@ -172,7 +174,7 @@ def test_efa_and_placement_group( + standard_vpc_subnet_prompts(vpc_stack) ) stages = orchestrate_pcluster_configure_stages(standard_prompts, scheduler) - assert_configure_workflow(region, stages, config_path) + assert_configure_workflow(request, region, stages, config_path) assert_config_contains_expected_values( key_name, scheduler, @@ -189,13 +191,13 @@ def test_efa_and_placement_group( clusters_factory(config_path) -def test_efa_unsupported(vpc_stack, key_name, region, os, instance, scheduler, clusters_factory, test_datadir): +def test_efa_unsupported(request, vpc_stack, key_name, region, os, instance, scheduler, clusters_factory, test_datadir): config_path = test_datadir / "config.yaml" - _create_and_test_standard_configuration(config_path, region, key_name, scheduler, os, instance, vpc_stack) + _create_and_test_standard_configuration(request, config_path, region, key_name, scheduler, os, instance, vpc_stack) def _create_and_test_standard_configuration( - config_path, region, key_name, scheduler, os, instance, vpc_stack: CfnVpcStack + request, config_path, region, key_name, scheduler, os, instance, vpc_stack: CfnVpcStack ): standard_prompts = ( standard_first_stage_prompts(region, key_name, scheduler, os, instance) @@ -203,7 +205,7 @@ def _create_and_test_standard_configuration( + standard_vpc_subnet_prompts(vpc_stack) ) stages = orchestrate_pcluster_configure_stages(standard_prompts, scheduler) - assert_configure_workflow(region, stages, config_path) + assert_configure_workflow(request, region, stages, config_path) assert_config_contains_expected_values( key_name, scheduler, @@ -306,19 +308,24 @@ def get_unsupported_test_runner_options(request): return [option for option in unsupported_options if request.config.getoption(option) is not 
None] -def assert_configure_workflow(region, stages, config_path): +def assert_configure_workflow(request, region, stages, config_path): logging.info(f"Using `pcluster configure` to write a configuration to {config_path}") environ["AWS_DEFAULT_REGION"] = region - configure_process = pexpect.spawn(f"pcluster configure --config {config_path}") - for stage in stages: - configure_prompt_status = configure_process.expect(stage.get("prompt")) - assert_that(configure_prompt_status).is_equal_to(0) - configure_process.sendline(stage.get("response")) - - # Expecting EOF verifies that `pcluster configure` finished as expected. - configure_process.expect(pexpect.EOF) - configure_process.close() - assert_that(configure_process.exitstatus).is_equal_to(0) + configure_process = pexpect.spawn(f"pcluster configure --config {config_path}", encoding="utf-8", timeout=90) + output_dir = request.config.getoption("output_dir") + with open( + f"{output_dir}/spawned-process-log-{datetime.now().strftime('%d-%m-%Y-%H:%M:%S.%f')}", "w", encoding="utf-8" + ) as spawned_process_log: + configure_process.logfile = spawned_process_log + for stage in stages: + configure_prompt_status = configure_process.expect(stage.get("prompt")) + assert_that(configure_prompt_status).is_equal_to(0) + configure_process.sendline(stage.get("response")) + + # Expecting EOF verifies that `pcluster configure` finished as expected. + configure_process.expect(pexpect.EOF) + configure_process.close() + assert_that(configure_process.exitstatus).is_equal_to(0) # Log the generated config's contents so debugging doesn't always require digging through Jenkins artifacts with open(config_path, encoding="utf-8") as config_file: diff --git a/tests/integration-tests/tests/create/test_create.py b/tests/integration-tests/tests/create/test_create.py index a7e967058a..ef17cf93a4 100644 --- a/tests/integration-tests/tests/create/test_create.py +++ b/tests/integration-tests/tests/create/test_create.py @@ -104,12 +104,14 @@ def test_create_imds_secured( cluster = clusters_factory(cluster_config, raise_on_error=True) status = "required" if imds_support == "v2.0" else "optional" + logging.info("Checking cluster access after cluster creation") assert_head_node_is_running(region, cluster) assert_aws_identity_access_is_correct(cluster, users_allow_list) assert_cluster_imds_v2_requirement_status(region, cluster, status) reboot_head_node(cluster) + logging.info("Checking cluster access after head node reboot") assert_head_node_is_running(region, cluster) assert_aws_identity_access_is_correct(cluster, users_allow_list) assert_cluster_imds_v2_requirement_status(region, cluster, status) @@ -134,13 +136,13 @@ def test_cluster_creation_with_problematic_preinstall_script( assert_lines_in_logs( remote_command_executor, ["/var/log/cfn-init.log"], - [f"Failed to execute OnNodeStart script s3://{ bucket_name }/scripts/{script_name}"], + [f"Failed to execute OnNodeStart script 1 s3://{ bucket_name }/scripts/{script_name}"], ) logging.info("Verifying error in cloudformation failure reason") stack_events = cluster.get_stack_events().get("events") cfn_failure_reason = _get_failure_reason(stack_events) expected_cfn_failure_reason = ( - "Failed to execute OnNodeStart script, " + "Failed to execute OnNodeStart script 1, " "return code: 1. Please check /var/log/cfn-init.log in the head node, or check the " "cfn-init.log in CloudWatch logs. 
Please refer to https://docs.aws.amazon.com/" "parallelcluster/latest/ug/troubleshooting-v3.html#troubleshooting-v3-get-logs for " diff --git a/tests/integration-tests/tests/createami/test_createami.py b/tests/integration-tests/tests/createami/test_createami.py index 3b7d1c0aa8..1fc1d29229 100644 --- a/tests/integration-tests/tests/createami/test_createami.py +++ b/tests/integration-tests/tests/createami/test_createami.py @@ -32,6 +32,7 @@ from tests.common.assertions import ( assert_head_node_is_running, assert_instance_has_desired_imds_v2_setting, + assert_instance_has_desired_tags, assert_lambda_vpc_settings_are_correct, assert_no_msg_in_logs, ) @@ -141,6 +142,7 @@ def test_build_image( ) _test_build_image_success(image) + _test_build_instances_tags(image, image.config["Build"]["Tags"], region) _test_build_imds_settings(image, "required", region) _test_image_tag_and_volume(image) _test_list_image_log_streams(image) @@ -484,6 +486,23 @@ def _test_build_imds_settings(image, status, region): assert_instance_has_desired_imds_v2_setting(instance, status) +def _test_build_instances_tags(image, build_tags, region): + logging.info("Checking that the ImageBuilder instances have the build tags") + + instance_names = [ + f"Build instance for ParallelClusterImage-{image.image_id}", + f"Test instance for ParallelClusterImage-{image.image_id}", + ] + + describe_response = boto3.client("ec2", region_name=region).describe_instances( + Filters=[{"Name": "tag:Name", "Values": instance_names}] + ) + + for reservations in describe_response.get("Reservations"): + for instance in reservations.get("Instances"): + assert_instance_has_desired_tags(instance, build_tags) + + def _test_build_image_success(image): logging.info("Test build image process for image %s.", image.image_id) diff --git a/tests/integration-tests/tests/custom_resource/__init__.py b/tests/integration-tests/tests/custom_resource/__init__.py new file mode 100644 index 0000000000..9555af58b9 --- /dev/null +++ b/tests/integration-tests/tests/custom_resource/__init__.py @@ -0,0 +1,11 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. diff --git a/tests/integration-tests/tests/custom_resource/conftest.py b/tests/integration-tests/tests/custom_resource/conftest.py new file mode 100644 index 0000000000..4429e0277c --- /dev/null +++ b/tests/integration-tests/tests/custom_resource/conftest.py @@ -0,0 +1,180 @@ +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
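A standalone sketch of the transcript-capture pattern introduced in assert_configure_workflow earlier in this patch: the test now records the whole pexpect session to a timestamped file under the runner's output_dir, so a failed `pcluster configure` run leaves a readable transcript. The command, prompts, and log directory below are hypothetical placeholders, not values taken from this patch.

import pexpect
from datetime import datetime


def run_interactive(command, stages, log_dir="/tmp"):
    """Drive an interactive CLI and keep a full transcript of prompts and answers."""
    child = pexpect.spawn(command, encoding="utf-8", timeout=90)
    log_path = f"{log_dir}/spawned-process-log-{datetime.now().strftime('%d-%m-%Y-%H:%M:%S.%f')}"
    with open(log_path, "w", encoding="utf-8") as transcript:
        child.logfile = transcript        # everything read or written goes to the transcript
        for prompt, response in stages:
            child.expect(prompt)          # block until the next prompt appears
            child.sendline(response)      # answer it
        child.expect(pexpect.EOF)         # EOF means the CLI exited on its own
    child.close()
    return child.exitstatus


# Hypothetical usage (prompt text is illustrative only):
# run_interactive("pcluster configure --config /tmp/config.yaml",
#                 [("EC2 Key Pair Name", "my-key")])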
+ + +from pathlib import Path + +import boto3 +import cfn_tools +import pkg_resources +import pytest +from cfn_stacks_factory import CfnStack, CfnStacksFactory +from troposphere import Output, Ref +from troposphere.iam import ManagedPolicy +from troposphere.template_generator import TemplateGenerator +from utils import generate_stack_name + + +@pytest.fixture(scope="class", name="cfn") +def cfn_fixture(region): + """Create a CloudFormation Boto3 client.""" + client = boto3.client("cloudformation", region_name=region) + return client + + +@pytest.fixture(scope="session", name="resources_dir") +def resources_dir_fixture(): + return Path(pkg_resources.resource_filename(__name__, "/../../resources")) + + +@pytest.fixture(scope="session", name="cluster_custom_resource_template") +def cluster_custom_resource_template_fixture(resources_dir): + return resources_dir / "cluster_custom_resource.yaml" + + +@pytest.fixture(scope="session", name="cluster_custom_resource_provider_template") +def cluster_custom_resource_provider_template_fixture(resources_dir): + return resources_dir / ".." / ".." / ".." / "cloudformation" / "custom_resource" / "cluster.yaml" + + +@pytest.fixture(scope="session", name="policies_template_path") +def policies_template_path_fixture(resources_dir): + return resources_dir / ".." / ".." / ".." / "cloudformation" / "policies" / "parallelcluster-policies.yaml" + + +def cluster_custom_resource_provider_generator(credentials, region, stack_name, parameters, template): + factory = CfnStacksFactory(credentials) + with open(template, encoding="utf-8") as cfn_file: + template_data = cfn_file.read() + + stack = CfnStack( + name=stack_name, + region=region, + template=template_data, + parameters=[{"ParameterKey": k, "ParameterValue": v} for k, v in parameters.items()], + capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM", "CAPABILITY_AUTO_EXPAND"], + ) + + try: + factory.create_stack(stack, True) + yield stack.cfn_outputs.get("ServiceToken") + finally: + factory.delete_all_stacks() + + +@pytest.fixture(scope="class", name="cluster_custom_resource_provider") +def cluster_custom_resource_provider_fixture( + request, region, resource_bucket, cluster_custom_resource_service_token, cluster_custom_resource_provider_template +): + """Create the cluster custom resource stack.""" + if cluster_custom_resource_service_token: + yield cluster_custom_resource_service_token + return + + parameters = {"CustomBucket": resource_bucket} + yield from cluster_custom_resource_provider_generator( + request.config.getoption("credential"), + region, + generate_stack_name("custom-resource-provider", request.config.getoption("stackname_suffix")), + parameters, + cluster_custom_resource_provider_template, + ) + + +@pytest.fixture(scope="class", name="cluster_custom_resource_factory") +def cluster_custom_resource_factory_fixture( + request, region, os, cluster_custom_resource_template, cluster_custom_resource_provider, vpc_stack +): + factory = CfnStacksFactory(request.config.getoption("credential")) + + def _produce_cluster_custom_resource_stack(parameters=None): + cluster_name = generate_stack_name("custom-resource-c", request.config.getoption("stackname_suffix")) + + parameters = { + "ClusterName": cluster_name, + "HeadNodeSubnet": vpc_stack.get_public_subnet(), + "ComputeNodeSubnet": vpc_stack.get_private_subnet(), + "ServiceToken": cluster_custom_resource_provider, + "Os": os, + **(parameters or {}), + } + + with open(cluster_custom_resource_template, encoding="utf-8") as cfn_file: + template_data = 
cfn_file.read() + + stack = CfnStack( + name=generate_stack_name("custom-resource", request.config.getoption("stackname_suffix")), + region=region, + template=template_data, + parameters=[{"ParameterKey": k, "ParameterValue": v} for k, v in parameters.items()], + capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM", "CAPABILITY_AUTO_EXPAND"], + ) + + factory.create_stack(stack, True) + stack.factory = factory + return stack + + yield _produce_cluster_custom_resource_stack + + factory.delete_all_stacks() + + +@pytest.fixture(scope="class", name="resource_bucket_cluster_template") +def resource_bucket_cluster_template_fixture(policies_template_path, resource_bucket): + bucket_policy = ManagedPolicy( + title="ResourceBucketAccess", + Description="Policy to access resource bucket", + PolicyDocument={ + "Statement": [ + { + "Action": ["s3:GetObject"], + "Effect": "Allow", + "Resource": {"Fn::Sub": f"arn:${{AWS::Partition}}:s3:::{resource_bucket}/*"}, + }, + { + "Action": ["events:PutRule", "events:DeleteRule", "events:PutTargets", "events:RemoveTargets"], + "Effect": "Allow", + "Resource": {"Fn::Sub": "arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:rule/*"}, + }, + ], + "Version": "2012-10-17", + }, + ) + + with open(policies_template_path, "r", encoding="utf-8") as f: + policies_template = TemplateGenerator(cfn_tools.load_yaml(f.read())) + + policies_template.add_resource(bucket_policy) + policies_template.add_output(Output("ResourceBucketAccess", Value=Ref("ResourceBucketAccess"))) + managed_policies = policies_template.resources.get("ParallelClusterLambdaRole").properties["ManagedPolicyArns"] + managed_policies.append(Ref("ResourceBucketAccess")) + return policies_template.to_yaml() + + +@pytest.fixture(scope="class", name="resource_bucket_policies") +def resource_bucket_policies_fixture(request, region, resource_bucket_cluster_template): + factory = CfnStacksFactory(request.config.getoption("credential")) + + parameters = {"EnableIamAdminAccess": "true"} + stack = CfnStack( + name=generate_stack_name("resource-bucket-policies", request.config.getoption("stackname_suffix")), + region=region, + template=resource_bucket_cluster_template, + capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM"], + parameters=[{"ParameterKey": k, "ParameterValue": v} for k, v in parameters.items()], + ) + + try: + factory.create_stack(stack, True) + yield stack + finally: + factory.delete_all_stacks() diff --git a/tests/integration-tests/tests/custom_resource/test_cluster_custom_resource.py b/tests/integration-tests/tests/custom_resource/test_cluster_custom_resource.py new file mode 100644 index 0000000000..42d266ad58 --- /dev/null +++ b/tests/integration-tests/tests/custom_resource/test_cluster_custom_resource.py @@ -0,0 +1,227 @@ +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
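The conftest above builds each CloudFormation stack it needs inside a generator: create the stack, yield the output the tests care about, and tear everything down in a finally block so cleanup also runs when a test fails. A minimal sketch of that shape, with a dummy factory standing in for CfnStacksFactory; all names and values below are placeholders.

import pytest


class DummyStackFactory:
    """Stand-in for CfnStacksFactory: creates a fake stack and remembers it for cleanup."""

    def create_stack(self, name):
        print(f"creating stack {name}")
        return {"name": name, "outputs": {"ServiceToken": "placeholder-service-token"}}

    def delete_all_stacks(self):
        print("deleting all stacks")


def provider_stack_generator(stack_name):
    factory = DummyStackFactory()
    stack = factory.create_stack(stack_name)
    try:
        yield stack["outputs"]["ServiceToken"]   # hand the interesting output to the test
    finally:
        factory.delete_all_stacks()              # runs whether the test passed or not


@pytest.fixture(scope="class")
def service_token():
    yield from provider_stack_generator("custom-resource-provider")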
+ + +import logging + +import pytest +import urllib3 +import yaml +from assertpy import assert_that +from utils import StackError, generate_stack_name + +from tests.custom_resource.conftest import cluster_custom_resource_provider_generator + +LOGGER = logging.getLogger(__name__) + + +# Dynamically load pcluster library so that unit tests can pass +def pc(): + import pcluster.lib as pc + + return pc + + +def failure_reason(events): + """Return the StatusReason from the most recent failed stack event.""" + + def failed_event_predicate(event): + """Predicate used to filter failed stacks to validate output.""" + failed_states = {"CREATE_FAILED", "UPDATE_FAILED"} + return event["LogicalResourceId"] == "PclusterCluster" and event["ResourceStatus"] in failed_states + + return next(filter(failed_event_predicate, events))["ResourceStatusReason"] + + +def cluster_config(cluster_name): + """Return the configuration for a cluster.""" + cluster = pc().describe_cluster(cluster_name=cluster_name) + config_url = cluster["clusterConfiguration"]["url"] + http = urllib3.PoolManager() + resp = http.request(url=config_url, method="GET") + config = yaml.safe_load(resp.data.decode("UTF-8")) + return config + + +def _stack_parameter(stack, parameter_key): + return next(filter(lambda x: x["ParameterKey"] == parameter_key, stack.parameters)).get("ParameterValue") + + +def test_cluster_create(region, cluster_custom_resource_factory): + stack = cluster_custom_resource_factory() + error_message = "KeyPairValidator" + cluster_name = _stack_parameter(stack, "ClusterName") + cluster = pc().list_clusters(query=f"clusters[?clusterName=='{cluster_name}']|[0]") + assert_that(cluster["clusterStatus"]).is_not_none() + assert_that(stack.cfn_outputs.get("ValidationMessages", "")).contains(error_message) + assert_that(stack.cfn_outputs.get("HeadNodeIp")).is_not_none() + + +@pytest.mark.parametrize( + "parameters, error_message", + [ + ({"ClusterName": "0"}, "Bad Request: '0' does not match"), + ({"OnNodeConfigured": "s3://invalidbucket/invalidkey"}, "OnNodeConfiguredDownloadFailure"), + ], +) +def test_cluster_create_invalid(region, cluster_custom_resource_factory, parameters, error_message): + """Try to create a cluster with invalid syntax and ensure that it fails.""" + with pytest.raises(StackError) as stack_error: + cluster_custom_resource_factory(parameters) + reason = failure_reason(stack_error.value.stack_events) + assert_that(reason).contains(error_message) + + +@pytest.mark.parametrize("external_update", [(False), (True)]) +# pylint: disable=too-many-locals +def test_cluster_update(region, cluster_custom_resource_factory, external_update): + """Perform crud validation on cluster.""" + validation_message = "KeyPairValidator" + stack = cluster_custom_resource_factory() + cluster_name = _stack_parameter(stack, "ClusterName") + parameters = {x["ParameterKey"]: x["ParameterValue"] for x in stack.parameters} + + old_config = cluster_config(cluster_name) + old_max = int(old_config["Scheduling"]["SlurmQueues"][0]["ComputeResources"][0]["MaxCount"]) + update_parameters = {"ComputeInstanceMax": str(int(old_max) + 1)} + + # External updates are not supported due to lack of drift detection, + # however testing here ensures we don't catastrophically fail. 
+ if external_update: + config = cluster_config(cluster_name) + max_count = update_parameters["ComputeInstanceMax"] + config["Scheduling"]["SlurmQueues"][0]["ComputeResources"][0]["MaxCount"] = max_count + cluster = pc().update_cluster(cluster_name=cluster_name, cluster_configuration=config, wait=True) + + # Update the stack + update_params = parameters | update_parameters + stack_params = [{"ParameterKey": k, "ParameterValue": v} for k, v in update_params.items()] + stack.factory.update_stack(stack.name, stack.region, stack_params, stack_is_under_test=True) + + assert_that(stack.cfn_outputs["HeadNodeIp"]).is_not_none() + + # The underlying update doesn't happen if it was externally updated, so no + # validations messages will be available in this case. + if not external_update: + assert_that(stack.cfn_outputs["ValidationMessages"]).contains(validation_message) + + cluster = pc().list_clusters(query=f"clusters[?clusterName=='{cluster_name}']|[0]") + assert_that(cluster["clusterStatus"]).is_equal_to("UPDATE_COMPLETE") + + config = cluster_config(cluster_name) + max_count = int(config["Scheduling"]["SlurmQueues"][0]["ComputeResources"][0]["MaxCount"]) + assert_that(max_count).is_equal_to(int(update_parameters["ComputeInstanceMax"])) + + +@pytest.mark.parametrize( + "update_parameters, error_message", + [ + ({"ClusterName": "j", "ComputeInstanceMax": "20"}, "Cannot update the ClusterName"), + ({"ComputeInstanceMax": "10"}, "Stop the compute fleet"), + ({"ComputeInstanceMax": "-10"}, "Must be greater than or equal to 1."), + ({"OnNodeConfigured": "s3://invalid", "ComputeInstanceMax": "20"}, "s3 url 's3://invalid' is invalid."), + ], +) +# pylint: disable=too-many-locals +def test_cluster_update_invalid(region, cluster_custom_resource_factory, update_parameters, error_message): + """Perform crud validation on cluster.""" + stack = cluster_custom_resource_factory() + cluster_name = _stack_parameter(stack, "ClusterName") + old_cluster_status = pc().describe_cluster(cluster_name=cluster_name) + old_config = cluster_config(cluster_name) + parameters = {x["ParameterKey"]: x["ParameterValue"] for x in stack.parameters} + + # Update the stack to change the name + update_params = parameters | update_parameters + parameters = [{"ParameterKey": k, "ParameterValue": v} for k, v in update_params.items()] + + with pytest.raises(StackError) as stack_error: + stack.factory.update_stack(stack.name, stack.region, parameters, stack_is_under_test=True) + + reason = failure_reason(stack_error.value.stack_events) + assert_that(reason).contains(error_message) + + cluster = pc().list_clusters(query=f"clusters[?clusterName=='{cluster_name}']|[0]") + assert_that(cluster["clusterName"]).is_equal_to(cluster_name) + config = cluster_config(cluster_name) + new_max = int(config["Scheduling"]["SlurmQueues"][0]["ComputeResources"][0]["MaxCount"]) + old_max = int(old_config["Scheduling"]["SlurmQueues"][0]["ComputeResources"][0]["MaxCount"]) + assert_that(new_max).is_equal_to(old_max) + + cluster_status = pc().describe_cluster(cluster_name=cluster_name) + assert_that(old_cluster_status["lastUpdatedTime"]).is_equal_to(cluster_status["lastUpdatedTime"]) + + +def test_cluster_delete_out_of_band( + request, region, cfn, cluster_custom_resource_provider, cluster_custom_resource_factory +): + """Perform crud validation on cluster.""" + + stack = cluster_custom_resource_factory() + cluster_name = _stack_parameter(stack, "ClusterName") + + # Delete the stack outside of CFN + pc().delete_cluster(cluster_name=cluster_name) + + # Delete 
the stack through CFN and wait for delete to complete + stack.factory.delete_stack(stack.name, stack.region) + status = cfn.describe_stacks(StackName=stack.cfn_stack_id)["Stacks"][0]["StackStatus"] + assert_that(status).is_equal_to("DELETE_COMPLETE") + + +def test_cluster_delete_retain(request, region, cluster_custom_resource_provider, cluster_custom_resource_factory): + """Perform crud validation on cluster.""" + + stack = cluster_custom_resource_factory({"DeletionPolicy": "Retain"}) + cluster_name = _stack_parameter(stack, "ClusterName") + + # Delete the stack through CFN and wait for delete to complete + stack.factory.delete_stack(stack.name, stack.region) + + cluster = pc().describe_cluster(cluster_name=cluster_name) + assert_that(cluster["clusterStatus"]).is_equal_to("CREATE_COMPLETE") + pc().delete_cluster(cluster_name=cluster_name) + + +@pytest.mark.parametrize( + "stack_param, cfn_output", + [ + ("CustomLambdaRole", "ParallelClusterLambdaRoleArn"), + ("AdditionalIamPolicies", "ResourceBucketAccess"), + ], +) +def test_cluster_create_with_custom_policies( + request, + region, + resource_bucket, + resource_bucket_policies, + cluster_custom_resource_provider_template, + cluster_custom_resource_factory, + stack_param, + cfn_output, +): + """Create a custom resource provider with a custom role and create a cluster to validate it.""" + parameters = {"CustomBucket": resource_bucket, stack_param: resource_bucket_policies.cfn_outputs[cfn_output]} + custom_resource_gen = cluster_custom_resource_provider_generator( + request.config.getoption("credential"), + region, + generate_stack_name("custom-resource-provider", request.config.getoption("stackname_suffix")), + parameters, + cluster_custom_resource_provider_template, + ) + service_token = next(custom_resource_gen) + + cluster_parameters = {"CustomBucketAccess": resource_bucket, "ServiceToken": service_token} + stack = cluster_custom_resource_factory(cluster_parameters) + cluster_name = _stack_parameter(stack, "ClusterName") + cluster = pc().list_clusters(query=f"clusters[?clusterName=='{cluster_name}']|[0]") + assert_that(cluster["clusterStatus"]).is_equal_to("CREATE_COMPLETE") diff --git a/tests/integration-tests/tests/efa/test_efa/test_efa/pcluster.config.yaml b/tests/integration-tests/tests/efa/test_efa/test_efa/pcluster.config.yaml index badc7e9324..e1c589cfe7 100644 --- a/tests/integration-tests/tests/efa/test_efa/test_efa/pcluster.config.yaml +++ b/tests/integration-tests/tests/efa/test_efa/test_efa/pcluster.config.yaml @@ -20,8 +20,12 @@ Scheduling: - {{ private_subnet_id }} ComputeResources: - Name: efa-enabled-i1 + {% if "us-iso" in region %} + InstanceType: {{ instance }} + {% else %} Instances: - InstanceType: {{ instance }} + {% endif %} MaxCount: {{ max_queue_size }} MinCount: {{ max_queue_size }} DisableSimultaneousMultithreading: true diff --git a/tests/integration-tests/tests/efa/test_fabric.py b/tests/integration-tests/tests/efa/test_fabric.py index e022ec978f..3c2f40e473 100644 --- a/tests/integration-tests/tests/efa/test_fabric.py +++ b/tests/integration-tests/tests/efa/test_fabric.py @@ -17,8 +17,7 @@ from remote_command_executor import RemoteCommandExecutor from tests.common.assertions import assert_no_errors_in_logs -from tests.common.utils import run_system_analyzer -from tests.performance_tests.common import read_remote_file, wait_process_completion +from tests.common.utils import read_remote_file, run_system_analyzer, wait_process_completion FABTESTS_BASIC_TESTS = ["rdm_tagged_bw", "rdm_tagged_pingpong"] diff --git 
a/tests/integration-tests/tests/iam/test_iam.py b/tests/integration-tests/tests/iam/test_iam.py index 60f0246e41..cea75b962d 100644 --- a/tests/integration-tests/tests/iam/test_iam.py +++ b/tests/integration-tests/tests/iam/test_iam.py @@ -307,7 +307,7 @@ def test_iam_policies(region, scheduler, pcluster_config_reader, clusters_factor def _test_s3_access(remote_command_executor, region): logging.info("Testing S3 Access") - result = remote_command_executor.run_remote_command(f"AWS_DEFAULT_REGION={region} sudo aws s3 ls").stdout + result = remote_command_executor.run_remote_command(f"sudo aws s3 ls --region {region}").stdout # An error occurred (AccessDenied) when calling the ListBuckets operation: Access Denied assert_that(result).does_not_contain("AccessDenied") @@ -315,7 +315,7 @@ def _test_s3_access(remote_command_executor, region): def _test_batch_access(remote_command_executor, region): logging.info("Testing AWS Batch Access") result = remote_command_executor.run_remote_command( - f"AWS_DEFAULT_REGION={region} aws batch describe-compute-environments" + f"aws batch describe-compute-environments --region {region}" ).stdout # An error occurred (AccessDeniedException) when calling the DescribeComputeEnvironments operation: ... assert_that(result).does_not_contain("AccessDeniedException") @@ -636,6 +636,9 @@ def _create_permission_boundary(permission_boundary_name): "logs:CreateLogGroup", "logs:TagResource", "logs:UntagResource", + "logs:DescribeMetricFilters", + "logs:PutMetricFilter", + "logs:deleteMetricFilter", ], "Condition": { "Fn::If": [ diff --git a/tests/integration-tests/tests/iam/test_iam_image.py b/tests/integration-tests/tests/iam/test_iam_image.py index 470d455e35..63acdddb9a 100644 --- a/tests/integration-tests/tests/iam/test_iam_image.py +++ b/tests/integration-tests/tests/iam/test_iam_image.py @@ -10,7 +10,6 @@ # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. import logging -import time import boto3 import pytest @@ -39,8 +38,6 @@ def test_iam_roles( ec2_client = boto3.client("ec2", region_name=region) lambda_client = boto3.client("lambda", region_name=region) - # Give 10 minutes to EC2ImageBuilder to spin up the build instance - time.sleep(600) pcluster_describe_image_result = image.describe() logging.info(pcluster_describe_image_result) _check_roles( @@ -52,6 +49,7 @@ def test_iam_roles( lambda_cleanup_role, ) + # TODO is there a way to complete this check without building an image? 
_wait_build_image_complete(image) @@ -88,6 +86,21 @@ def _wait_build_image_complete(image): assert_that(image.image_status).is_equal_to("BUILD_COMPLETE") +@retry(wait_fixed=minutes(1), stop_max_delay=minutes(20)) +def _get_resources_with_image_resource(cfn_client, stack_name): + logging.info("Describe stack resources") + resources = cfn_client.describe_stack_resources(StackName=stack_name)["StackResources"] + image_resource_exists = False + logging.info("Checking image resource") + for resource in resources: + if resource["ResourceType"] == "AWS::ImageBuilder::Image": + image_resource_exists = True + logging.info("The image resource exists!") + break + assert_that(image_resource_exists).is_true() + return resources + + def _check_roles( cfn_client, ec2_client, @@ -97,7 +110,8 @@ def _check_roles( lambda_cleanup_role, ): """Test roles are attached to EC2 build instance and Lambda cleanup function in the building stack.""" - resources = cfn_client.describe_stack_resources(StackName=stack_name)["StackResources"] + logging.info("Checking roles are attached to the build instance") + resources = _get_resources_with_image_resource(cfn_client, stack_name) for resource in resources: resource_type = resource["ResourceType"] # Check that there is no role created in the stack. @@ -106,9 +120,11 @@ def _check_roles( # Check the role is attached to the Lambda function lambda_function = lambda_client.get_function(FunctionName=resource["PhysicalResourceId"])["Configuration"] assert_that(lambda_function["Role"]).is_equal_to(lambda_cleanup_role) + logging.info("Lambda function role confirmed") if resource_type == "AWS::ImageBuilder::Image": # Check the instance profile is attached to the EC2 instance imagebuilder_image_arn = resource["PhysicalResourceId"] + logging.info(f"Image builder Image ARN: {imagebuilder_image_arn}") instance_profile_arn = ( ec2_client.describe_instances( Filters=[{"Name": "tag:Ec2ImageBuilderArn", "Values": [imagebuilder_image_arn]}] @@ -119,3 +135,4 @@ def _check_roles( .get("Arn") ) assert_that(instance_profile_arn).contains(instance_profile) + logging.info("Image arn confirmed") diff --git a/tests/integration-tests/tests/log_rotation/test_log_rotation.py b/tests/integration-tests/tests/log_rotation/test_log_rotation.py index 424baf951f..d6203a52e8 100644 --- a/tests/integration-tests/tests/log_rotation/test_log_rotation.py +++ b/tests/integration-tests/tests/log_rotation/test_log_rotation.py @@ -38,6 +38,7 @@ def test_log_rotation( common_logs = [ {"log_name": "cloud-init", "log_path": "/var/log/cloud-init.log", "existence": True}, {"log_name": "supervisord", "log_path": "/var/log/supervisord.log", "existence": True}, + {"log_name": "bootstrap_error_msg", "log_path": "/var/log/parallelcluster/bootstrap_error_msg"}, ] headnode_specified_logs = [ { @@ -60,7 +61,7 @@ def test_log_rotation( "log_name": "dcv-server", "log_path": "/var/log/dcv/server.log", "existence": True, - "trigger_new_entries": True, + "trigger_new_entries": False, }, {"log_name": "dcv-xsession", "log_path": "/var/log/dcv/dcv-xsession.*.log"}, {"log_name": "slurmdbd", "log_path": "/var/log/slurmdbd.log"}, @@ -88,6 +89,16 @@ def test_log_rotation( "trigger_new_entries": True, }, {"log_name": "chef-client", "log_path": "/var/log/chef-client.log", "existence": True}, + { + "log_name": "clustermgtd_events", + "log_path": "/var/log/parallelcluster/clustermgtd.events", + "existence": True, + }, + { + "log_name": "slurm_resume_events", + "log_path": "/var/log/parallelcluster/slurm_resume.events", + "existence": True, + }, 
] compute_specified_logs = [ {"log_name": "cloud-init-output", "log_path": "/var/log/cloud-init-output.log", "existence": True}, @@ -127,7 +138,7 @@ def test_log_rotation( ) -@retry(wait_fixed=seconds(20), stop_max_delay=minutes(5)) +@retry(wait_fixed=seconds(20), stop_max_delay=minutes(9)) def _wait_file_not_empty(remote_command_executor, file_path, compute_node_ip=None): if compute_node_ip: size = _run_command_on_node(remote_command_executor, f"stat --format=%s {file_path}", compute_node_ip) @@ -153,7 +164,7 @@ def _wait_log_in_log_stream( ( stream_name for stream_name in stream_names - if private_ip.replace(".", "-") in stream_name and log_name in stream_name + if private_ip.replace(".", "-") in stream_name and stream_name.endswith(log_name) ), None, ) @@ -259,7 +270,7 @@ def _test_logs_are_rotated(os, logs, remote_command_executor, before_log_rotatio def _test_logs_written_to_new_file(logs, remote_command_executor, compute_node_ip=None): - """Test newly genreated logs write to log_file.log instead of log_file.log.1.""" + """Test newly generated logs write to log_file.log instead of log_file.log.1.""" # test logs are written to new log files after rotation for log in logs: if log.get("trigger_new_entries"): diff --git a/tests/integration-tests/tests/dashboard_and_alarms/__init__.py b/tests/integration-tests/tests/monitoring/__init__.py similarity index 100% rename from tests/integration-tests/tests/dashboard_and_alarms/__init__.py rename to tests/integration-tests/tests/monitoring/__init__.py diff --git a/tests/integration-tests/tests/monitoring/structured_log_event_utils.py b/tests/integration-tests/tests/monitoring/structured_log_event_utils.py new file mode 100644 index 0000000000..9cb91bd850 --- /dev/null +++ b/tests/integration-tests/tests/monitoring/structured_log_event_utils.py @@ -0,0 +1,75 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
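One likely reason the log-rotation test above now matches stream names with endswith instead of a plain substring check: this patch also adds *.events streams (clustermgtd.events, slurm_resume.events), and a substring match on "clustermgtd" would hit both the log and its events stream. A small illustration with made-up stream names:

streams = [
    "ip-10-0-1-12.i-0123456789abcdef0.clustermgtd",
    "ip-10-0-1-12.i-0123456789abcdef0.clustermgtd.events",
]
log_name = "clustermgtd"

substring_matches = [s for s in streams if log_name in s]      # matches both streams
suffix_matches = [s for s in streams if s.endswith(log_name)]  # matches only the plain log

print(substring_matches)  # both entries
print(suffix_matches)     # ['ip-10-0-1-12.i-0123456789abcdef0.clustermgtd']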
+import json +import logging +import re +from typing import AnyStr, Dict, Iterator, Tuple + +from assertpy import assert_that +from clusters_factory import Cluster +from retrying import retry +from time_utils import minutes, seconds + +from tests.cloudwatch_logging.cloudwatch_logging_boto3_utils import ( + get_cluster_log_groups_from_boto3, + get_log_events, + get_log_streams, +) + +logger = logging.getLogger(__name__) + + +def get_log_stream_events(cluster: Cluster, stream_name_pattern: AnyStr) -> Iterator[Tuple[str, Dict]]: + pattern = re.compile(stream_name_pattern) + log_group_name = get_cluster_log_groups_from_boto3(f"/aws/parallelcluster/{cluster.name}")[0].get("logGroupName") + for log_stream in get_log_streams(log_group_name): + log_stream_name = log_stream.get("logStreamName") + if pattern.fullmatch(log_stream_name): + yield from ((log_stream_name, event) for event in get_log_events(log_group_name, log_stream_name)) + + +def get_log_stream_events_by_event_type( + cluster: Cluster, stream_name_pattern: AnyStr, event_type_pattern: AnyStr +) -> Iterator[Tuple[str, Dict[str, Dict]]]: + pattern = re.compile(event_type_pattern) + for stream_name, log_event in get_log_stream_events(cluster, stream_name_pattern): + json_event = json.loads(log_event.get("message", {})) + logger.info( + "Got Event Type: %s, looking for event type %s", json_event.get("event-type", ""), event_type_pattern + ) + if pattern.fullmatch(json_event.get("event-type", "")): + logger.info("Returning event %s for event type %s", json_event.get("event-type", ""), event_type_pattern) + yield stream_name, json_event + + +def get_node_info_from_stream_name(stream_name: str) -> Dict[str, str]: + match = re.match(r"ip-(\d{1,3}(-\d{1,3}){3})\.(i-[0-9a-f]+)\.(.+)", stream_name) + return { + "ip": match.group(1).replace("-", "."), + "instance-id": match.group(3), + "logfile": match.group(4), + } + + +@retry(wait_fixed=seconds(20), stop_max_delay=minutes(5)) +def assert_that_event_exists(cluster: Cluster, stream_name_pattern: str, event_type_pattern: str): + stream_name, event = next(get_log_stream_events_by_event_type(cluster, stream_name_pattern, event_type_pattern)) + logger.info("Found event %s for %s", event, event_type_pattern) + info = get_node_info_from_stream_name(stream_name) + assert_that(event.get("cluster-name")).is_equal_to(cluster.name) + assert_that(event.get("scheduler")).is_equal_to("slurm") + assert_that(event.get("instance-id")).is_equal_to(info.get("instance-id")) + if "compute" in event: + assert_that(event.get("compute").get("address")).is_equal_to(info.get("ip")) + assert_that(event.get("node-role")).is_equal_to("ComputeFleet") + else: + assert_that(event.get("node-role")).is_equal_to("HeadNode") diff --git a/tests/integration-tests/tests/dashboard_and_alarms/test_dashboard_and_alarms.py b/tests/integration-tests/tests/monitoring/test_monitoring.py similarity index 88% rename from tests/integration-tests/tests/dashboard_and_alarms/test_dashboard_and_alarms.py rename to tests/integration-tests/tests/monitoring/test_monitoring.py index 5a6d017922..bcf94419ab 100644 --- a/tests/integration-tests/tests/dashboard_and_alarms/test_dashboard_and_alarms.py +++ b/tests/integration-tests/tests/monitoring/test_monitoring.py @@ -23,7 +23,7 @@ @pytest.mark.usefixtures("instance", "os", "scheduler") @pytest.mark.parametrize("dashboard_enabled, cw_log_enabled", [(True, True), (True, False), (False, False)]) -def test_dashboard_and_alarms( +def test_monitoring( dashboard_enabled, cw_log_enabled, region, @@ -37,21 +37,25 
@@ def test_dashboard_and_alarms( ) cluster = clusters_factory(cluster_config) cw_client = boto3.client("cloudwatch", region_name=region) + headnode_instance_id = cluster.get_cluster_instance_ids(node_type="HeadNode")[0] + compute_instance_ids = cluster.get_cluster_instance_ids(node_type="Compute") + # the MinCount is set to 1, so we should have at least one compute node + assert_that(compute_instance_ids).is_not_empty() # test CWAgent metrics # we only perform this test for one of the 3 test conditions # because this test could be time-consuming (we allow some retries to ensure we can get metrics data) if dashboard_enabled and cw_log_enabled: - compute_instance_ids = cluster.get_cluster_instance_ids(node_type="Compute") - # the MinCount is set to 1, so we should have at least one compute node - assert_that(compute_instance_ids).is_not_empty() _test_cw_agent_metrics(cw_client, headnode_instance_id, compute_instance_ids[0]) # test dashboard and alarms _test_dashboard(cw_client, cluster.cfn_name, region, dashboard_enabled, cw_log_enabled) _test_alarms(cw_client, cluster.cfn_name, headnode_instance_id, dashboard_enabled) + # test detailed monitoring + _test_detailed_monitoring(region, compute_instance_ids) + @retry(stop_max_attempt_number=8, wait_fixed=minutes(2)) def _test_cw_agent_metrics(cw_client, headnode_instance_id, compute_instance_id): @@ -82,8 +86,10 @@ def _test_dashboard(cw_client, cluster_name, region, dashboard_enabled, cw_log_e assert_that(dashboard_response["DashboardName"]).is_equal_to(dashboard_name) if cw_log_enabled: assert_that(dashboard_response["DashboardBody"]).contains("Head Node Logs") + assert_that(dashboard_response["DashboardBody"]).contains("Cluster Health Metrics") else: assert_that(dashboard_response["DashboardBody"]).does_not_contain("Head Node Logs") + assert_that(dashboard_response["DashboardBody"]).does_not_contain("Cluster Health Metrics") else: try: cw_client.get_dashboard(DashboardName=dashboard_name) @@ -188,3 +194,15 @@ def _verify_alarms(alarms, metric_name, instance_id): assert_that(alarms[0]["Dimensions"]).contains({"Name": "path", "Value": "/"}).contains( {"Name": "InstanceId", "Value": instance_id} ) + + +def _test_detailed_monitoring(region, compute_instance_ids): + ec2_response = boto3.client("ec2", region_name=region).describe_instances(InstanceIds=compute_instance_ids) + monitoring_states = [ + instance.get("Monitoring").get("State") + for reservation in ec2_response.get("Reservations") + for instance in reservation.get("Instances") + ] + assert_that(monitoring_states).is_not_empty() + assert_that(set(monitoring_states)).is_length(1) + assert_that(monitoring_states[0]).is_equal_to("enabled") diff --git a/tests/integration-tests/tests/dashboard_and_alarms/test_dashboard_and_alarms/test_dashboard_and_alarms/pcluster.config.yaml b/tests/integration-tests/tests/monitoring/test_monitoring/test_monitoring/pcluster.config.yaml similarity index 80% rename from tests/integration-tests/tests/dashboard_and_alarms/test_dashboard_and_alarms/test_dashboard_and_alarms/pcluster.config.yaml rename to tests/integration-tests/tests/monitoring/test_monitoring/test_monitoring/pcluster.config.yaml index 717e936768..4e2c55712e 100644 --- a/tests/integration-tests/tests/dashboard_and_alarms/test_dashboard_and_alarms/test_dashboard_and_alarms/pcluster.config.yaml +++ b/tests/integration-tests/tests/monitoring/test_monitoring/test_monitoring/pcluster.config.yaml @@ -23,7 +23,9 @@ Scheduling: {% else %} Instances: - InstanceType: {{ instance }} - # set MinCount to 1 in 
order to test compute node disk and memory metrics are not gathered + # set MinCount to 1 to ensure the existence of compute nodes + # in order to test compute node disk and memory metrics are not gathered + # and test detailed monitoring setting MinCount: 1 {% endif %} Monitoring: diff --git a/tests/integration-tests/tests/monitoring/test_structured_log_events.py b/tests/integration-tests/tests/monitoring/test_structured_log_events.py new file mode 100644 index 0000000000..f25d7c77ea --- /dev/null +++ b/tests/integration-tests/tests/monitoring/test_structured_log_events.py @@ -0,0 +1,53 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +import boto3 +import pytest +from remote_command_executor import RemoteCommandExecutor +from utils import test_cluster_health_metric + +from tests.monitoring.structured_log_event_utils import assert_that_event_exists + + +@pytest.mark.usefixtures("instance", "os", "scheduler") +def test_custom_compute_action_failure( + region, + pcluster_config_reader, + clusters_factory, + test_datadir, + s3_bucket_factory, + scheduler_commands_factory, +): + # Create S3 bucket for pre-install scripts + bucket_name = s3_bucket_factory() + + bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name) + bad_script = "on_compute_configured_error.sh" + bad_script_path = f"test_structured_logging/{bad_script}" + bucket.upload_file(str(test_datadir / bad_script), bad_script_path) + + cluster_config = pcluster_config_reader(bucket=bucket_name, bad_script_path=bad_script_path) + cluster = clusters_factory(cluster_config) + + remote_command_executor = RemoteCommandExecutor(cluster) + scheduler_commands = scheduler_commands_factory(remote_command_executor) + + scheduler_commands.submit_command("hostname", nodes=1, partition="queue-1") + scheduler_commands.submit_command("hostname", nodes=1, partition="queue-2") + + assert_that_event_exists(cluster, r".+\.clustermgtd_events", "invalid-backing-instance-count") + assert_that_event_exists(cluster, r".+\.clustermgtd_events", "protected-mode-error-count") + assert_that_event_exists(cluster, r".+\.bootstrap_error_msg", "custom-action-error") + assert_that_event_exists(cluster, r".+\.clustermgtd_events", "compute-node-idle-time") + + test_cluster_health_metric(["OnNodeConfiguredRunErrors"], cluster.name, region) + test_cluster_health_metric(["MaxDynamicNodeIdleTime"], cluster.name, region) diff --git a/tests/integration-tests/tests/monitoring/test_structured_log_events/test_custom_compute_action_failure/on_compute_configured_error.sh b/tests/integration-tests/tests/monitoring/test_structured_log_events/test_custom_compute_action_failure/on_compute_configured_error.sh new file mode 100644 index 0000000000..e0dd1207da --- /dev/null +++ b/tests/integration-tests/tests/monitoring/test_structured_log_events/test_custom_compute_action_failure/on_compute_configured_error.sh @@ -0,0 +1,3 @@ +#!/bin/bash +echo "This script can't stop, won't stop." 
1>&2 +exit 1 diff --git a/tests/integration-tests/tests/monitoring/test_structured_log_events/test_custom_compute_action_failure/pcluster.config.yaml b/tests/integration-tests/tests/monitoring/test_structured_log_events/test_custom_compute_action_failure/pcluster.config.yaml new file mode 100644 index 0000000000..851125d2f5 --- /dev/null +++ b/tests/integration-tests/tests/monitoring/test_structured_log_events/test_custom_compute_action_failure/pcluster.config.yaml @@ -0,0 +1,47 @@ +Image: + Os: {{ os }} +HeadNode: + InstanceType: {{ instance }} + Networking: + SubnetId: {{ public_subnet_id }} + Ssh: + KeyName: {{ key_name }} + Imds: + Secured: {{ imds_secured }} +Scheduling: + Scheduler: {{ scheduler }} + SlurmQueues: + - Name: queue-1 + ComputeResources: + - Name: compute-a + Instances: + - InstanceType: {{ instance }} + MinCount: 0 + MaxCount: 2 + Networking: + SubnetIds: + - {{ private_subnet_id }} + - Name: queue-2 + ComputeResources: + - Name: compute-b + Instances: + - InstanceType: {{ instance }} + MinCount: 0 + MaxCount: 2 + Networking: + SubnetIds: + - {{ private_subnet_id }} + Iam: + S3Access: + - BucketName: {{ bucket }} + EnableWriteAccess: False + CustomActions: + OnNodeConfigured: + Script: s3://{{ bucket }}/{{ bad_script_path }} + Args: + - "no args" +Monitoring: + Logs: + CloudWatch: + Enabled: true + RetentionInDays: 14 diff --git a/tests/integration-tests/tests/multiple_nics/test_multiple_nics/test_multiple_nics/pcluster.config.yaml b/tests/integration-tests/tests/multiple_nics/test_multiple_nics/test_multiple_nics/pcluster.config.yaml index 11439b8d84..9871de5bbb 100644 --- a/tests/integration-tests/tests/multiple_nics/test_multiple_nics/test_multiple_nics/pcluster.config.yaml +++ b/tests/integration-tests/tests/multiple_nics/test_multiple_nics/test_multiple_nics/pcluster.config.yaml @@ -1,6 +1,7 @@ Image: Os: {{ os }} HeadNode: + # Use multi-NIC instance type from https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-eni.html InstanceType: {{ instance }} Networking: SubnetId: {{ public_subnet_id }} @@ -25,17 +26,13 @@ Scheduling: MinCount: 1 Efa: Enabled: true - {% if instance == "p4d.24xlarge" %} - CapacityReservationTarget: - CapacityReservationResourceGroupArn: arn:aws:resource-groups:us-east-1:447714826191:group/EC2CRGroup - {% endif %} {% endif %} Networking: SubnetIds: - {{ private_subnet_id }} {% if scheduler != "awsbatch" %} PlacementGroup: - Enabled: {% if instance != "p4d.24xlarge" %}true{% else %}false{% endif %} + Enabled: false {% endif %} SharedStorage: - MountDir: /shared diff --git a/tests/integration-tests/tests/networking/test_cluster_networking.py b/tests/integration-tests/tests/networking/test_cluster_networking.py index e90dba782f..edb97b0fbd 100644 --- a/tests/integration-tests/tests/networking/test_cluster_networking.py +++ b/tests/integration-tests/tests/networking/test_cluster_networking.py @@ -31,8 +31,15 @@ ) from tests.common.osu_common import compile_osu from tests.common.schedulers_common import SlurmCommands +from tests.common.storage.constants import StorageType from tests.common.utils import get_default_vpc_security_group -from tests.storage.storage_common import assert_fsx_correctly_shared, assert_fsx_lustre_correctly_mounted, get_fsx_ids +from tests.storage.storage_common import ( + assert_fsx_lustre_correctly_mounted, + get_efs_ids, + get_fsx_ids, + test_efs_correctly_mounted, + verify_directory_correctly_shared, +) @pytest.mark.usefixtures("os", "scheduler", "instance") @@ -40,8 +47,9 @@ def test_cluster_in_private_subnet( 
region, pcluster_config_reader, clusters_factory, vpc_stack, scheduler_commands_factory ): # This test just creates a cluster in the private subnet and just checks that no failures occur - fsx_mount_dir = "/fsx_mount" - cluster_config = pcluster_config_reader(fsx_mount_dir=fsx_mount_dir) + storage_type = StorageType.STORAGE_EFS if "us-iso" in region else StorageType.STORAGE_FSX + mount_dir = "/private_storage_mount" + cluster_config = pcluster_config_reader(storage_type=storage_type.value, mount_dir=mount_dir) cluster = clusters_factory(cluster_config) assert_that(cluster).is_not_none() @@ -49,8 +57,9 @@ def test_cluster_in_private_subnet( assert_that(len(get_compute_nodes_instance_ids(cluster.cfn_name, region))).is_equal_to(1) remote_command_executor = RemoteCommandExecutor(cluster, bastion=bastion) scheduler_commands = scheduler_commands_factory(remote_command_executor) - _test_fsx_in_private_subnet(cluster, region, fsx_mount_dir, remote_command_executor, scheduler_commands) - + _test_shared_storage_in_private_subnet( + cluster, region, storage_type, mount_dir, remote_command_executor, scheduler_commands + ) lambda_vpc_config = cluster.config["DeploymentSettings"]["LambdaFunctionsVpcConfig"] assert_lambda_vpc_settings_are_correct( cluster.cfn_name, region, lambda_vpc_config["SecurityGroupIds"], lambda_vpc_config["SubnetIds"] @@ -92,11 +101,21 @@ def test_existing_eip(existing_eip, pcluster_config_reader, clusters_factory): connection.run("cat /var/log/cfn-init.log", timeout=60) -def _test_fsx_in_private_subnet(cluster, region, fsx_mount_dir, remote_command_executor, scheduler_commands): +def _test_shared_storage_in_private_subnet( + cluster, region, storage_type, mount_dir, remote_command_executor, scheduler_commands +): """Test FSx can be mounted in private subnet.""" - fsx_fs_id = get_fsx_ids(cluster, region)[0] - assert_fsx_lustre_correctly_mounted(remote_command_executor, fsx_mount_dir, region, fsx_fs_id) - assert_fsx_correctly_shared(scheduler_commands, remote_command_executor, fsx_mount_dir) + if storage_type == StorageType.STORAGE_EFS: + fs_id = get_efs_ids(cluster, region)[0] + test_efs_correctly_mounted(remote_command_executor, mount_dir, region, fs_id) + elif storage_type == StorageType.STORAGE_FSX: + fs_id = get_fsx_ids(cluster, region)[0] + assert_fsx_lustre_correctly_mounted(remote_command_executor, mount_dir, region, fs_id) + else: + raise Exception(f"The storage type '{storage_type}' is not supported in this test.") + verify_directory_correctly_shared( + remote_command_executor, mount_dir, scheduler_commands, partitions=scheduler_commands.get_partitions() + ) @pytest.mark.usefixtures("instance") diff --git a/tests/integration-tests/tests/networking/test_cluster_networking/test_cluster_in_private_subnet/pcluster.config.yaml b/tests/integration-tests/tests/networking/test_cluster_networking/test_cluster_in_private_subnet/pcluster.config.yaml index fb43dfc932..32702b7b11 100644 --- a/tests/integration-tests/tests/networking/test_cluster_networking/test_cluster_in_private_subnet/pcluster.config.yaml +++ b/tests/integration-tests/tests/networking/test_cluster_networking/test_cluster_in_private_subnet/pcluster.config.yaml @@ -28,15 +28,20 @@ Scheduling: SubnetIds: - {{ private_subnet_id }} SharedStorage: - - MountDir: {{ fsx_mount_dir }} - Name: privatefsx + - MountDir: {{ mount_dir }} + Name: privatestorage + {% if storage_type == "EFS" %} + StorageType: Efs + {% endif %} + {% if storage_type == "FSX" %} StorageType: FsxLustre FsxLustreSettings: StorageCapacity: 1200 
DeploymentType: SCRATCH_2 + {% endif %} DeploymentSettings: LambdaFunctionsVpcConfig: SecurityGroupIds: - {{ default_vpc_security_group_id }} SubnetIds: - - {{ private_subnet_id }} \ No newline at end of file + - {{ private_subnet_id }} diff --git a/tests/integration-tests/tests/networking/test_placement_group.py b/tests/integration-tests/tests/networking/test_placement_group.py index d4c080af35..3af74f95da 100644 --- a/tests/integration-tests/tests/networking/test_placement_group.py +++ b/tests/integration-tests/tests/networking/test_placement_group.py @@ -61,5 +61,5 @@ def _get_launch_template_name(cluster, queue_name, compute_resource): def _get_slurm_placement_group_from_stack(cluster, region): stack_resources = utils.retrieve_cfn_resources(cluster.cfn_name, region) - placement_group = next(v for k, v in stack_resources.items() if k.startswith("ComputeFleetPlacementGroup")) + placement_group = next(v for k, v in stack_resources.items() if k.startswith("PlacementGroup")) return placement_group diff --git a/tests/integration-tests/tests/networking/test_security_groups.py b/tests/integration-tests/tests/networking/test_security_groups.py index 0d21609afa..16002c5726 100644 --- a/tests/integration-tests/tests/networking/test_security_groups.py +++ b/tests/integration-tests/tests/networking/test_security_groups.py @@ -61,8 +61,8 @@ def test_overwrite_sg(region, scheduler, custom_security_group, pcluster_config_ for instance in instances: assert_that(instance["SecurityGroups"]).is_length(1) - if scheduler != "awsbatch": - # FSx is not supported when using AWS Batch as a scheduler + # FSx is not supported in US isolated regions or when using AWS Batch as a scheduler + if "us-iso" not in region and scheduler != "awsbatch": logging.info("Collecting security groups of the FSx") fsx_id = cluster.cfn_resources[f"FSX{create_hash_suffix(fsx_name)}"] fsx_client = boto3.client("fsx", region_name=region) diff --git a/tests/integration-tests/tests/networking/test_security_groups/test_overwrite_sg/pcluster.config.update.yaml b/tests/integration-tests/tests/networking/test_security_groups/test_overwrite_sg/pcluster.config.update.yaml index bd141a0951..14be861ee3 100644 --- a/tests/integration-tests/tests/networking/test_security_groups/test_overwrite_sg/pcluster.config.update.yaml +++ b/tests/integration-tests/tests/networking/test_security_groups/test_overwrite_sg/pcluster.config.update.yaml @@ -34,7 +34,7 @@ SharedStorage: - MountDir: efs Name: {{ efs_name }} StorageType: Efs - {% if scheduler != "awsbatch" %} + {% if "us-iso" not in region and scheduler != "awsbatch" %} - MountDir: fsx Name: {{ fsx_name }} StorageType: FsxLustre diff --git a/tests/integration-tests/tests/networking/test_security_groups/test_overwrite_sg/pcluster.config.yaml b/tests/integration-tests/tests/networking/test_security_groups/test_overwrite_sg/pcluster.config.yaml index df2f402d27..8ee4d2b3a2 100644 --- a/tests/integration-tests/tests/networking/test_security_groups/test_overwrite_sg/pcluster.config.yaml +++ b/tests/integration-tests/tests/networking/test_security_groups/test_overwrite_sg/pcluster.config.yaml @@ -34,7 +34,7 @@ SharedStorage: - MountDir: efs Name: {{ efs_name }} StorageType: Efs - {% if scheduler != "awsbatch" %} + {% if "us-iso" not in region and scheduler != "awsbatch" %} - MountDir: fsx Name: {{ fsx_name }} StorageType: FsxLustre diff --git a/tests/integration-tests/tests/pcluster_api/test_api.py b/tests/integration-tests/tests/pcluster_api/test_api.py index 02fe6df0f8..c07d291326 100644 --- 
a/tests/integration-tests/tests/pcluster_api/test_api.py +++ b/tests/integration-tests/tests/pcluster_api/test_api.py @@ -12,8 +12,11 @@ import logging +from pathlib import Path import boto3 +import cfn_tools +import pkg_resources import pytest from assertpy import assert_that from benchmarks.common.util import get_instance_vcpus @@ -38,10 +41,11 @@ from pcluster_client.model.requested_compute_fleet_status import RequestedComputeFleetStatus from pcluster_client.model.update_cluster_request_content import UpdateClusterRequestContent from pcluster_client.model.update_compute_fleet_request_content import UpdateComputeFleetRequestContent +from troposphere.template_generator import TemplateGenerator from utils import generate_stack_name from tests.common.assertions import wait_for_num_instances_in_cluster -from tests.common.utils import retrieve_latest_ami +from tests.common.utils import get_installed_parallelcluster_version, retrieve_latest_ami LOGGER = logging.getLogger(__name__) NUM_OF_COMPUTE_INSTANCES = 2 @@ -121,26 +125,127 @@ def _ec2_wait(region, instances, waiter_type): waiter.wait(InstanceIds=instances) +@pytest.fixture(scope="session", name="resources_dir") +def resources_dir_fixture(): + return Path(pkg_resources.resource_filename(__name__, "/../../resources")) + + +@pytest.fixture(scope="session", name="policies_template_path") +def policies_template_path_fixture(resources_dir): + return resources_dir / ".." / ".." / ".." / "cloudformation" / "policies" / "parallelcluster-policies.yaml" + + +@pytest.fixture(scope="class", name="custom_actions_bucket_name") +def custom_actions_bucket_name_fixture(s3_bucket_factory): + return s3_bucket_factory() + + +@pytest.fixture(scope="class", name="policies_template_with_custom_actions_bucket_access") +def policies_template_with_custom_actions_bucket_access_fixture(policies_template_path, custom_actions_bucket_name): + with open(policies_template_path, "r", encoding="utf-8") as f: + policies_template = TemplateGenerator(cfn_tools.load_yaml(f.read())) + + policy_document = policies_template.resources.get("ParallelClusterClusterPolicy").properties["PolicyDocument"] + statement = policy_document.get("Statement") + statement.append( + { + "Action": ["s3:GetObject"], + "Effect": "Allow", + "Resource": {"Fn::Sub": f"arn:${{AWS::Partition}}:s3:::{custom_actions_bucket_name}/*"}, + } + ) + return policies_template.to_yaml() + + +@pytest.fixture(scope="class", name="policies_uri") +def policies_uri_fixture(policies_template_with_custom_actions_bucket_access, resource_bucket, region): + bucket = boto3.resource("s3", region_name=region).Bucket(resource_bucket) + path = f"parallelcluster/{get_installed_parallelcluster_version()}/templates/policies/custom-policies.yaml" + bucket.put_object(Key=path, Body=policies_template_with_custom_actions_bucket_access) + + yield (f"https://{resource_bucket}.s3.{region}.amazonaws.com{'.cn' if region.startswith('cn') else ''}/{path}") + + @pytest.mark.usefixtures("os", "instance") -def test_cluster_slurm(region, api_client, create_cluster, request, pcluster_config_reader, scheduler, instance): +def test_cluster_slurm( + region, + api_client, + create_cluster, + request, + pcluster_config_reader, + scheduler, + instance, + custom_actions_bucket_name, + test_datadir, +): assert_that(scheduler).is_equal_to("slurm") - _test_cluster_workflow(region, api_client, create_cluster, request, pcluster_config_reader, scheduler, instance) + _test_cluster_workflow( + region, + api_client, + create_cluster, + request, + 
pcluster_config_reader, + scheduler, + instance, + custom_actions_bucket_name, + test_datadir, + ) @pytest.mark.usefixtures("os", "instance") -def test_cluster_awsbatch(region, api_client, create_cluster, request, pcluster_config_reader, scheduler, instance): +def test_cluster_awsbatch( + region, + api_client, + create_cluster, + request, + pcluster_config_reader, + scheduler, + instance, + custom_actions_bucket_name, + test_datadir, +): assert_that(scheduler).is_equal_to("awsbatch") - _test_cluster_workflow(region, api_client, create_cluster, request, pcluster_config_reader, scheduler, instance) + _test_cluster_workflow( + region, + api_client, + create_cluster, + request, + pcluster_config_reader, + scheduler, + instance, + custom_actions_bucket_name, + test_datadir, + ) -def _test_cluster_workflow(region, api_client, create_cluster, request, pcluster_config_reader, scheduler, instance): - if scheduler == "slurm": - initial_config_file = pcluster_config_reader() - updated_config_file = pcluster_config_reader("pcluster.config.update.yaml") - else: - vcpus = get_instance_vcpus(region, instance) * NUM_OF_COMPUTE_INSTANCES - initial_config_file = pcluster_config_reader(vcpus=vcpus) - updated_config_file = pcluster_config_reader("pcluster.config.update.yaml", vcpus=vcpus) +def _test_cluster_workflow( + region, + api_client, + create_cluster, + request, + pcluster_config_reader, + scheduler, + instance, + custom_actions_bucket_name, + test_datadir, +): + script_name = "custom_action.sh" + bucket_name, https_url, s3_url = _create_custom_action_urls( + region, custom_actions_bucket_name, script_name, test_datadir + ) + + config_template_args = { + "bucket_name": bucket_name, + "on_node_start_script_sequence": _create_script_sequence("on_node_start", https_url, s3_url), + "on_node_configured_script_sequence": _create_script_sequence("on_node_configured", https_url, s3_url), + "on_node_updated_script_sequence": _create_script_sequence("on_node_updated", https_url, s3_url), + } + + if scheduler != "slurm": + config_template_args["vcpus"] = get_instance_vcpus(region, instance) * NUM_OF_COMPUTE_INSTANCES + + initial_config_file = pcluster_config_reader(**config_template_args) + updated_config_file = pcluster_config_reader("pcluster.config.update.yaml", **config_template_args) cluster_name = generate_stack_name("integ-tests", request.config.getoption("stackname_suffix")) cluster_operations_client = cluster_operations_api.ClusterOperationsApi(api_client) @@ -180,6 +285,40 @@ def _test_cluster_workflow(region, api_client, create_cluster, request, pcluster _test_delete_cluster(region, cluster_operations_client, cluster_name) +def _create_script_sequence(event_name, https_url, s3_url): + sequence = [] + + for i in range(10): + if i % 2 == 0: + cache_affinity = (i / 2) % 2 + url = f"{https_url}&cache_affinity={cache_affinity}" + else: + url = s3_url + + sequence.append( + { + "script": url, + "args": [f"echo {event_name} {i}"], + } + ) + + return sequence + + +def _create_custom_action_urls(region, custom_actions_bucket_name, script_name, test_datadir): + bucket_name = custom_actions_bucket_name + bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name) + script_path = f"scripts/{script_name}" + bucket.upload_file(str(test_datadir / script_name), script_path) + s3_url = f"s3://{bucket_name}/{script_path}" + https_url = boto3.client("s3", region_name=region).generate_presigned_url( + "get_object", + Params={"Bucket": bucket_name, "Key": script_path}, + ExpiresIn=86400, + ) + return bucket_name, 
https_url, s3_url + + def _test_describe_cluster_head_node(region, client, cluster_name): response = client.describe_cluster_instances( cluster_name=cluster_name, node_type=NodeType("HeadNode"), region=region @@ -314,6 +453,7 @@ def _test_create_cluster(client, create_cluster, cluster_name, config): cluster, response = create_cluster(client, cluster_name, config) LOGGER.info("Create cluster response: %s", response) assert_that(response.cluster.cluster_name).is_equal_to(cluster_name) + return cluster diff --git a/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_awsbatch/custom_action.sh b/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_awsbatch/custom_action.sh new file mode 100644 index 0000000000..0e9f3c5b4d --- /dev/null +++ b/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_awsbatch/custom_action.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +echo "custom_action script has $# arguments" +for arg in "$@" +do + echo "executing ${arg}" + "${arg}" || echo "failed to execute ${arg}" +done diff --git a/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_awsbatch/pcluster.config.update.yaml b/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_awsbatch/pcluster.config.update.yaml index 9446d56781..add5f09b89 100644 --- a/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_awsbatch/pcluster.config.update.yaml +++ b/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_awsbatch/pcluster.config.update.yaml @@ -13,6 +13,29 @@ HeadNode: KeyName: {{ key_name }} Imds: Secured: False + Iam: + S3Access: + - BucketName: {{ bucket_name }} + EnableWriteAccess: true + CustomActions: + OnNodeStart: + Sequence: + {% for item in on_node_start_script_sequence %} + - Args: + {% for arg in item.args %} + - {{ arg }} + {% endfor %} + Script: {{ item.script }} + {% endfor %} + OnNodeConfigured: + Sequence: + {% for item in on_node_configured_script_sequence %} + - Args: + {% for arg in item.args %} + - {{ arg }} + {% endfor %} + Script: {{ item.script }} + {% endfor %} Scheduling: Scheduler: awsbatch AwsBatchQueues: diff --git a/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_awsbatch/pcluster.config.yaml b/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_awsbatch/pcluster.config.yaml index 8c9d28f5cc..7e1eda9a84 100644 --- a/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_awsbatch/pcluster.config.yaml +++ b/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_awsbatch/pcluster.config.yaml @@ -13,6 +13,29 @@ HeadNode: KeyName: {{ key_name }} Imds: Secured: False + Iam: + S3Access: + - BucketName: {{ bucket_name }} + EnableWriteAccess: true + CustomActions: + OnNodeStart: + Sequence: + {% for item in on_node_start_script_sequence %} + - Args: + {% for arg in item.args %} + - {{ arg }} + {% endfor %} + Script: {{ item.script }} + {% endfor %} + OnNodeConfigured: + Sequence: + {% for item in on_node_configured_script_sequence %} + - Args: + {% for arg in item.args %} + - {{ arg }} + {% endfor %} + Script: {{ item.script }} + {% endfor %} Scheduling: Scheduler: awsbatch AwsBatchQueues: diff --git a/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_slurm/custom_action.sh b/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_slurm/custom_action.sh new file mode 100644 index 0000000000..0e9f3c5b4d --- /dev/null +++ b/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_slurm/custom_action.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +echo 
"custom_action script has $# arguments" +for arg in "$@" +do + echo "executing ${arg}" + "${arg}" || echo "failed to execute ${arg}" +done diff --git a/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_slurm/pcluster.config.update.yaml b/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_slurm/pcluster.config.update.yaml index 0a9a58aa64..9d27eba14a 100644 --- a/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_slurm/pcluster.config.update.yaml +++ b/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_slurm/pcluster.config.update.yaml @@ -6,6 +6,38 @@ HeadNode: SubnetId: {{ public_subnet_id }} Ssh: KeyName: {{ key_name }} + Iam: + S3Access: + - BucketName: {{ bucket_name }} + EnableWriteAccess: true + CustomActions: + OnNodeStart: + Sequence: + {% for item in on_node_start_script_sequence %} + - Args: + {% for arg in item.args %} + - {{ arg }} + {% endfor %} + Script: {{ item.script }} + {% endfor %} + OnNodeConfigured: + Sequence: + {% for item in on_node_configured_script_sequence %} + - Args: + {% for arg in item.args %} + - {{ arg }} + {% endfor %} + Script: {{ item.script }} + {% endfor %} + OnNodeUpdated: + Sequence: + {% for item in on_node_updated_script_sequence %} + - Args: + {% for arg in item.args %} + - {{ arg }} + {% endfor %} + Script: {{ item.script }} + {% endfor %} Scheduling: Scheduler: {{ scheduler }} SlurmSettings: diff --git a/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_slurm/pcluster.config.yaml b/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_slurm/pcluster.config.yaml index af0c711129..66b57b0473 100644 --- a/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_slurm/pcluster.config.yaml +++ b/tests/integration-tests/tests/pcluster_api/test_api/test_cluster_slurm/pcluster.config.yaml @@ -6,6 +6,38 @@ HeadNode: SubnetId: {{ public_subnet_id }} Ssh: KeyName: {{ key_name }} + Iam: + S3Access: + - BucketName: {{ bucket_name }} + EnableWriteAccess: true + CustomActions: + OnNodeStart: + Sequence: + {% for item in on_node_start_script_sequence %} + - Args: + {% for arg in item.args %} + - {{ arg }} + {% endfor %} + Script: {{ item.script }} + {% endfor %} + OnNodeConfigured: + Sequence: + {% for item in on_node_configured_script_sequence %} + - Args: + {% for arg in item.args %} + - {{ arg }} + {% endfor %} + Script: {{ item.script }} + {% endfor %} + OnNodeUpdated: + Sequence: + {% for item in on_node_updated_script_sequence %} + - Args: + {% for arg in item.args %} + - {{ arg }} + {% endfor %} + Script: {{ item.script }} + {% endfor %} Scheduling: Scheduler: {{ scheduler }} SlurmSettings: diff --git a/tests/integration-tests/tests/pcluster_api/test_api_infrastructure.py b/tests/integration-tests/tests/pcluster_api/test_api_infrastructure.py index a5ff28458f..de4edb1efb 100644 --- a/tests/integration-tests/tests/pcluster_api/test_api_infrastructure.py +++ b/tests/integration-tests/tests/pcluster_api/test_api_infrastructure.py @@ -11,7 +11,6 @@ # See the License for the specific language governing permissions and limitations under the License. 
import logging import re -import time import boto3 import botocore @@ -30,18 +29,22 @@ @pytest.fixture() -def api_with_default_settings(api_infrastructure_s3_uri, public_ecr_image_uri, api_definition_s3_uri, request, region): +def api_with_default_settings( + api_infrastructure_s3_uri, api_definition_s3_uri, policies_uri, request, region, resource_bucket +): factory = CfnStacksFactory(request.config.getoption("credential")) params = [] if api_definition_s3_uri: params.append({"ParameterKey": "ApiDefinitionS3Uri", "ParameterValue": api_definition_s3_uri}) - if public_ecr_image_uri: - params.append({"ParameterKey": "PublicEcrImageUri", "ParameterValue": public_ecr_image_uri}) + if policies_uri: + params.append({"ParameterKey": "PoliciesTemplateUri", "ParameterValue": policies_uri}) + if resource_bucket: + params.append({"ParameterKey": "CustomBucket", "ParameterValue": resource_bucket}) template = ( api_infrastructure_s3_uri - or f"https://{region}-aws-parallelcluster.s3.{region}.amazonaws.com{'.cn' if region.startswith('cn') else ''}" + or f"https://{resource_bucket}.s3.{region}.amazonaws.com{'.cn' if region.startswith('cn') else ''}" f"/parallelcluster/{get_installed_parallelcluster_version()}/api/parallelcluster-api.yaml" ) logging.info(f"Creating API Server stack in {region} with template {template}") @@ -67,31 +70,23 @@ def test_api_infrastructure_with_default_parameters(region, api_with_default_set """ parallelcluster_lambda_name = api_with_default_settings.cfn_resources["ParallelClusterFunction"] parallelcluster_lambda_arn = api_with_default_settings.cfn_outputs["ParallelClusterLambdaArn"] - parallelcluster_api_copied_image_uri = api_with_default_settings.cfn_outputs["UriOfCopyOfPublicEcrImage"] parallelcluster_api_id = api_with_default_settings.cfn_resources["ApiGatewayApiWithoutCustomDomain"] parallelcluster_api_url = api_with_default_settings.cfn_outputs["ParallelClusterApiInvokeUrl"] parallelcluster_user_role = api_with_default_settings.cfn_outputs["ParallelClusterApiUserRole"] - image_builder_pipeline = api_with_default_settings.cfn_outputs["ParallelClusterDockerUpdateImagePipeline"] - _assert_parallelcluster_lambda( - lambda_name=parallelcluster_lambda_name, - lambda_arn=parallelcluster_lambda_arn, - lambda_image_uri=parallelcluster_api_copied_image_uri, - ) + _assert_parallelcluster_lambda(lambda_name=parallelcluster_lambda_name, lambda_arn=parallelcluster_lambda_arn) _assert_parallelcluster_api(api_id=parallelcluster_api_id, api_url=parallelcluster_api_url) _test_auth(region, parallelcluster_user_role, parallelcluster_api_url) - _test_docker_image_refresh(image_builder_pipeline, parallelcluster_lambda_name) _test_api_deletion(api_with_default_settings) -def _assert_parallelcluster_lambda(lambda_name, lambda_arn, lambda_image_uri): +def _assert_parallelcluster_lambda(lambda_name, lambda_arn): """Check that the ParallelCluster Lambda is correctly configured :param client: the Lambda client :param lambda_name: the name of the ParallelCluster Lambda :param lambda_arn: the ARN of the ParallelCluster Lambda - :param lambda_image_uri: the URI of the local copy of the ParallelCluster Lambda Docker image """ logging.info("Checking Lambda configuration") @@ -100,13 +95,15 @@ def _assert_parallelcluster_lambda(lambda_name, lambda_arn, lambda_image_uri): lambda_configuration = lambda_resource["Configuration"] assert_that(lambda_configuration["FunctionArn"]).is_equal_to(lambda_arn) assert_that(lambda_configuration["Timeout"]).is_equal_to(30) + 
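Editor's note on the `Layers` assertions added just below: with the container-image checks removed, the ParallelCluster Lambda appears to ship its code via a Lambda layer, and the `GetFunction` API reports attached layers under `Configuration["Layers"]`. A minimal sketch of that lookup; the function name is a placeholder, not taken from this diff.

```python
# Standalone sketch: how attached layers surface in the GetFunction response.
# "ParallelClusterFunction-example" is a placeholder name.
import boto3

configuration = boto3.client("lambda").get_function(
    FunctionName="ParallelClusterFunction-example"
)["Configuration"]

layers = configuration.get("Layers", [])
print(f"{len(layers)} layer(s) attached")
for layer in layers:
    print(layer["Arn"])  # e.g. arn:aws:lambda:<region>:<account>:layer:<name>:<version>
```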
assert_that(lambda_configuration).contains("Layers") + assert_that(len(lambda_configuration["Layers"])).is_equal_to(1) + assert_that(lambda_configuration["Layers"][0]).contains("Arn") if "TracingConfig" in lambda_configuration: # When executed in GovCloud get_function does not return TracingConfig assert_that(lambda_configuration["TracingConfig"]["Mode"]).is_equal_to("Active") assert_that(lambda_configuration["MemorySize"]).is_equal_to(2048) assert_that(lambda_resource["Tags"]).contains("parallelcluster:version") assert_that(lambda_resource["Tags"]).contains("parallelcluster:resource") - assert_that(lambda_resource["Code"]["ImageUri"]).is_equal_to(lambda_image_uri) def _assert_parallelcluster_api(api_id, api_url): @@ -167,35 +164,7 @@ def _test_api_deletion(api_stack): cfn = boto3.client("cloudformation") cfn.delete_stack(StackName=api_stack.name) - cfn.get_waiter("stack_delete_complete").wait( - StackName=api_stack.name, - ) - - -def _test_docker_image_refresh(image_builder_pipeline, lambda_name): - logging.info("Testing ImageBuilder pipeline and docker image refresh") - - image_builder = boto3.client("imagebuilder") - image_builder.start_image_pipeline_execution( - imagePipelineArn=image_builder_pipeline, - ) - response = image_builder.list_image_pipeline_images( - imagePipelineArn=image_builder_pipeline, - ) - - assert_that(response["imageSummaryList"]).is_length(1) - image = _wait_for_image_build(image_builder_pipeline) - logging.info("Image %s", image) - assert_that(image["state"]["status"]).is_equal_to("AVAILABLE") - - # Wait for 2 minutes for the Lambda to be updated - time.sleep(120) - lambda_client = boto3.client("lambda") - lambda_resource = lambda_client.get_function(FunctionName=lambda_name) - logging.info("API Lambda %s", lambda_resource) - assert_that(lambda_resource["Code"]["ImageUri"]).is_equal_to( - image["outputResources"]["containers"][0]["imageUris"][0] - ) + cfn.get_waiter("stack_delete_complete").wait(StackName=api_stack.name) @retry( diff --git a/tests/integration-tests/tests/scaling/test_scaling.py b/tests/integration-tests/tests/scaling/test_scaling.py index d080d72cb4..eb4c6f88e2 100644 --- a/tests/integration-tests/tests/scaling/test_scaling.py +++ b/tests/integration-tests/tests/scaling/test_scaling.py @@ -36,11 +36,22 @@ def test_multiple_jobs_submission( # Test jobs should take at most 9 minutes to be executed. # These guarantees that the jobs are executed in parallel. 
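Editor's note for the partition check added a few lines below (`assert_that(partitions).is_length(no_of_queues)`): each configured Slurm queue becomes a partition, so the new 100-queue template should yield 100 partitions on the head node. `get_partitions()` itself is not defined in this diff; a minimal sketch of an equivalent check via `sinfo`:

```python
# Sketch only; the framework's get_partitions() helper is assumed to do
# something equivalent to listing partition names with sinfo.
import subprocess

def list_partitions():
    # -h suppresses the header, "%R" prints the bare partition name
    output = subprocess.run(["sinfo", "-h", "-o", "%R"], capture_output=True, text=True, check=True).stdout
    return sorted(set(output.split()))

print(len(list_partitions()))  # expected to equal no_of_queues (100) in the updated test
```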
max_jobs_execution_time = 9 + # Test using the max no of queues because the scheduler and node daemon operations take slight longer + # with multiple queues + no_of_queues = 100 - cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime) + cluster_config = pcluster_config_reader( + scaledown_idletime=scaledown_idletime, + no_of_queues=no_of_queues, + ) cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = scheduler_commands_factory(remote_command_executor) + + # Check if the multiple partitions were created on Slurm + partitions = scheduler_commands.get_partitions() + assert_that(partitions).is_length(no_of_queues) + scheduler = "slurm" if scheduler == "slurm_plugin" else scheduler logging.info("Executing sleep job to start a dynamic node") diff --git a/tests/integration-tests/tests/scaling/test_scaling/test_multiple_jobs_submission/pcluster.config.yaml b/tests/integration-tests/tests/scaling/test_scaling/test_multiple_jobs_submission/pcluster.config.yaml index a205075087..96e81cd861 100644 --- a/tests/integration-tests/tests/scaling/test_scaling/test_multiple_jobs_submission/pcluster.config.yaml +++ b/tests/integration-tests/tests/scaling/test_scaling/test_multiple_jobs_submission/pcluster.config.yaml @@ -18,7 +18,8 @@ Scheduling: ScaledownIdletime: {{ scaledown_idletime }} {% endif %} {{ scheduler_prefix }}Queues: - - Name: queue-0 + {% for q in range(no_of_queues) %} + - Name: queue-{{q}} ComputeResources: - Name: compute-resource-0 Instances: @@ -26,3 +27,4 @@ Scheduling: Networking: SubnetIds: - {{ public_subnet_id }} + {% endfor %} diff --git a/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues.before_update_arm64.yaml b/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues.before_update_arm64.yaml index 0f5db4d49d..d615296597 100644 --- a/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues.before_update_arm64.yaml +++ b/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues.before_update_arm64.yaml @@ -23,10 +23,10 @@ EphemeralVolume: null RootVolume: Encrypted: true - Iops: 3000 Size: null - Throughput: 125 VolumeType: gp3 + Throughput: 125 + Iops: 3000 CustomActions: null CustomSettings: null Iam: diff --git a/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues.before_update_x86_64.yaml b/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues.before_update_x86_64.yaml index 8d09121121..34e7c911ea 100644 --- a/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues.before_update_x86_64.yaml +++ b/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues.before_update_x86_64.yaml @@ -23,10 +23,10 @@ EphemeralVolume: null RootVolume: Encrypted: true - Iops: 3000 Size: null - Throughput: 125 VolumeType: gp3 + Throughput: 125 + Iops: 3000 CustomActions: null CustomSettings: null Iam: diff --git a/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues_arm64.yaml 
b/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues_arm64.yaml index 8754af1e1d..66c84f6173 100644 --- a/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues_arm64.yaml +++ b/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues_arm64.yaml @@ -40,10 +40,10 @@ EphemeralVolume: null RootVolume: Encrypted: true - Iops: 3000 Size: null - Throughput: 125 VolumeType: gp3 + Throughput: 125 + Iops: 3000 CustomActions: null CustomSettings: null Iam: @@ -89,10 +89,10 @@ EphemeralVolume: null RootVolume: Encrypted: true - Iops: 3000 Size: null - Throughput: 125 VolumeType: gp3 + Throughput: 125 + Iops: 3000 CustomActions: null CustomSettings: null Iam: diff --git a/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues_x86_64.yaml b/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues_x86_64.yaml index 7ceb74d126..c2ef87ea33 100644 --- a/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues_x86_64.yaml +++ b/tests/integration-tests/tests/scheduler_plugin/test_scheduler_plugin/test_scheduler_plugin_integration/scheduler_queues_x86_64.yaml @@ -40,10 +40,10 @@ EphemeralVolume: null RootVolume: Encrypted: true - Iops: 3000 Size: null - Throughput: 125 VolumeType: gp3 + Throughput: 125 + Iops: 3000 CustomActions: null CustomSettings: null Iam: @@ -89,10 +89,10 @@ EphemeralVolume: null RootVolume: Encrypted: true - Iops: 3000 Size: null - Throughput: 125 VolumeType: gp3 + Throughput: 125 + Iops: 3000 CustomActions: null CustomSettings: null Iam: diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index feb0c262c2..cdf86df93d 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -22,7 +22,13 @@ from retrying import retry from tags_utils import convert_tags_dicts_to_tags_list, get_compute_node_tags from time_utils import minutes, seconds -from utils import check_status, get_compute_nodes_instance_ids, get_instance_info, wait_for_computefleet_changed +from utils import ( + check_status, + get_compute_nodes_instance_ids, + get_instance_info, + test_cluster_health_metric, + wait_for_computefleet_changed, +) from tests.common.assertions import ( assert_lines_in_logs, @@ -47,7 +53,8 @@ wait_for_num_nodes_in_scheduler, ) from tests.common.mpi_common import compile_mpi_ring -from tests.common.schedulers_common import TorqueCommands +from tests.common.schedulers_common import SlurmCommands, TorqueCommands +from tests.monitoring import structured_log_event_utils @pytest.mark.usefixtures("instance", "os") @@ -120,6 +127,7 @@ def test_slurm( gpu_instance_type, clustermgtd_conf_path, slurm_root_path, + slurm_commands, ) @@ -288,6 +296,7 @@ def test_slurm_protected_mode( ) pending_job_id = _test_active_job_running(scheduler_commands, remote_command_executor, clustermgtd_conf_path) _test_protected_mode(scheduler_commands, remote_command_executor, cluster) + test_cluster_health_metric(["NoCorrespondingInstanceErrors", "OnNodeStartRunErrors"], cluster.cfn_name, region) _test_job_run_in_working_queue(scheduler_commands) _test_recover_from_protected_mode(pending_job_id, 
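Editor's note: `test_cluster_health_metric`, newly imported from `utils` above, is invoked several times in this file but its body is not part of this diff. Conceptually it verifies that the named compute-node error metrics are being published for the cluster; the sketch below shows one way such a check could look. The CloudWatch namespace and dimension names are assumptions, not taken from this diff.

```python
# Hypothetical sketch of a health-metric existence check; the namespace and
# dimension used here are assumptions for illustration only.
import boto3

def metric_is_published(metric_name, cluster_name, region):
    cloudwatch = boto3.client("cloudwatch", region_name=region)
    response = cloudwatch.list_metrics(
        Namespace="ParallelCluster",                                  # assumed namespace
        MetricName=metric_name,
        Dimensions=[{"Name": "ClusterName", "Value": cluster_name}],  # assumed dimension
    )
    return len(response["Metrics"]) > 0

print(metric_is_published("NoCorrespondingInstanceErrors", "integ-tests-example", "us-east-1"))
```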
pcluster_config_reader, bucket_name, cluster, scheduler_commands) @@ -329,6 +338,7 @@ def test_fast_capacity_failover( clusters_factory, test_datadir, scheduler_commands_factory, + region, ): cluster_config = pcluster_config_reader() cluster = clusters_factory(cluster_config) @@ -349,6 +359,10 @@ def test_fast_capacity_failover( static_nodes_in_ice_compute_resource, ice_dynamic_nodes, ) + structured_log_event_utils.assert_that_event_exists( + cluster, r".+\.slurm_resume_events", "node-launch-failure-count" + ) + test_cluster_health_metric(["InsufficientCapacityErrors"], cluster.cfn_name, region) # remove logs from slurm_resume log and clustermgtd log in order to check logs after disable fast capacity fail-over remote_command_executor.run_remote_command("sudo truncate -s 0 /var/log/parallelcluster/slurm_resume.log") remote_command_executor.run_remote_command("sudo truncate -s 0 /var/log/parallelcluster/clustermgtd") @@ -384,6 +398,96 @@ def test_slurm_config_update( ) +@pytest.mark.usefixtures("region", "os", "instance", "scheduler") +@pytest.mark.slurm_custom_config_parameters +def test_slurm_custom_config_parameters( + region, + pcluster_config_reader, + clusters_factory, + test_datadir, + s3_bucket_factory, + scheduler_commands_factory, +): + """Test slurm custom settings.""" + # When launching a cluster with a Yaml with custom config parameters + + # Prepare bucket + bucket_name = s3_bucket_factory() + bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name) + bucket.upload_file(str(test_datadir / "custom_slurm_settings.txt"), "custom_slurm_settings.conf") + slurm_settings_file = f" s3://{bucket_name}/custom_slurm_settings.conf" + + cluster_config = pcluster_config_reader(bucket=bucket_name) + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + slurm_commands = SlurmCommands(remote_command_executor) + + # then we expect custom values to be set in slurm config + # Bulk Settings + debug_flags = slurm_commands.get_conf_param("DebugFlags") + # must check them individually because they can be reordered + assert "Steps" in debug_flags + assert "Power" in debug_flags + assert "CpuFrequency" in debug_flags + assert "medium" == slurm_commands.get_conf_param("GpuFreqDef") + assert "5000" == slurm_commands.get_conf_param("MaxStepCount") + + # Partition + assert "5" == slurm_commands.get_partition_info("q1", "GraceTime") + assert "500" == slurm_commands.get_partition_info("q1", "MaxMemPerNode") + + # ComputeResource 1 + assert "50" == slurm_commands.get_node_attribute("q1-dy-cr1-1", "Weight") + assert "10000" == slurm_commands.get_node_attribute("q1-dy-cr1-1", "Port") + assert "2000" == slurm_commands.get_node_attribute("q1-dy-cr1-1", "Memory") + + # ComputeResource 2 + assert "150" == slurm_commands.get_node_attribute("q1-dy-cr2-1", "Weight") + assert "10010" == slurm_commands.get_node_attribute("q1-dy-cr2-1", "Port") + assert "2500" == slurm_commands.get_node_attribute("q1-dy-cr2-1", "Memory") + + # Update the cluster changing Queue and Compute resources settings in the yaml + # and Bulk settings with a config file in S3 + + updated_config_file = pcluster_config_reader( + config_file="pcluster.config.update.yaml", custom_settings_file=slurm_settings_file, bucket=bucket_name + ) + + # apply the changes on the cluster + logging.info("Updating the cluster to remove all the shared storage (managed storage will be retained)") + cluster.stop() + wait_for_computefleet_changed(cluster, "STOPPED") + cluster.update(str(updated_config_file)) + 
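Editor's note: the `get_conf_param` assertions above (and the post-update ones that follow) read Slurm's live configuration, which is where the `CustomSlurmSettings` bulk options land. The helper's implementation is not in this diff; the sketch below assumes it is backed by `scontrol show config` on the head node.

```python
# Rough sketch, assuming SlurmCommands.get_conf_param parses `scontrol show config`,
# whose lines look like "GpuFreqDef              = medium".
import subprocess

def conf_param(name):
    output = subprocess.run(["scontrol", "show", "config"], capture_output=True, text=True, check=True).stdout
    for line in output.splitlines():
        key, sep, value = line.partition("=")
        if sep and key.strip() == name:
            return value.strip()
    return None

print(conf_param("DebugFlags"))    # Steps, Power, CpuFrequency (+ BurstBuffer, Network after the update)
print(conf_param("GpuFreqDef"))    # "medium" initially, "high" after the update
print(conf_param("MaxStepCount"))  # "5000" initially, "10000" after the update
```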
cluster.start() + wait_for_computefleet_changed(cluster, "RUNNING") + + # then we expect updated custom values to be set in slurm config + # Bulk Settings + debug_flags = slurm_commands.get_conf_param("DebugFlags") + # must check them individually because they can be reordered + assert "Steps" in debug_flags + assert "Power" in debug_flags + assert "CpuFrequency" in debug_flags + assert "BurstBuffer" in debug_flags + assert "Network" in debug_flags + assert "high" == slurm_commands.get_conf_param("GpuFreqDef") + assert "10000" == slurm_commands.get_conf_param("MaxStepCount") + + # Partition + assert "15" == slurm_commands.get_partition_info("q1", "GraceTime") + assert "1500" == slurm_commands.get_partition_info("q1", "MaxMemPerNode") + + # ComputeResource 1 + assert "75" == slurm_commands.get_node_attribute("q1-dy-cr1-1", "Weight") + assert "20000" == slurm_commands.get_node_attribute("q1-dy-cr1-1", "Port") + assert "4000" == slurm_commands.get_node_attribute("q1-dy-cr1-1", "Memory") + + # ComputeResource 2 + assert "250" == slurm_commands.get_node_attribute("q1-dy-cr2-1", "Weight") + assert "25000" == slurm_commands.get_node_attribute("q1-dy-cr2-1", "Port") + assert "4100" == slurm_commands.get_node_attribute("q1-dy-cr2-1", "Memory") + + @pytest.mark.usefixtures("region", "os", "instance", "scheduler") @pytest.mark.slurm_memory_based_scheduling def test_slurm_memory_based_scheduling( @@ -938,13 +1042,11 @@ def _test_cloud_node_health_check( _assert_slurmd_timeout(remote_command_executor, timeout=180) # Nodes with networking failures should fail slurm health check before failing ec2_status_check # Test on freshly launched dynamic nodes - kill_job_id = _submit_kill_networking_job( + _submit_kill_networking_job( remote_command_executor, scheduler_commands, partition, node_type="dynamic", num_nodes=num_dynamic_nodes ) # Sleep for a bit so the command to detach network interface can be run time.sleep(15) - # Job will hang, cancel it manually to avoid waiting for job failing - scheduler_commands.cancel_job(kill_job_id) # Assert nodes are put into DOWN for not responding # TO-DO: this test only works with num_dynamic = 1 because slurm will record this error in nodelist format # i.e. 
error: Nodes q2-st-t2large-[1-2] not responding, setting DOWN @@ -954,6 +1056,7 @@ def _test_cloud_node_health_check( ["/var/log/slurmctld.log"], ["Nodes {} not responding, setting DOWN".format(",".join(dynamic_nodes))], ) + test_cluster_health_metric(["SlurmNodeNotRespondingErrors"], cluster_name, region) # Assert dynamic nodes are reset _wait_for_node_reset(scheduler_commands, static_nodes=[], dynamic_nodes=dynamic_nodes) assert_num_instances_in_cluster(cluster_name, region, len(static_nodes)) @@ -989,6 +1092,7 @@ def _test_ec2_status_check_replacement( ["/var/log/parallelcluster/clustermgtd"], ["Setting nodes failing health check type ec2_health_check to DRAIN"], ) + test_cluster_health_metric(["EC2HealthCheckErrors"], cluster_name, region) scheduler_commands.cancel_job(kill_job_id) # Assert static nodes are reset _wait_for_node_reset( @@ -1373,7 +1477,7 @@ def _gpu_resource_check(slurm_commands, partition, instance_type, instance_type_ def _test_slurm_version(remote_command_executor): logging.info("Testing Slurm Version") version = remote_command_executor.run_remote_command("sinfo -V").stdout - assert_that(version).is_equal_to("slurm 22.05.8") + assert_that(version).is_equal_to("slurm 23.02.1") def _test_job_dependencies(slurm_commands, region, stack_name, scaledown_idletime): @@ -1716,6 +1820,7 @@ def _test_compute_node_bootstrap_timeout( gpu_instance_type, clustermgtd_conf_path, slurm_root_path, + slurm_commands, ): """Test compute_node_bootstrap_timeout is passed into slurm.conf and parallelcluster_clustermgtd.conf.""" slurm_parallelcluster_conf = remote_command_executor.run_remote_command( @@ -1725,7 +1830,7 @@ def _test_compute_node_bootstrap_timeout( clustermgtd_conf = remote_command_executor.run_remote_command(f"sudo cat {clustermgtd_conf_path}").stdout assert_that(clustermgtd_conf).contains(f"node_replacement_timeout = {compute_node_bootstrap_timeout}") # Update cluster - update_compute_node_bootstrap_timeout = 1200 + update_compute_node_bootstrap_timeout = 10 updated_config_file = pcluster_config_reader( scaledown_idletime=scaledown_idletime, gpu_instance_type=gpu_instance_type, @@ -1741,6 +1846,16 @@ def _test_compute_node_bootstrap_timeout( assert_that(clustermgtd_conf).contains(f"node_replacement_timeout = {update_compute_node_bootstrap_timeout}") assert_that(clustermgtd_conf).does_not_contain(f"node_replacement_timeout = {compute_node_bootstrap_timeout}") + slurm_commands.submit_command_and_assert_job_accepted( + submit_command_args={ + "command": "sleep 1", + "partition": "ondemand", + "constraint": "c5.xlarge", + "nodes": 2, + } + ) + test_cluster_health_metric(["InstanceBootstrapTimeoutErrors"], cluster.cfn_name, cluster.region) + def _retrieve_slurm_root_path(remote_command_executor): return remote_command_executor.run_remote_command("dirname $(dirname $(which scontrol))").stdout @@ -2098,8 +2213,8 @@ def _test_memory_based_scheduling_enabled_true( assert_that(slurm_commands.get_job_info(job_id_1, field="JobState")).is_equal_to("RUNNING") assert_that(slurm_commands.get_job_info(job_id_2, field="JobState")).is_equal_to("PENDING") # Check that memory appears in the TRES allocated for the job - assert_that(slurm_commands.get_job_info(job_id_1, field="TRES")).contains("mem=2000M") - assert_that(slurm_commands.get_job_info(job_id_2, field="TRES")).contains("mem=2000M") + assert_that(slurm_commands.get_job_info(job_id_1, field="ReqTRES")).contains("mem=2000M") + assert_that(slurm_commands.get_job_info(job_id_2, field="ReqTRES")).contains("mem=2000M") 
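Editor's note: the change from `field="TRES"` to `field="ReqTRES"` just above appears to track the newer `scontrol show job` output, which reports requested trackable resources as `ReqTRES=...` (with `AllocTRES=...` alongside). `get_job_info` itself is not shown in this diff; a minimal standalone sketch of extracting that field:

```python
# Standalone sketch (get_job_info is assumed to parse `scontrol show job`).
import re
import subprocess

def req_tres(job_id):
    output = subprocess.run(
        ["scontrol", "show", "job", str(job_id)], capture_output=True, text=True, check=True
    ).stdout
    match = re.search(r"ReqTRES=(\S+)", output)
    return match.group(1) if match else ""

# e.g. "cpu=1,mem=2000M,node=1" for the 2000 MB jobs submitted above; 123 is a placeholder id.
print(req_tres(123))
```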
slurm_commands.wait_job_completed(job_id_1) slurm_commands.wait_job_completed(job_id_2) @@ -2182,7 +2297,7 @@ def trigger_slurm_reconfigure_race_condition(remote_command_executor): remote_command_executor, "/var/log/slurmctld.log", "slurmctld version .* started on cluster" ) reconfigure_time = _get_latest_timestamp_for_log_entry( - remote_command_executor, "/var/log/slurmctld.log", "_slurm_rpc_reconfigure_controller: completed" + remote_command_executor, "/var/log/slurmctld.log", "reconfigure_slurm: completed" ) assert_that(restart_time.second).is_equal_to(reconfigure_time.second) assert_that((reconfigure_time - restart_time).total_seconds()).is_less_than_or_equal_to(1.0) diff --git a/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/pcluster.update.config.yaml b/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/pcluster.update.config.yaml index 39cbd5f22b..b5b3df63f3 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/pcluster.update.config.yaml +++ b/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/pcluster.update.config.yaml @@ -22,6 +22,7 @@ Scheduling: - Name: same-name-diff-queue Instances: - InstanceType: c5.xlarge + MinCount: 1 MaxCount: 5 - Name: gpu Networking: diff --git a/tests/integration-tests/tests/schedulers/test_slurm/test_slurm_custom_config_parameters/custom_slurm_settings.txt b/tests/integration-tests/tests/schedulers/test_slurm/test_slurm_custom_config_parameters/custom_slurm_settings.txt new file mode 100644 index 0000000000..0063483bed --- /dev/null +++ b/tests/integration-tests/tests/schedulers/test_slurm/test_slurm_custom_config_parameters/custom_slurm_settings.txt @@ -0,0 +1,9 @@ +# +# Example slurm.conf file with custom settings +# +# Put this file on all nodes of your cluster. +# See the slurm.conf man page for more information. 
+# +DebugFlags=Steps,Power,CpuFrequency,BurstBuffer,Network +GpuFreqDef=high +MaxStepCount=10000 diff --git a/tests/integration-tests/tests/schedulers/test_slurm/test_slurm_custom_config_parameters/pcluster.config.update.yaml b/tests/integration-tests/tests/schedulers/test_slurm/test_slurm_custom_config_parameters/pcluster.config.update.yaml new file mode 100644 index 0000000000..876b7cc37d --- /dev/null +++ b/tests/integration-tests/tests/schedulers/test_slurm/test_slurm_custom_config_parameters/pcluster.config.update.yaml @@ -0,0 +1,42 @@ +Image: + Os: {{ os }} +HeadNode: + InstanceType: {{ instance }} + Networking: + SubnetId: {{ public_subnet_id }} + Ssh: + KeyName: {{ key_name }} + Iam: + S3Access: + - BucketName: {{ bucket }} + EnableWriteAccess: true +Scheduling: + Scheduler: slurm + SlurmSettings: + CustomSlurmSettingsIncludeFile: {{ custom_settings_file }} + SlurmQueues: + - Name: q1 + CustomSlurmSettings: + GraceTime: 15 + MaxMemPerNode: 1500 + Networking: + SubnetIds: + - {{ private_subnet_id }} + ComputeResources: + - Name: cr1 + CustomSlurmSettings: + Port: 20000 + RealMemory: 4000 + Weight: 75 + Instances: + - InstanceType: t2.large + MinCount: 0 + - Name: cr2 + CustomSlurmSettings: + Port: 25000 + RealMemory: 4100 + Weight: 250 + Instances: + - InstanceType: t2.large + MinCount: 0 + diff --git a/tests/integration-tests/tests/schedulers/test_slurm/test_slurm_custom_config_parameters/pcluster.config.yaml b/tests/integration-tests/tests/schedulers/test_slurm/test_slurm_custom_config_parameters/pcluster.config.yaml new file mode 100644 index 0000000000..cc9884a657 --- /dev/null +++ b/tests/integration-tests/tests/schedulers/test_slurm/test_slurm_custom_config_parameters/pcluster.config.yaml @@ -0,0 +1,44 @@ +Image: + Os: {{ os }} +HeadNode: + InstanceType: {{ instance }} + Networking: + SubnetId: {{ public_subnet_id }} + Ssh: + KeyName: {{ key_name }} + Iam: + S3Access: + - BucketName: {{ bucket }} + EnableWriteAccess: true +Scheduling: + Scheduler: slurm + SlurmSettings: + CustomSlurmSettings: + - DebugFlags: Steps,Power,CpuFrequency + - GpuFreqDef: medium + - MaxStepCount: 5000 + SlurmQueues: + - Name: q1 + CustomSlurmSettings: + GraceTime: 5 + MaxMemPerNode: 500 + Networking: + SubnetIds: + - {{ private_subnet_id }} + ComputeResources: + - Name: cr1 + CustomSlurmSettings: + Port: 10000 + RealMemory: 2000 + Weight: 50 + Instances: + - InstanceType: t2.large + MinCount: 0 + - Name: cr2 + CustomSlurmSettings: + Port: 10010 + RealMemory: 2500 + Weight: 150 + Instances: + - InstanceType: t2.large + MinCount: 0 diff --git a/tests/integration-tests/tests/schedulers/test_slurm_accounting.py b/tests/integration-tests/tests/schedulers/test_slurm_accounting.py index f52fba4845..be694d3d18 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm_accounting.py +++ b/tests/integration-tests/tests/schedulers/test_slurm_accounting.py @@ -123,6 +123,14 @@ def _test_that_slurmdbd_is_running(remote_command_executor): assert_that(_is_accounting_enabled(remote_command_executor)).is_true() +def _test_slurm_accounting_password(remote_command_executor): + storage_pass = remote_command_executor.run_remote_command( + "sudo grep StoragePass /opt/slurm/etc/slurm_parallelcluster_slurmdbd.conf |" "sed -e 's/StoragePass=//g'", + hide=True, + ).stdout.strip() + assert_that(storage_pass).is_not_equal_to("dummy") + + @pytest.mark.usefixtures("os", "instance", "scheduler") def test_slurm_accounting( region, @@ -161,6 +169,17 @@ def test_slurm_accounting( 
_test_require_server_identity(remote_command_executor, test_resources_dir, region) _test_jobs_get_recorded(scheduler_commands) + # Update the queues to check that bug with the Slurm Accounting database server password + # is fixed (see https://github.com/aws/aws-parallelcluster/issues/5151 ) + updated_config_file = pcluster_config_reader( + config_file="pcluster.config.update.yaml", + public_subnet_id=public_subnet_id, + private_subnet_id=private_subnet_id, + **config_params, + ) + cluster.update(str(updated_config_file), force_update="true") + _test_slurm_accounting_password(remote_command_executor) + @pytest.mark.usefixtures("os", "instance", "scheduler") def test_slurm_accounting_disabled_to_enabled_update( diff --git a/tests/integration-tests/tests/schedulers/test_slurm_accounting/test_slurm_accounting/pcluster.config.update.yaml b/tests/integration-tests/tests/schedulers/test_slurm_accounting/test_slurm_accounting/pcluster.config.update.yaml new file mode 100644 index 0000000000..2441571382 --- /dev/null +++ b/tests/integration-tests/tests/schedulers/test_slurm_accounting/test_slurm_accounting/pcluster.config.update.yaml @@ -0,0 +1,41 @@ +Image: + Os: {{ os }} +HeadNode: + InstanceType: {{ instance }} + Networking: + SubnetId: {{ public_subnet_id }} + AdditionalSecurityGroups: + - {{ database_client_security_group }} + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AmazonSSMFullAccess + Ssh: + KeyName: {{ key_name }} + Imds: + Secured: {{ imds_secured }} +Scheduling: + Scheduler: {{ scheduler }} + SlurmSettings: + Database: + Uri: {{ database_host }} + UserName: {{ database_admin_user }} + PasswordSecretArn: {{ database_secret_arn }} + SlurmQueues: + - Name: compute + ComputeResources: + - Name: cit + Instances: + - InstanceType: {{ instance }} + MinCount: 0 + MaxCount: 12 + Networking: + SubnetIds: + - {{ private_subnet_id }} + Iam: + AdditionalIamPolicies: + - Policy: arn:aws:iam::aws:policy/AmazonSSMFullAccess +Monitoring: + Logs: + CloudWatch: + Enabled: true + RetentionInDays: 14 diff --git a/tests/integration-tests/tests/storage/kms_key_factory.py b/tests/integration-tests/tests/storage/kms_key_factory.py index 7e4093ce1c..aa1536198f 100644 --- a/tests/integration-tests/tests/storage/kms_key_factory.py +++ b/tests/integration-tests/tests/storage/kms_key_factory.py @@ -8,8 +8,9 @@ import pkg_resources from jinja2 import FileSystemLoader from jinja2.sandbox import SandboxedEnvironment +from utils import get_arn_partition -from tests.common.utils import get_sts_endpoint +from tests.common.utils import get_aws_domain, get_sts_endpoint class KMSKeyFactory: @@ -64,10 +65,8 @@ def _create_role(self, region): # Create the iam role logging.info("creating iam role {0} for creating KMS key...".format(iam_role_name)) - self.partition = next( - ("aws-" + partition for partition in ["us-gov", "cn"] if self.region.startswith(partition)), "aws" - ) - domain_suffix = ".cn" if self.partition == "aws-cn" else "" + self.partition = get_arn_partition(region) + aws_domain = get_aws_domain(self.region) # Add EC2 as trust entity of the IAM role trust_relationship_policy_ec2 = { @@ -75,7 +74,7 @@ def _create_role(self, region): "Statement": [ { "Effect": "Allow", - "Principal": {"Service": "ec2.amazonaws.com{0}".format(domain_suffix)}, + "Principal": {"Service": "ec2.{0}".format(aws_domain)}, "Action": "sts:AssumeRole", } ], diff --git a/tests/integration-tests/tests/storage/storage_common.py b/tests/integration-tests/tests/storage/storage_common.py index ce513df6f9..6348472529 100644 
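Editor's note: the `kms_key_factory` change above delegates partition and DNS-suffix handling to the shared `get_arn_partition` and `get_aws_domain` helpers, so the EC2 trust principal becomes `ec2.<domain>` in every partition. Neither helper's body is in this diff; the mapping below is only an assumed approximation for illustration.

```python
# Assumed approximation of the helpers referenced above; the real
# implementations live elsewhere in the test framework.
def get_arn_partition(region):
    if region.startswith("cn-"):
        return "aws-cn"
    if region.startswith("us-gov-"):
        return "aws-us-gov"
    if region.startswith("us-isob-"):
        return "aws-iso-b"
    if region.startswith("us-iso-"):
        return "aws-iso"
    return "aws"

def get_aws_domain(region):
    if region.startswith("cn-"):
        return "amazonaws.com.cn"
    if region.startswith("us-isob-"):
        return "sc2s.sgov.gov"
    if region.startswith("us-iso-"):
        return "c2s.ic.gov"
    return "amazonaws.com"

print("ec2." + get_aws_domain("cn-north-1"))     # ec2.amazonaws.com.cn
print("ec2." + get_aws_domain("us-gov-west-1"))  # ec2.amazonaws.com
```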
--- a/tests/integration-tests/tests/storage/storage_common.py +++ b/tests/integration-tests/tests/storage/storage_common.py @@ -352,9 +352,9 @@ def assert_subnet_az_relations_from_config( if expected_in_same_az: assert_that(set(cluster_avail_zones)).is_length(1) # If caller does not expect same az, we expect more availability zones. - elif "-iso" in region: - # For isolated regions, we only impose a weak check to make sure there are two or more availability zones. - assert_that(len(set(cluster_avail_zones))).is_greater_than_or_equal_to(2) + elif region == "us-isob-east-1": + # us-isob-east-1 provides 2 availability zones. + assert_that(len(set(cluster_avail_zones))).is_equal_to(2) else: # For other regions, we impose a strong check to make sure each subnet is in a different availability zone. assert_that(len(set(cluster_avail_zones))).is_equal_to(len(cluster_avail_zones)) diff --git a/tests/integration-tests/tests/storage/test_ebs.py b/tests/integration-tests/tests/storage/test_ebs.py index 80c137ef6c..fd4d6dde28 100644 --- a/tests/integration-tests/tests/storage/test_ebs.py +++ b/tests/integration-tests/tests/storage/test_ebs.py @@ -180,7 +180,7 @@ def test_ebs_existing( remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = scheduler_commands_factory(remote_command_executor) existing_mount_dir = "/" + existing_mount_dir - test_ebs_correctly_mounted(remote_command_executor, existing_mount_dir, volume_size="9.8") + test_ebs_correctly_mounted(remote_command_executor, existing_mount_dir, volume_size="9.[7,8]") _test_ebs_correctly_shared(remote_command_executor, existing_mount_dir, scheduler_commands) # Checks for test data result = remote_command_executor.run_remote_command("cat {}/test.txt".format(existing_mount_dir)) diff --git a/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.yaml b/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.yaml index 15ed696460..fd2570fc8a 100644 --- a/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.yaml +++ b/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.yaml @@ -9,9 +9,13 @@ HeadNode: LocalStorage: RootVolume: Encrypted: false # Test turning off root volume encryption + {% if "-iso" in region %} + VolumeType: gp2 + {% else %} VolumeType: gp3 - Iops: 3400 Throughput: 135 + Iops: 3400 + {% endif %} Imds: Secured: {{ imds_secured }} Scheduling: @@ -23,9 +27,13 @@ Scheduling: LocalStorage: RootVolume: Encrypted: false # Test turning off root volume encryption + {% if "-iso" in region %} + VolumeType: gp2 + {% else %} VolumeType: gp3 - Iops: 3200 Throughput: 130 + Iops: 3200 + {% endif %} {% endif %} ComputeResources: - Name: compute-resource-0 @@ -48,9 +56,13 @@ Scheduling: LocalStorage: RootVolume: Encrypted: false + {% if "-iso" in region %} + VolumeType: gp2 + {% else %} VolumeType: gp3 - Iops: 3200 Throughput: 130 + Iops: 3200 + {% endif %} ComputeResources: - Name: compute-resource-0 Instances: @@ -66,11 +78,15 @@ SharedStorage: Name: ebs1 StorageType: Ebs EbsSettings: - Iops: 3200 Size: {{ volume_sizes[0] }} - VolumeType: gp3 Encrypted: true + {% if "-iso" in region %} + VolumeType: gp2 + {% else %} + VolumeType: gp3 Throughput: 130 + Iops: 3200 + {% endif %} - MountDir: {{ mount_dirs[1] }} Name: ebs2 StorageType: Ebs @@ -82,9 +98,13 @@ SharedStorage: Name: ebs3 StorageType: Ebs EbsSettings: - Iops: 150 Size: {{ volume_sizes[2] }} + {% if "-iso" in region %} + VolumeType: gp2 + {% else %} VolumeType: io2 + 
Iops: 150 + {% endif %} - MountDir: {{ mount_dirs[3] }} Name: ebs4 StorageType: Ebs diff --git a/tests/integration-tests/tests/tags/test_tag_propagation.py b/tests/integration-tests/tests/tags/test_tag_propagation.py index e9814c8100..9494f6c8af 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation.py +++ b/tests/integration-tests/tests/tags/test_tag_propagation.py @@ -14,6 +14,7 @@ import pytest from assertpy import assert_that +from remote_command_executor import RemoteCommandExecutor from retrying import retry from tags_utils import ( convert_tags_dicts_to_tags_list, @@ -26,6 +27,7 @@ ) from time_utils import minutes, seconds +from tests.common.schedulers_common import SlurmCommands from tests.common.utils import get_installed_parallelcluster_version @@ -64,6 +66,9 @@ def test_tag_propagation(pcluster_config_reader, clusters_factory, scheduler, os # Checks for tag propagation _check_tag_propagation(cluster, scheduler, os, volume_name) + if scheduler == "slurm": + _test_queue_and_compute_resources_tags(cluster, pcluster_config_reader, scheduler, os, volume_name) + @retry(wait_fixed=seconds(20), stop_max_delay=minutes(5)) def _wait_for_compute_fleet_start(cluster): @@ -71,8 +76,24 @@ def _wait_for_compute_fleet_start(cluster): assert_that(compute_nodes).is_length(1) -def _check_tag_propagation(cluster, scheduler, os, volume_name): - config_file_tags = {"ConfigFileTag": "ConfigFileTagValue"} +def _check_tag_propagation(cluster, scheduler, os, volume_name, queue_tags=None, compute_resource_tags=None): + config_file_tags = { + "ConfigFileTag": "ConfigFileTagValue", + "QueueOverrideTag": "ClusterLevelValue", + "ComputeOverrideTag": "ClusterLevelValue", + } + if not queue_tags: + queue_tags = { + "QueueTag": "QueueValue", + "QueueOverrideTag": "QueueLevelValue", + "ComputeOverrideTag": "QueueLevelValue", + } + if not compute_resource_tags: + compute_resource_tags = { + "ComputeResourceTag": "ComputeResourceValue", + "ComputeOverrideTag": "ComputeLevelValue", + } + version_tags = {"parallelcluster:version": get_installed_parallelcluster_version()} cluster_name_tags = {"parallelcluster:cluster-name": cluster.name} test_cases = [ @@ -101,7 +122,7 @@ def _check_tag_propagation(cluster, scheduler, os, volume_name): "expected_tags": ( cluster_name_tags, {"Name": "Compute", "parallelcluster:node-type": "Compute"}, - config_file_tags, + {**config_file_tags, **queue_tags, **compute_resource_tags}, ), "skip": scheduler == "awsbatch", }, @@ -111,7 +132,7 @@ def _check_tag_propagation(cluster, scheduler, os, volume_name): "expected_tags": ( cluster_name_tags, {"parallelcluster:node-type": "Compute"}, - config_file_tags if scheduler == "slurm" else {}, + {**config_file_tags, **queue_tags, **compute_resource_tags}, ), "tag_getter_kwargs": {"cluster": cluster, "os": os}, "skip": scheduler == "awsbatch", @@ -135,3 +156,37 @@ def _check_tag_propagation(cluster, scheduler, os, volume_name): observed_tags = tag_getter(**tag_getter_args) expected_tags = test_case["expected_tags"] assert_that(observed_tags).contains(*convert_tags_dicts_to_tags_list(expected_tags)) + + +def _test_queue_and_compute_resources_tags(cluster, pcluster_config_reader, scheduler, os, volume_name): + # Test update queue level tags and compute resource level tags with queue update strategy + command_executor = RemoteCommandExecutor(cluster) + slurm_commands = SlurmCommands(command_executor) + result = slurm_commands.submit_command("sleep infinity", constraint="static") + job_id = 
slurm_commands.assert_job_submitted(result.stdout) + slurm_commands.wait_job_running(job_id) + + # Updates cluster with new configuration + updated_cluster_config = pcluster_config_reader( + config_file="pcluster.config.update.queue_update.yaml", volume_name=volume_name + ) + cluster.update(str(updated_cluster_config)) + + slurm_commands.assert_job_state(job_id, "RUNNING") + # check nodes still have old tags before replacement + _check_tag_propagation(cluster, scheduler, os, volume_name) + + # requeue job to launch new instances for nodes + command_executor.run_remote_command(f"scontrol requeue {job_id}") + slurm_commands.wait_job_running(job_id) + # check new instances have new tags + new_queue_tags = { + "QueueTagUpdate": "QueueValueUpdate", + "QueueOverrideTag": "QueueLevelValueUpdate", + "ComputeOverrideTag": "QueueLevelValueUpdate", + } + new_compute_resource_tags = { + "ComputeResourceTagUpdate": "ComputeResourceValueUpdate", + "ComputeOverrideTag": "ComputeLevelValueUpdate", + } + _check_tag_propagation(cluster, scheduler, os, volume_name, new_queue_tags, new_compute_resource_tags) diff --git a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml new file mode 100644 index 0000000000..9e88424cc2 --- /dev/null +++ b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.queue_update.yaml @@ -0,0 +1,47 @@ +Image: + Os: {{ os }} +Tags: + - Key: ConfigFileTag + Value: ConfigFileTagValue + - Key: QueueOverrideTag + Value: ClusterLevelValue + - Key: ComputeOverrideTag + Value: ClusterLevelValue +HeadNode: + InstanceType: {{ instance }} + Networking: + SubnetId: {{ public_subnet_id }} + Ssh: + KeyName: {{ key_name }} + Imds: + Secured: {{ imds_secured }} +Scheduling: + Scheduler: {{ scheduler }} + SlurmSettings: + QueueUpdateStrategy: DRAIN + SlurmQueues: + - Name: queue-0 + ComputeResources: + - Name: compute-resource-1 + Instances: + - InstanceType: {{ instance }} + MinCount: 1 + Tags: + - Key: ComputeResourceTagUpdate + Value: ComputeResourceValueUpdate + - Key: ComputeOverrideTag + Value: ComputeLevelValueUpdate + Networking: + SubnetIds: + - {{ private_subnet_id }} + Tags: + - Key: QueueTagUpdate + Value: QueueValueUpdate + - Key: QueueOverrideTag + Value: QueueLevelValueUpdate + - Key: ComputeOverrideTag + Value: QueueLevelValueUpdate +SharedStorage: + - MountDir: /shared + Name: {{ volume_name }} + StorageType: Ebs diff --git a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml index 5f5c2b46ea..dbc23b5a73 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml +++ b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.update.yaml @@ -3,6 +3,10 @@ Image: Tags: - Key: ConfigFileTag Value: ConfigFileTagValue + - Key: QueueOverrideTag + Value: ClusterLevelValue + - Key: ComputeOverrideTag + Value: ClusterLevelValue HeadNode: InstanceType: {{ instance }} Networking: @@ -13,23 +17,42 @@ HeadNode: Secured: {{ imds_secured }} Scheduling: Scheduler: {{ scheduler }} - {% if scheduler == "awsbatch" %}AwsBatchQueues:{% else %}SlurmQueues:{% endif %} - - Name: queue-1 + {% if scheduler == "awsbatch" %} + 
AwsBatchQueues: + - Name: queue-0 ComputeResources: - Name: compute-resource-1 - {% if scheduler == "awsbatch" %} InstanceTypes: - {{ instance }} MinvCpus: 4 DesiredvCpus: 4 - {% else %} + Networking: + SubnetIds: + - {{ private_subnet_id }} + {% else %} + SlurmQueues: + - Name: queue-0 + ComputeResources: + - Name: compute-resource-1 Instances: - InstanceType: {{ instance }} MinCount: 1 - {% endif %} + Tags: + - Key: ComputeResourceTag + Value: ComputeResourceValue + - Key: ComputeOverrideTag + Value: ComputeLevelValue Networking: SubnetIds: - {{ private_subnet_id }} + Tags: + - Key: QueueTag + Value: QueueValue + - Key: QueueOverrideTag + Value: QueueLevelValue + - Key: ComputeOverrideTag + Value: QueueLevelValue + {% endif %} SharedStorage: - MountDir: /shared Name: {{ volume_name }} diff --git a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.yaml b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.yaml index 7cf64ce519..2fefaf77f2 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.yaml +++ b/tests/integration-tests/tests/tags/test_tag_propagation/test_tag_propagation/pcluster.config.yaml @@ -3,6 +3,10 @@ Image: Tags: - Key: ConfigFileTag Value: ConfigFileTagValue + - Key: QueueOverrideTag + Value: ClusterLevelValue + - Key: ComputeOverrideTag + Value: ClusterLevelValue HeadNode: InstanceType: {{ instance }} Networking: @@ -13,23 +17,42 @@ HeadNode: Secured: {{ imds_secured }} Scheduling: Scheduler: {{ scheduler }} - {% if scheduler == "awsbatch" %}AwsBatchQueues:{% else %}SlurmQueues:{% endif %} + {% if scheduler == "awsbatch" %} + AwsBatchQueues: - Name: queue-0 ComputeResources: - Name: compute-resource-0 - {% if scheduler == "awsbatch" %} InstanceTypes: - {{ instance }} MinvCpus: 4 DesiredvCpus: 4 - {% else %} + Networking: + SubnetIds: + - {{ private_subnet_id }} + {% else %} + SlurmQueues: + - Name: queue-0 + ComputeResources: + - Name: compute-resource-0 Instances: - InstanceType: {{ instance }} MinCount: 1 - {% endif %} + Tags: + - Key: ComputeResourceTag + Value: ComputeResourceValue + - Key: ComputeOverrideTag + Value: ComputeLevelValue Networking: SubnetIds: - {{ private_subnet_id }} + Tags: + - Key: QueueTag + Value: QueueValue + - Key: QueueOverrideTag + Value: QueueLevelValue + - Key: ComputeOverrideTag + Value: QueueLevelValue + {% endif %} SharedStorage: - MountDir: /shared Name: {{ volume_name }} diff --git a/tests/integration-tests/tests/trainium/test_trainium.py b/tests/integration-tests/tests/trainium/test_trainium.py index 10ecd74b30..1ff6b9dc84 100644 --- a/tests/integration-tests/tests/trainium/test_trainium.py +++ b/tests/integration-tests/tests/trainium/test_trainium.py @@ -38,6 +38,8 @@ def test_trainium( # _test_allreduce_single_node(test_datadir, remote_command_executor, scheduler_commands) _test_ccl_two_nodes(test_datadir, remote_command_executor, scheduler_commands) + _test_primary_ip(test_datadir, remote_command_executor, scheduler_commands) + def _test_allreduce_single_node(test_datadir, remote_command_executor, scheduler_commands): result = scheduler_commands.submit_script(str(test_datadir / "neuron-allreduce.sh"), partition="queue-trn2") @@ -64,3 +66,14 @@ def _test_ccl_two_nodes(test_datadir, remote_command_executor, scheduler_command print(result.stdout) assert_that(result.stdout).contains("CCL(1)", "CCL(50)", "CCL(99)", "CCL(100)") + + +def _test_primary_ip(test_datadir, remote_command_executor, 
scheduler_commands): + result = scheduler_commands.submit_script(str(test_datadir / "test-primary-ip.sh"), partition="queue-trn32") + job_id = scheduler_commands.assert_job_submitted(result.stdout) + scheduler_commands.wait_job_completed(job_id) + scheduler_commands.assert_job_succeeded(job_id) + result = remote_command_executor.run_remote_command("cat output-primary-ip.txt") + + print(result.stdout) + assert_that(result.stdout).contains("PASSED") diff --git a/tests/integration-tests/tests/trainium/test_trainium/test_trainium/test-primary-ip.sh b/tests/integration-tests/tests/trainium/test_trainium/test_trainium/test-primary-ip.sh new file mode 100644 index 0000000000..42c523121f --- /dev/null +++ b/tests/integration-tests/tests/trainium/test_trainium/test_trainium/test-primary-ip.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Test the alignment of the Route 53 IP and the the host IP. + +OUTPUT_FILE="output-primary-ip.txt" + +# Get DNS Server +DNS_SERVER="" +grep Ubuntu /etc/issue &>/dev/null && DNS_SERVER=$(systemd-resolve --status | grep "DNS Servers" | awk '{print $3}' | sort -r | head -1) + +# Determine expected entry in /etc/hosts +IP="$(host $HOSTNAME $DNS_SERVER | tail -1 | awk '{print $4}')" +DOMAIN=$(jq .cluster.dns_domain /etc/chef/dna.json | tr -d \") +EXPECTED="$IP $HOSTNAME.${DOMAIN::-1} $HOSTNAME" +echo "Expected entry in /etc/hosts: $EXPECTED" | tee $OUTPUT_FILE + +# Retrieve actual entry in /etc/hosts +ACTUAL="$(grep "$HOSTNAME" /etc/hosts)" +echo "Actual entry in /etc/hosts: $ACTUAL" | tee -a $OUTPUT_FILE + +# Check +if [[ "$ACTUAL" == "$EXPECTED" ]]; then + echo "PASSED" | tee -a $OUTPUT_FILE +else + echo "ERROR: Route53 IP does not match host IP" | tee -a $OUTPUT_FILE +fi diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index 9ade467119..2e323fab33 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -90,7 +90,7 @@ def test_update_slurm(region, pcluster_config_reader, s3_bucket_factory, cluster "instance_type": "c5.xlarge", }, { - "instance_type": "c5a.xlarge", + "instance_type": "c5n.xlarge", }, { "instance_type": "c5d.xlarge", @@ -165,7 +165,6 @@ def test_update_slurm(region, pcluster_config_reader, s3_bucket_factory, cluster retry(wait_fixed=seconds(20), stop_max_delay=minutes(5))(assert_initial_conditions)( slurm_commands, 2, 0, partition="queue1" ) - updated_queues_config = { "queue1": { "compute_resources": { @@ -175,7 +174,7 @@ def test_update_slurm(region, pcluster_config_reader, s3_bucket_factory, cluster "instance_type": "c5.xlarge", }, { - "instance_type": "c5a.xlarge", + "instance_type": "c5n.xlarge", }, { "instance_type": "c5d.xlarge", @@ -209,7 +208,7 @@ def test_update_slurm(region, pcluster_config_reader, s3_bucket_factory, cluster "enable_efa": False, }, }, - "compute_type": "spot", + "compute_type": "ondemand" if "us-iso" in region else "spot", }, "queue2": { "compute_resources": { @@ -533,7 +532,7 @@ def test_update_instance_list( submit_command_args={"command": "sleep 1000", "nodes": 1, "other_options": "--exclusive"} ) # Check instance type is the expected for min count - _check_instance_type(ec2, instances, "c5.xlarge") + _check_instance_type(ec2, instances, "c5d.xlarge") # Update cluster with new configuration, adding new instance type with lower price updated_config_file = pcluster_config_reader(config_file="pcluster.config.update.yaml") @@ -550,8 +549,8 @@ def test_update_instance_list( new_instances = 
cluster.get_cluster_instance_ids(node_type="Compute") logging.info(new_instances) new_instances.remove(instances[0]) - # Check new instance type is the expected one - _check_instance_type(ec2, new_instances, "c5a.xlarge") + # Check new instance type is the expected one, i.e. the one with lower price. + _check_instance_type(ec2, new_instances, "c5.xlarge") # Update cluster removing instance type from the list updated_config_file = pcluster_config_reader(config_file="pcluster.config.update.remove.yaml") @@ -661,11 +660,23 @@ def test_queue_parameters_update( # Create cluster with initial configuration initial_compute_root_volume_size = 35 updated_compute_root_volume_size = 40 - pcluster_ami_id = retrieve_latest_ami(region, os, ami_type="pcluster", request=request) + # If you are running this test in your personal account, then you must have + # ParallelCluster AMIs following the official naming convention + # and set allow_private_ami to True. + # We allow private AMIs also in US isolated regions to facilitate testing. + allow_private_ami = True if "us-iso" in region else False + pcluster_ami_id = retrieve_latest_ami( + region, os, ami_type="pcluster", request=request, allow_private_ami=allow_private_ami + ) + + logging.info(f"Latest AMI retrieved: {pcluster_ami_id}") + pcluster_copy_ami_id = ami_copy( pcluster_ami_id, "-".join(["test", "update", "computenode", generate_random_string()]) ) + logging.info(f"Copy of the latest AMI {pcluster_ami_id}: {pcluster_copy_ami_id}") + init_config_file = pcluster_config_reader( global_custom_ami=pcluster_ami_id, initial_compute_root_volume_size=initial_compute_root_volume_size ) @@ -731,6 +742,7 @@ def _test_update_without_queue_strategy( def _check_queue_ami(cluster, ec2, ami, queue_name): """Check if the ami of the queue instances are expected""" + logging.info(f"Checking that queue {queue_name} is using the expected AMI {ami}") instances = cluster.get_cluster_instance_ids(node_type="Compute", queue_name=queue_name) _check_instance_ami_id(ec2, instances, ami) @@ -832,7 +844,10 @@ def _test_update_queue_strategy_with_running_job( _check_queue_ami(cluster, ec2, pcluster_ami_id, "queue1") queue2_nodes = scheduler_commands.get_compute_nodes("queue2", all_nodes=True) - # assert queue2 node state are in expected status corresponding to the queue strategy + + logging.info( + f"Checking queue2 node state are in expected status corresponding to the queue strategy {queue_update_strategy}" + ) if queue_update_strategy == "DRAIN": scheduler_commands.assert_job_state(queue2_job_id, "RUNNING") _check_queue_ami(cluster, ec2, pcluster_ami_id, "queue2") @@ -846,7 +861,8 @@ def _test_update_queue_strategy_with_running_job( scheduler_commands.wait_job_running(queue2_job_id) # cancel job in queue1 scheduler_commands.cancel_job(queue1_job_id) - # check the new launching instances are using new amis + + logging.info("Checking that new compute nodes are using the new AMI") _check_queue_ami(cluster, ec2, pcluster_ami_id, "queue1") _check_queue_ami(cluster, ec2, pcluster_copy_ami_id, "queue2") assert_compute_node_states(scheduler_commands, queue1_nodes, "idle") diff --git a/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_drain.yaml b/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_drain.yaml index 78305b9df0..b42b7f1dc5 100644 --- a/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_drain.yaml +++ 
diff --git a/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_drain.yaml b/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_drain.yaml
index 78305b9df0..b42b7f1dc5 100644
--- a/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_drain.yaml
+++ b/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_drain.yaml
@@ -57,7 +57,7 @@ SharedStorage:
     Name: /manage-ebs
     StorageType: Ebs
     EbsSettings:
-      VolumeType: gp3
+      VolumeType: {% if "-iso" in region %}gp2{% else %}gp3{% endif %}
       DeletionPolicy: {{ new_ebs_deletion_policy }}
   - MountDir: {{ existing_ebs_mount_dir }}
     Name: existing_ebs
diff --git a/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_rollback.yaml b/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_rollback.yaml
index 9e3b67036a..bc511449a3 100644
--- a/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_rollback.yaml
+++ b/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_rollback.yaml
@@ -45,7 +45,7 @@ SharedStorage:
     Name: {{ new_ebs_mount_dir }}
     StorageType: Ebs
     EbsSettings:
-      VolumeType: gp3
+      VolumeType: {% if "-iso" in region %}gp2{% else %}gp3{% endif %}
       DeletionPolicy: Delete
   - MountDir: {{ problematic_ebs_mount_dir }}
     Name: {{ problematic_ebs_mount_dir }}
diff --git a/tests/integration-tests/tests/update/test_update/test_update_instance_list/pcluster.config.update.remove.yaml b/tests/integration-tests/tests/update/test_update/test_update_instance_list/pcluster.config.update.remove.yaml
index e2610348cc..6b5779222e 100644
--- a/tests/integration-tests/tests/update/test_update/test_update_instance_list/pcluster.config.update.remove.yaml
+++ b/tests/integration-tests/tests/update/test_update/test_update_instance_list/pcluster.config.update.remove.yaml
@@ -13,9 +13,9 @@ Scheduling:
       ComputeResources:
         - Name: queue1-i1
           Instances:
-            - InstanceType: c5a.xlarge
+            - InstanceType: c5.xlarge
           MinCount: 1
           MaxCount: 2
       Networking:
         SubnetIds:
-          - {{ private_subnet_id }}
\ No newline at end of file
+          - {{ private_subnet_id }}
diff --git a/tests/integration-tests/tests/update/test_update/test_update_instance_list/pcluster.config.update.yaml b/tests/integration-tests/tests/update/test_update/test_update_instance_list/pcluster.config.update.yaml
index f0c775a43b..681244a247 100644
--- a/tests/integration-tests/tests/update/test_update/test_update_instance_list/pcluster.config.update.yaml
+++ b/tests/integration-tests/tests/update/test_update/test_update_instance_list/pcluster.config.update.yaml
@@ -13,10 +13,10 @@ Scheduling:
       ComputeResources:
         - Name: queue1-i1
           Instances:
+            - InstanceType: c5d.xlarge
             - InstanceType: c5.xlarge
-            - InstanceType: c5a.xlarge
           MinCount: 1
           MaxCount: 2
       Networking:
         SubnetIds:
-          - {{ private_subnet_id }}
\ No newline at end of file
+          - {{ private_subnet_id }}
diff --git a/tests/integration-tests/tests/update/test_update/test_update_instance_list/pcluster.config.yaml b/tests/integration-tests/tests/update/test_update/test_update_instance_list/pcluster.config.yaml
index bccc551b35..96dd8b4b99 100644
--- a/tests/integration-tests/tests/update/test_update/test_update_instance_list/pcluster.config.yaml
+++ b/tests/integration-tests/tests/update/test_update/test_update_instance_list/pcluster.config.yaml
@@ -13,9 +13,9 @@ Scheduling:
       ComputeResources:
         - Name: queue1-i1
           Instances:
-            - InstanceType: c5.xlarge
+            - InstanceType: c5d.xlarge
           MinCount: 1
           MaxCount: 2
       Networking:
         SubnetIds:
-          - {{ private_subnet_id }}
\ No newline at end of file
+          - {{ private_subnet_id }}
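The VolumeType values in the test configurations above are Jinja expressions that the test configuration reader resolves before the cluster is created, so isolated (us-iso*) regions fall back to gp2 volumes. A rough illustration of how such a snippet renders, assuming the jinja2 package is available:

    from jinja2 import Template

    snippet = Template('VolumeType: {% if "-iso" in region %}gp2{% else %}gp3{% endif %}')
    print(snippet.render(region="us-isob-east-1"))  # VolumeType: gp2
    print(snippet.render(region="eu-west-1"))       # VolumeType: gp3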
diff --git a/tests/integration-tests/tests/update/test_update/test_update_slurm/pcluster.config.update.yaml b/tests/integration-tests/tests/update/test_update/test_update_slurm/pcluster.config.update.yaml
index be33ee7377..32379c92e9 100644
--- a/tests/integration-tests/tests/update/test_update/test_update_slurm/pcluster.config.update.yaml
+++ b/tests/integration-tests/tests/update/test_update/test_update_slurm/pcluster.config.update.yaml
@@ -42,12 +42,12 @@ Scheduling:
             Script: s3://{{ resource_bucket }}/scripts/updated_postinstall.sh  # Updated parameter value
             Args:
               - DEF  # Updated parameter value
-      CapacityType: SPOT  # Updated parameter value
+      CapacityType: {% if "us-iso" in region %}ONDEMAND{% else %}SPOT{% endif %}  # Updated parameter value
       ComputeResources:
         - Name: queue1-i1
           Instances:
             - InstanceType: c5.xlarge
-            - InstanceType: c5a.xlarge
+            - InstanceType: c5n.xlarge
             - InstanceType: c5d.xlarge
           MinCount: 2  # Increased parameter value
           MaxCount: 4  # Increased parameter value
diff --git a/tests/integration-tests/tests/update/test_update/test_update_slurm/pcluster.config.yaml b/tests/integration-tests/tests/update/test_update/test_update_slurm/pcluster.config.yaml
index acbfe17a81..51b04c1d2f 100644
--- a/tests/integration-tests/tests/update/test_update/test_update_slurm/pcluster.config.yaml
+++ b/tests/integration-tests/tests/update/test_update/test_update_slurm/pcluster.config.yaml
@@ -44,7 +44,7 @@ Scheduling:
         - Name: queue1-i1
           Instances:
             - InstanceType: c5.xlarge
-            - InstanceType: c5a.xlarge
+            - InstanceType: c5n.xlarge
             - InstanceType: c5d.xlarge
           MinCount: 1
           MaxCount: 2
diff --git a/tests/integration-tests/utils.py b/tests/integration-tests/utils.py
index ed21c3e1d5..9cdf756ad5 100644
--- a/tests/integration-tests/utils.py
+++ b/tests/integration-tests/utils.py
@@ -18,6 +18,7 @@
 import socket
 import string
 import subprocess
+from datetime import datetime, timedelta
 from hashlib import sha1

 import boto3
@@ -81,7 +82,9 @@ class StackError(BaseException):

     def __init__(self, message, stack_events=None):
         message = message if message else "StackError has been raised"
-        self.message = _format_stack_error(message, stack_events=stack_events)
+        _stack_events = list(stack_events)  # resolve all events so that we can return them
+        self.message = _format_stack_error(message, stack_events=_stack_events)
+        self.stack_events = _stack_events

     def __str__(self):
         return f"StackError: {self.message}"
@@ -270,8 +273,17 @@ def get_cfn_resources(stack_name, region=None):

 def retrieve_cfn_resources(stack_name, region):
     """Retrieve CloudFormation Stack Resources from a given stack."""
     resources = {}
-    for resource in get_cfn_resources(stack_name, region):
-        resources[resource.get("LogicalResourceId")] = resource.get("PhysicalResourceId")
+
+    def _retrieve_cfn_resources(stack_name, region):
+        for resource in get_cfn_resources(stack_name, region):
+            if resource.get("ResourceType") == "AWS::CloudFormation::Stack":
+                nested_stack_arn = resource.get("PhysicalResourceId")
+                nested_stack_name = get_stack_name_from_stack_arn(nested_stack_arn)
+                _retrieve_cfn_resources(nested_stack_name, region)
+            else:
+                resources[resource.get("LogicalResourceId")] = resource.get("PhysicalResourceId")
+
+    _retrieve_cfn_resources(stack_name, region)
     return resources
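The recursion above walks into nested stacks by turning each child stack's PhysicalResourceId (its ARN) back into a stack name through get_stack_name_from_stack_arn, which is added further down in this file. A quick illustration of that parsing, with a made-up ARN:

    # Stack ARNs look like arn:aws:cloudformation:<region>:<account>:stack/<stack-name>/<id>,
    # so the stack name is the second-to-last "/"-separated field (values below are made up).
    arn = "arn:aws:cloudformation:us-east-1:123456789012:stack/cluster-ComputeFleet-nested/1a2b3c4d"
    print(arn.rsplit("/", 2)[-2])  # cluster-ComputeFleet-nested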
@@ -588,11 +600,15 @@ def get_metadata(metadata_path, raise_error=True):
     metadata_value = None
     try:
         metadata_base_url = "http://169.254.169.254/latest"
-        token = requests.put(f"{metadata_base_url}/api/token", headers={"X-aws-ec2-metadata-token-ttl-seconds": "300"})
+        token = requests.put(
+            f"{metadata_base_url}/api/token", headers={"X-aws-ec2-metadata-token-ttl-seconds": "300"}, timeout=3
+        )
         headers = {}
         if token.status_code == requests.codes.ok:
             headers["X-aws-ec2-metadata-token"] = token.content
+        elif token.status_code >= 300:
+            raise Exception("Imds not reachable")
         metadata_value = requests.get(f"{metadata_base_url}/meta-data/{metadata_path}", headers=headers).text
     except Exception as e:
         error_msg = f"Unable to get {metadata_path} metadata. Failed with exception: {e}"
@@ -646,6 +662,17 @@ def get_arn_partition(region):
     )


+def get_stack_name_from_stack_arn(arn):
+    """
+    Return the Stack Name from a Stack ARN
+    E.g.
+    Stack ARN: "arn:aws:cloudformation:::stack//"
+    :param arn:
+    :return:
+    """
+    return arn.rsplit("/", 2)[-2] if arn else ""
+
+
 def check_pcluster_list_cluster_log_streams(cluster, os, expected_log_streams=None):
     """Test pcluster list-cluster-logs functionality and return cfn-init log stream name."""
     logging.info("Testing that pcluster list-cluster-log-streams is working as expected")
@@ -699,3 +726,62 @@ def create_hash_suffix(string_to_hash: str):
         if string_to_hash == "HeadNode"
         else sha1(string_to_hash.encode("utf-8")).hexdigest()[:16].capitalize()  # nosec nosemgrep
     )
+
+
+def _generate_metric_data_queries(metric_name, cluster_name):
+    return {
+        "Id": metric_name.lower(),
+        "MetricStat": {
+            "Metric": {
+                "Namespace": "ParallelCluster",
+                "MetricName": metric_name,
+                "Dimensions": [
+                    {
+                        "Name": "ClusterName",
+                        "Value": cluster_name,
+                    }
+                ],
+            },
+            "Period": 60,
+            "Stat": "Sum",
+        },
+    }
+
+
+def retrieve_metric_data(
+    cluster_name,
+    metric_names,
+    region,
+    collection_time_min=20,
+):
+    """Create Boto3 get_metric_data request and output the results."""
+    metric_queries = [_generate_metric_data_queries(name, cluster_name) for name in metric_names]
+
+    client = boto3.client("cloudwatch", region)
+
+    return client.get_metric_data(
+        MetricDataQueries=metric_queries,
+        StartTime=datetime.now() - timedelta(days=collection_time_min),
+        EndTime=datetime.now() + timedelta(days=collection_time_min),
+        ScanBy="TimestampDescending",
+    )
+
+
+def assert_metrics_has_data(response):
+    """
+    Iterates through get_metric_data query output and check for desired results,
+    output in MetricDataResults format which is described here
+    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/cloudwatch.html#CloudWatch.Client.get_metric_data
+    """
+    list_of_responses = response["MetricDataResults"]
+    for response in list_of_responses:
+        assert_that(response["Values"]).is_not_empty()
+        assert_that(max(response["Values"])).is_greater_than(0)
+
+
+@retry(stop_max_attempt_number=8, wait_fixed=minutes(2))
+def test_cluster_health_metric(metric_names, cluster_name, region):
+    """Test metric value is greater than 0 when the compute node error happens."""
+    logging.info(f"Testing that {metric_names} have data.")
+    response = retrieve_metric_data(cluster_name, metric_names, region)
+    assert_metrics_has_data(response)
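Together, the new helpers above give the integration tests a small polling API over the ParallelCluster CloudWatch namespace. A minimal usage sketch with hypothetical cluster and metric names (real tests pass the cluster under test and the metrics it actually emits):

    # Assumes the helpers above are importable the way existing tests import utils.
    from utils import assert_metrics_has_data, retrieve_metric_data, test_cluster_health_metric

    cluster_name = "integ-tests-cluster"       # hypothetical cluster name
    metric_names = ["MaxDynamicNodeIdleTime"]  # assumed metric name, for illustration only

    response = retrieve_metric_data(cluster_name, metric_names, region="us-east-1")
    assert_metrics_has_data(response)

    # Or poll until data shows up, using the retry-wrapped variant:
    test_cluster_health_metric(metric_names, cluster_name, "us-east-1")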
sed -i "s|parallelcluster/$CURRENT_VERSION|parallelcluster/$NEW_VERSION|g" api/infrastructure/parallelcluster-api.yaml sed -i "s| Version: $CURRENT_VERSION| Version: $NEW_VERSION|g" api/infrastructure/parallelcluster-api.yaml sed -i "s| ShortVersion: $CURRENT_VERSION_SHORT| ShortVersion: $NEW_VERSION_SHORT|g" api/infrastructure/parallelcluster-api.yaml sed -i "s| version: $CURRENT_VERSION_SHORT| version: $NEW_VERSION_SHORT|g" api/spec/openapi/ParallelCluster.openapi.yaml sed -i "s| version: \"$CURRENT_VERSION_SHORT\"| version: \"$NEW_VERSION_SHORT\"|g" api/spec/smithy/model/parallelcluster.smithy + sed -i "s| Version: $CURRENT_VERSION| Version: $NEW_VERSION|g" cloudformation/custom_resource/cluster.yaml + sed -i "s| Version: $CURRENT_VERSION| Version: $NEW_VERSION|g" cloudformation/custom_resource/cluster-1-click.yaml cp "$PC_SUPPORT_DIR/os_$CURRENT_VERSION.json" "$PC_SUPPORT_DIR/os_$NEW_VERSION.json" git add "$PC_SUPPORT_DIR/os_$NEW_VERSION.json" diff --git a/util/create-attribution-doc.sh b/util/create-attribution-doc.sh index 3da342b0e0..950b0a4d4f 100755 --- a/util/create-attribution-doc.sh +++ b/util/create-attribution-doc.sh @@ -7,7 +7,7 @@ append_package_details_to_final_license_file(){ # Function to Append Package Details to the THIRD-PARTY-LICENSES file #Arguments -> 1- Package Name, 2- Package Version, 3- License Type , 4- URL for package, 5,6,7- URL for License # Adding a header to final License file with Package Name, Package Version, License Type , URL for package - echo "\n\n\n$1 \n$2 \n$3 \n$4" >> $final_license_file + echo -e "\n\n\n$1 \n$2 \n$3 \n$4" >> $final_license_file # Appending License curl $5 >> $final_license_file # Adding Dual Licenses if they exist @@ -137,4 +137,4 @@ function main() { create_attribution_doc } -main "$@" +main "$@" \ No newline at end of file