From 7d46cd76cd8a0bc0fe92f385aa8f82dd6a785083 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Tue, 18 Jun 2024 07:10:48 -0700 Subject: [PATCH] Slurm dbd fix Signed-off-by: Hanwen --- .../external-slurmdbd/external-slurmdbd.json | 1148 ++++++++--------- .../external-slurmdbd/resources/user_data.sh | 3 +- 2 files changed, 576 insertions(+), 575 deletions(-) diff --git a/cloudformation/external-slurmdbd/external-slurmdbd.json b/cloudformation/external-slurmdbd/external-slurmdbd.json index 8f81b2fa6b..fa9d1cbd19 100644 --- a/cloudformation/external-slurmdbd/external-slurmdbd.json +++ b/cloudformation/external-slurmdbd/external-slurmdbd.json @@ -1,602 +1,602 @@ { - "Outputs": { - "AccountingClientSecurityGroup": { - "Description": "Security Group ID that allows traffic from the slurmctld to slurmdbd", - "Value": { - "Ref": "SlurmdbdClientSecurityGroup" - } - }, - "SlurmdbdConfigS3BucketName": { - "Description": "S3 Bucket where a copy of the slurmdbd configuration files can be stored and re-used when re-provisioning the slurmdbd instance", - "Value": { - "Ref": "ExternalSlurmdbdS3Bucket" - } - }, - "SlurmdbdPort": { - "Description": "Port used to connect to slurmdbd service", - "Value": { - "Ref": "SlurmdbdPort" - } - }, - "SlurmdbdPrivateIp": { - "Description": "Secondary Private IP Address of the slurmdbd instance", - "Value": { - "Ref": "PrivateIp" - } - }, - "SshClientSecurityGroup": { - "Description": "Security Group ID that allows SSH traffic from the HeadNode to slurmdbd instance", - "Value": { - "Ref": "SSHClientSecurityGroup" - } + "Parameters": { + "VPCId": { + "Type": "String", + "Description": "The VPC to be used for the Slurmdbd stack." + }, + "SubnetId": { + "Type": "AWS::EC2::Subnet::Id", + "Description": "The Subnet to be used for the Slurmdbd stack." + }, + "DBMSUri": { + "Type": "String", + "Description": "DBMS URI for Slurmdbd." + }, + "DBMSUsername": { + "Type": "String", + "Description": "DBMS Username for Slurmdbd." + }, + "DBMSPasswordSecretArn": { + "Type": "String", + "Description": "Secret ARN for DBMS password." + }, + "DBMSDatabaseName": { + "Type": "String", + "Description": "DBMS Database Name for Slurmdbd." + }, + "MungeKeySecretArn": { + "Type": "String", + "Description": "Secret ARN for Munge key." + }, + "CustomCookbookUrl": { + "Type": "String", + "Default": "", + "Description": "URL of the custom Chef Cookbook." + }, + "EnableSlurmdbdSystemService": { + "Type": "String", + "Default": "false", + "AllowedValues": [ + "true", + "false" + ], + "Description": "[Warning] It is not recommended to enable this if the database was created by a different version of SlurmDBD. If the database contains a large number of entries, the SlurmDBD daemon may require tens of minutes to update the database and be unresponsive during this time interval. Before upgrading SlurmDBD it is strongly recommended to make a backup of the database.See Slurm documentation for details: https://slurm.schedmd.com/quickstart_admin.html#upgrade" + }, + "SlurmdbdPort": { + "Type": "Number", + "Default": 6819, + "Description": "The port the slurmdbd service listens to." + }, + "AmiId": { + "Type": "AWS::EC2::Image::Id", + "Description": "The AMI id for the EC2 instance." + }, + "InstanceType": { + "Type": "String", + "Description": "The instance type for the EC2 instance" + }, + "KeyName": { + "Type": "AWS::EC2::KeyPair::KeyName", + "Description": "The SSH key name to access the instance (for management purposes only)" + }, + "PrivateIp": { + "Type": "String", + "Description": "Static private IP address + prefix to assign to the slurmdbd instance" + }, + "PrivatePrefix": { + "Type": "String", + "Description": "Subnet prefix to assign with the private IP to the slurmdbd instance" + }, + "DBMSClientSG": { + "Type": "AWS::EC2::SecurityGroup::Id", + "Description": "DBMS Client Security Group Id" + } + }, + "Resources": { + "SSHServerSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "Properties": { + "GroupDescription": "Allow SSH access to slurmdbd instance (server)", + "VpcId": { + "Ref": "VPCId" } + }, + "Metadata": { + "aws:cdk:path": "ExternalSlurmdbdStack/SSHServerSecurityGroup" + } }, - "Parameters": { - "AmiId": { - "Description": "The AMI id for the EC2 instance.", - "Type": "AWS::EC2::Image::Id" - }, - "CustomCookbookUrl": { - "Default": "", - "Description": "URL of the custom Chef Cookbook.", - "Type": "String" - }, - "DBMSClientSG": { - "Description": "DBMS Client Security Group Id", - "Type": "AWS::EC2::SecurityGroup::Id" - }, - "DBMSDatabaseName": { - "Description": "DBMS Database Name for Slurmdbd.", - "Type": "String" - }, - "DBMSPasswordSecretArn": { - "Description": "Secret ARN for DBMS password.", - "Type": "String" - }, - "DBMSUri": { - "Description": "DBMS URI for Slurmdbd.", - "Type": "String" - }, - "DBMSUsername": { - "Description": "DBMS Username for Slurmdbd.", - "Type": "String" - }, - "EnableSlurmdbdSystemService": { - "AllowedValues": [ - "true", - "false" - ], - "Default": "false", - "Description": "[Warning] It is not recommended to enable this if the Database was created by a different version of SlurmDBD. If the database contains a large number of entries, the SlurmDBD daemon may require tens of minutes to update the database and be unresponsive during this time interval. Before upgrading SlurmDBD it is strongly recommended to make a backup of the database.See Slurm documentation for details: https://slurm.schedmd.com/quickstart_admin.html#upgrade", - "Type": "String" - }, - "InstanceType": { - "Description": "The instance type for the EC2 instance", - "Type": "String" - }, - "KeyName": { - "Description": "The SSH key name to access the instance (for management purposes only)", - "Type": "AWS::EC2::KeyPair::KeyName" - }, - "MungeKeySecretArn": { - "Description": "Secret ARN for Munge key.", - "Type": "String" - }, - "PrivateIp": { - "Description": "Static private IP address + prefix to assign to the slurmdbd instance", - "Type": "String" - }, - "PrivatePrefix": { - "Description": "Subnet prefix to assign with the private IP to the slurmdbd instance", - "Type": "String" - }, - "SlurmdbdPort": { - "Default": 6819, - "Description": "The port the slurmdbd service listens to.", - "Type": "Number" - }, - "SubnetId": { - "Description": "The Subnet to be used for the Slurmdbd stack.", - "Type": "AWS::EC2::Subnet::Id" - }, - "VPCId": { - "Description": "The VPC to be used for the Slurmdbd stack.", - "Type": "String" + "SSHClientSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "Properties": { + "GroupDescription": "Allow SSH access to slurmdbd instance (client)", + "VpcId": { + "Ref": "VPCId" } + }, + "Metadata": { + "aws:cdk:path": "ExternalSlurmdbdStack/SSHClientSecurityGroup" + } }, - "Resources": { - "AllowSSHaccessfromclientSG": { - "Metadata": { - "aws:cdk:path": "ExternalSlurmdbdStack/Allow SSH access from client SG" - }, - "Properties": { - "FromPort": 22, - "GroupId": { - "Ref": "SSHServerSecurityGroup" - }, - "IpProtocol": "tcp", - "SourceSecurityGroupId": { - "Ref": "SSHClientSecurityGroup" - }, - "ToPort": 22 - }, - "Type": "AWS::EC2::SecurityGroupIngress" - }, - "AllowSlurmaccountingtrafficfromtheclusterheadnode": { - "Metadata": { - "aws:cdk:path": "ExternalSlurmdbdStack/Allow Slurm accounting traffic from the cluster head node" - }, - "Properties": { - "FromPort": { - "Ref": "SlurmdbdPort" - }, - "GroupId": { - "Ref": "SlurmdbdServerSecurityGroup" - }, - "IpProtocol": "tcp", - "SourceSecurityGroupId": { - "Ref": "SlurmdbdClientSecurityGroup" - }, - "ToPort": { - "Ref": "SlurmdbdPort" - } - }, - "Type": "AWS::EC2::SecurityGroupIngress" + "AllowSSHaccessfromclientSG": { + "Type": "AWS::EC2::SecurityGroupIngress", + "Properties": { + "FromPort": 22, + "GroupId": { + "Ref": "SSHServerSecurityGroup" + }, + "IpProtocol": "tcp", + "SourceSecurityGroupId": { + "Ref": "SSHClientSecurityGroup" + }, + "ToPort": 22 + }, + "Metadata": { + "aws:cdk:path": "ExternalSlurmdbdStack/Allow SSH access from client SG" + } + }, + "SlurmdbdServerSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "Properties": { + "GroupDescription": "Allow Slurm accounting traffic to the slurmdbd instance (server)", + "VpcId": { + "Ref": "VPCId" + } + }, + "Metadata": { + "aws:cdk:path": "ExternalSlurmdbdStack/SlurmdbdServerSecurityGroup" + } + }, + "SlurmdbdClientSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "Properties": { + "GroupDescription": "Allow Slurm accounting traffic from the cluster head node (client)", + "VpcId": { + "Ref": "VPCId" + } + }, + "Metadata": { + "aws:cdk:path": "ExternalSlurmdbdStack/SlurmdbdClientSecurityGroup" + } + }, + "AllowSlurmaccountingtrafficfromtheclusterheadnode": { + "Type": "AWS::EC2::SecurityGroupIngress", + "Properties": { + "FromPort": { + "Ref": "SlurmdbdPort" }, - "Allowtrafficcomingfromslurmdbdinstance": { - "Metadata": { - "aws:cdk:path": "ExternalSlurmdbdStack/Allow traffic coming from slurmdbd instance" - }, - "Properties": { - "FromPort": 6820, - "GroupId": { - "Ref": "SlurmdbdClientSecurityGroup" - }, - "IpProtocol": "tcp", - "SourceSecurityGroupId": { - "Ref": "SlurmdbdServerSecurityGroup" - }, - "ToPort": 6829 - }, - "Type": "AWS::EC2::SecurityGroupIngress" + "GroupId": { + "Ref": "SlurmdbdServerSecurityGroup" }, - "ExternalSlurmdbdASG": { - "Metadata": { - "aws:cdk:path": "ExternalSlurmdbdStack/External-Slurmdbd-ASG" - }, - "Properties": { - "DesiredCapacity": "1", - "LaunchTemplate": { - "LaunchTemplateId": { - "Ref": "LaunchTemplate" - }, - "Version": { - "Fn::GetAtt": [ - "LaunchTemplate", - "LatestVersionNumber" - ] - } - }, - "MaxSize": "1", - "MinSize": "1", - "VPCZoneIdentifier": [ - { - "Ref": "SubnetId" - } - ] - }, - "Type": "AWS::AutoScaling::AutoScalingGroup" + "IpProtocol": "tcp", + "SourceSecurityGroupId": { + "Ref": "SlurmdbdClientSecurityGroup" }, - "ExternalSlurmdbdInstanceProfile": { - "Metadata": { - "aws:cdk:path": "ExternalSlurmdbdStack/ExternalSlurmdbdInstanceProfile" - }, - "Properties": { - "Roles": [ + "ToPort": { + "Ref": "SlurmdbdPort" + } + }, + "Metadata": { + "aws:cdk:path": "ExternalSlurmdbdStack/Allow Slurm accounting traffic from the cluster head node" + } + }, + "Allowtrafficcomingfromslurmdbdinstance": { + "Type": "AWS::EC2::SecurityGroupIngress", + "Properties": { + "FromPort": 6820, + "GroupId": { + "Ref": "SlurmdbdClientSecurityGroup" + }, + "IpProtocol": "tcp", + "SourceSecurityGroupId": { + "Ref": "SlurmdbdServerSecurityGroup" + }, + "ToPort": 6829 + }, + "Metadata": { + "aws:cdk:path": "ExternalSlurmdbdStack/Allow traffic coming from slurmdbd instance" + } + }, + "SlurmdbdLogGroup7510BC34": { + "Type": "AWS::Logs::LogGroup", + "Properties": { + "LogGroupName": { + "Fn::Join": [ + "-", + [ + { + "Fn::Join": [ + "", + [ + "/aws/parallelcluster/external-slurmdbd/", { - "Ref": "SlurmdbdInstanceRole" + "Ref": "AWS::StackName" } + ] ] - }, - "Type": "AWS::IAM::InstanceProfile" - }, - "ExternalSlurmdbdPolicies": { - "Metadata": { - "aws:cdk:path": "ExternalSlurmdbdStack/ExternalSlurmdbdPolicies" - }, - "Properties": { - "PolicyDocument": { - "Statement": [ - { - "Action": "secretsmanager:GetSecretValue", - "Effect": "Allow", - "Resource": [ - { - "Ref": "DBMSPasswordSecretArn" - }, - { - "Ref": "MungeKeySecretArn" - } - ], - "Sid": "SecretsManagerPolicy" - }, - { - "Action": [ - "logs:CreateLogStream", - "logs:PutLogEvents" - ], - "Effect": "Allow", - "Resource": { - "Fn::GetAtt": [ - "SlurmdbdLogGroup7510BC34", - "Arn" - ] - }, - "Sid": "CloudWatchLogsPolicy" - }, - { - "Action": "ec2:AssignPrivateIpAddresses", - "Condition": { - "StringLike": { - "ec2:Subnet": { - "Fn::Join": [ - "", - [ - "*", - { - "Ref": "SubnetId" - } - ] - ] - } - } - }, - "Effect": "Allow", - "Resource": "*", - "Sid": "IPAssignmentPolicy" - }, - { - "Action": "s3:ListBucket", - "Effect": "Allow", - "Resource": { - "Fn::GetAtt": [ - "ExternalSlurmdbdS3Bucket", - "Arn" - ] - }, - "Sid": "S3BucketPolicy" - }, - { - "Action": [ - "s3:AbortMultipartUpload", - "s3:DeleteObject", - "s3:GetObject", - "s3:PutObject" - ], - "Effect": "Allow", - "Resource": { - "Fn::Join": [ - "", - [ - { - "Fn::GetAtt": [ - "ExternalSlurmdbdS3Bucket", - "Arn" - ] - }, - "/*" - ] - ] - }, - "Sid": "S3BucketObjectsPolicy" - } - ], - "Version": "2012-10-17" - }, - "PolicyName": "ExternalSlurmdbdPolicies", - "Roles": [ - { - "Ref": "SlurmdbdInstanceRole" - } + }, + { + "Fn::Select": [ + 4, + { + "Fn::Split": [ + "-", + { + "Fn::Select": [ + 2, + { + "Fn::Split": [ + "/", + { + "Ref": "AWS::StackId" + } + ] + } + ] + } + ] + } ] - }, - "Type": "AWS::IAM::Policy" - }, - "ExternalSlurmdbdS3Bucket": { - "Metadata": { - "aws:cdk:path": "ExternalSlurmdbdStack/ExternalSlurmdbdS3Bucket" - }, - "Properties": { - "PublicAccessBlockConfiguration": { - "BlockPublicAcls": true, - "BlockPublicPolicy": true, - "IgnorePublicAcls": true, - "RestrictPublicBuckets": true + } + ] + ] + }, + "RetentionInDays": 7 + }, + "UpdateReplacePolicy": "Retain", + "DeletionPolicy": "Retain", + "Metadata": { + "aws:cdk:path": "ExternalSlurmdbdStack/SlurmdbdLogGroup/Resource" + } + }, + "ExternalSlurmdbdS3Bucket": { + "Type": "AWS::S3::Bucket", + "Properties": { + "PublicAccessBlockConfiguration": { + "BlockPublicAcls": true, + "BlockPublicPolicy": true, + "IgnorePublicAcls": true, + "RestrictPublicBuckets": true + }, + "VersioningConfiguration": { + "Status": "Enabled" + } + }, + "Metadata": { + "aws:cdk:path": "ExternalSlurmdbdStack/ExternalSlurmdbdS3Bucket" + } + }, + "SlurmdbdInstanceRole": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Statement": [ + { + "Action": "sts:AssumeRole", + "Effect": "Allow", + "Principal": { + "Service": { + "Fn::Join": [ + "", + [ + "ec2.", + { + "Ref": "AWS::URLSuffix" + } + ] + ] + } + } + } + ], + "Version": "2012-10-17" + }, + "Description": "Role for Slurmdbd EC2 instance to access necessary AWS resources" + }, + "Metadata": { + "aws:cdk:path": "ExternalSlurmdbdStack/SlurmdbdInstanceRole" + } + }, + "ExternalSlurmdbdPolicies": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyDocument": { + "Statement": [ + { + "Action": "secretsmanager:GetSecretValue", + "Effect": "Allow", + "Resource": [ + { + "Ref": "DBMSPasswordSecretArn" }, - "VersioningConfiguration": { - "Status": "Enabled" + { + "Ref": "MungeKeySecretArn" } + ], + "Sid": "SecretsManagerPolicy" }, - "Type": "AWS::S3::Bucket" - }, - "LaunchTemplate": { - "Metadata": { - "AWS::CloudFormation::Init": { - "configSets": { - "default": [ - "setup" - ] - }, - "setup": { - "commands": { - "chef": { - "command": "cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::external_slurmdbd_config", - "cwd": "/etc/chef" - } - }, - "files": { - "/etc/chef/dna.json": { - "content": { - "Fn::Join": [ - "", - [ - "{\"slurmdbd_ip\": \"", - { - "Ref": "PrivateIp" - }, - "\", \"slurmdbd_port\": ", - { - "Ref": "SlurmdbdPort" - }, - ", \"dbms_uri\": \"", - { - "Ref": "DBMSUri" - }, - "\", \"dbms_username\": \"", - { - "Ref": "DBMSUsername" - }, - "\", \"dbms_database_name\": \"", - { - "Ref": "DBMSDatabaseName" - }, - "\", \"dbms_password_secret_arn\": \"", - { - "Ref": "DBMSPasswordSecretArn" - }, - "\", \"munge_key_secret_arn\": \"", - { - "Ref": "MungeKeySecretArn" - }, - "\", \"slurmdbd_conf_bucket\": \"", - { - "Ref": "ExternalSlurmdbdS3Bucket" - }, - "\", \"cluster\": {\"region\": \"", - { - "Ref": "AWS::Region" - }, - "\", \"log_group_name\": \"", - { - "Ref": "SlurmdbdLogGroup7510BC34" - }, - "\", \"stack_name\": \"", - { - "Ref": "AWS::StackName" - }, - "\", \"node_type\": \"ExternalSlurmDbd\", \"cw_logging_enabled\": \"true\", \"slurmdbd_service_enabled\": \"", - { - "Ref": "EnableSlurmdbdSystemService" - }, - "\"}}" - ] - ] - }, - "group": "root", - "mode": "000644", - "owner": "root" - } - } - } - }, - "aws:cdk:path": "ExternalSlurmdbdStack/LaunchTemplate" + { + "Action": [ + "logs:CreateLogStream", + "logs:PutLogEvents" + ], + "Effect": "Allow", + "Resource": { + "Fn::GetAtt": [ + "SlurmdbdLogGroup7510BC34", + "Arn" + ] + }, + "Sid": "CloudWatchLogsPolicy" }, - "Properties": { - "LaunchTemplateData": { - "IamInstanceProfile": { - "Name": { - "Ref": "ExternalSlurmdbdInstanceProfile" - } - }, - "ImageId": { - "Ref": "AmiId" - }, - "InstanceType": { - "Ref": "InstanceType" - }, - "KeyName": { - "Ref": "KeyName" - }, - "MetadataOptions": { - "HttpTokens": "required" - }, - "NetworkInterfaces": [ + { + "Action": "ec2:AssignPrivateIpAddresses", + "Condition": { + "StringLike": { + "ec2:Subnet": { + "Fn::Join": [ + "", + [ + "*", { - "DeviceIndex": 0, - "Groups": [ - { - "Ref": "SSHServerSecurityGroup" - }, - { - "Ref": "SlurmdbdServerSecurityGroup" - }, - { - "Ref": "DBMSClientSG" - } - ], - "SubnetId": { - "Ref": "SubnetId" - } + "Ref": "SubnetId" } - ], - "UserData": { - "Fn::Base64": { - "Fn::Sub": [ - "Content-Type: multipart/mixed; boundary=\"==BOUNDARY==\"\nMIME-Version: 1.0\n\n--==BOUNDARY==\nContent-Type: text/cloud-config; charset=us-ascii\nMIME-Version: 1.0\n\npackage_update: false\npackage_upgrade: false\nrepo_upgrade: none\ndatasource_list: [ Ec2, None ]\n\n--==BOUNDARY==\nContent-Type: text/x-shellscript; charset=\"us-ascii\"\nMIME-Version: 1.0\n#!/bin/bash -x\n\nfunction vendor_cookbook\n{\n mkdir /tmp/cookbooks\n cd /tmp/cookbooks\n tar -xzf /etc/chef/aws-parallelcluster-cookbook.tgz\n HOME_BAK=\"${!HOME}\"\n export HOME=\"/tmp\"\n for d in /tmp/cookbooks/*; do\n cd \"$d\" || continue\n LANG=en_US.UTF-8 /opt/cinc/embedded/bin/berks vendor /etc/chef/cookbooks --delete\n done;\n export HOME=\"${!HOME_BAK}\"\n}\n\nfunction wait_for_private_ip_assignment\n{\n rc=1\n retries=10\n retry=1\n sleeptime=1\n while [ \\( $rc -eq 1 \\) -a \\( $retry -le $retries \\) ]; do\n number_of_ips=$(curl -H \"X-aws-ec2-metadata-token: $TOKEN\" -v http://169.254.169.254/latest/meta-data/network/interfaces/macs/\"$MAC\"/local-ipv4s | wc -l)\n ((number_of_ips>0))\n rc=$?\n retry=$((retry+1))\n sleep $sleeptime\n done\n return $rc\n}\n\nTOKEN=`curl -X PUT \"http://169.254.169.254/latest/api/token\" -H \"X-aws-ec2-metadata-token-ttl-seconds: 21600\"`\nMAC=$(curl -H \"X-aws-ec2-metadata-token: $TOKEN\" -v http://169.254.169.254/latest/meta-data/mac)\nENI_ID=$(curl -H \"X-aws-ec2-metadata-token: $TOKEN\" -v http://169.254.169.254/latest/meta-data/network/interfaces/macs/\"$MAC\"/interface-id)\n\naws ec2 assign-private-ip-addresses --region \"${AWS::Region}\" --network-interface-id \"${!ENI_ID}\" --private-ip-addresses ${PrivateIp} --allow-reassignment\n\nwait_for_private_ip_assignment || echo \"Assignment of private IP ${PrivateIp} was not successful.\"\n\nip addr add ${PrivateIp}/${SubnetPrefix} dev eth0\n\nif [ \"${CustomCookbookUrl}\" != \"NONE\" ]; then\n curl --retry 3 -v -L -o /etc/chef/aws-parallelcluster-cookbook.tgz ${CustomCookbookUrl}\n vendor_cookbook\nfi\n\n# This is necessary to find the cfn-init application\nexport PATH=/opt/aws/bin:${!PATH}\n[ -f /etc/parallelcluster/pcluster_cookbook_environment.sh ] && . /etc/parallelcluster/pcluster_cookbook_environment.sh\n\n$CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -s ${AWS::StackName} -v -c default -r LaunchTemplate --region \"${AWS::Region}\"\n", - { - "CustomCookbookUrl": { - "Ref": "CustomCookbookUrl" - }, - "PrivateIp": { - "Ref": "PrivateIp" - }, - "Region": { - "Ref": "AWS::Region" - }, - "StackName": { - "Ref": "AWS::StackName" - }, - "SubnetPrefix": { - "Ref": "PrivatePrefix" - } - } - ] - } - } - } - }, - "Type": "AWS::EC2::LaunchTemplate" - }, - "SSHClientSecurityGroup": { - "Metadata": { - "aws:cdk:path": "ExternalSlurmdbdStack/SSHClientSecurityGroup" - }, - "Properties": { - "GroupDescription": "Allow SSH access to slurmdbd instance (client)", - "VpcId": { - "Ref": "VPCId" + ] + ] + } } + }, + "Effect": "Allow", + "Resource": "*", + "Sid": "IPAssignmentPolicy" }, - "Type": "AWS::EC2::SecurityGroup" - }, - "SSHServerSecurityGroup": { - "Metadata": { - "aws:cdk:path": "ExternalSlurmdbdStack/SSHServerSecurityGroup" + { + "Action": "s3:ListBucket", + "Effect": "Allow", + "Resource": { + "Fn::GetAtt": [ + "ExternalSlurmdbdS3Bucket", + "Arn" + ] + }, + "Sid": "S3BucketPolicy" }, - "Properties": { - "GroupDescription": "Allow SSH access to slurmdbd instance (server)", - "VpcId": { - "Ref": "VPCId" + { + "Action": [ + "s3:AbortMultipartUpload", + "s3:DeleteObject", + "s3:GetObject", + "s3:PutObject" + ], + "Effect": "Allow", + "Resource": { + "Fn::Join": [ + "", + [ + { + "Fn::GetAtt": [ + "ExternalSlurmdbdS3Bucket", + "Arn" + ] + }, + "/*" + ] + ] + }, + "Sid": "S3BucketObjectsPolicy" + } + ], + "Version": "2012-10-17" + }, + "PolicyName": "ExternalSlurmdbdPolicies", + "Roles": [ + { + "Ref": "SlurmdbdInstanceRole" + } + ] + }, + "Metadata": { + "aws:cdk:path": "ExternalSlurmdbdStack/ExternalSlurmdbdPolicies" + } + }, + "ExternalSlurmdbdInstanceProfile": { + "Type": "AWS::IAM::InstanceProfile", + "Properties": { + "Roles": [ + { + "Ref": "SlurmdbdInstanceRole" + } + ] + }, + "Metadata": { + "aws:cdk:path": "ExternalSlurmdbdStack/ExternalSlurmdbdInstanceProfile" + } + }, + "LaunchTemplate": { + "Type": "AWS::EC2::LaunchTemplate", + "Properties": { + "LaunchTemplateData": { + "IamInstanceProfile": { + "Name": { + "Ref": "ExternalSlurmdbdInstanceProfile" + } + }, + "ImageId": { + "Ref": "AmiId" + }, + "InstanceType": { + "Ref": "InstanceType" + }, + "KeyName": { + "Ref": "KeyName" + }, + "MetadataOptions": { + "HttpTokens": "required" + }, + "NetworkInterfaces": [ + { + "DeviceIndex": 0, + "Groups": [ + { + "Ref": "SSHServerSecurityGroup" + }, + { + "Ref": "SlurmdbdServerSecurityGroup" + }, + { + "Ref": "DBMSClientSG" } - }, - "Type": "AWS::EC2::SecurityGroup" - }, - "SlurmdbdClientSecurityGroup": { - "Metadata": { - "aws:cdk:path": "ExternalSlurmdbdStack/SlurmdbdClientSecurityGroup" - }, - "Properties": { - "GroupDescription": "Allow Slurm accounting traffic from the cluster head node (client)", - "VpcId": { - "Ref": "VPCId" + ], + "SubnetId": { + "Ref": "SubnetId" + } + } + ], + "UserData": { + "Fn::Base64": { + "Fn::Sub": [ + "Content-Type: multipart/mixed; boundary=\"==BOUNDARY==\"\nMIME-Version: 1.0\n\n--==BOUNDARY==\nContent-Type: text/cloud-config; charset=us-ascii\nMIME-Version: 1.0\n\npackage_update: false\npackage_upgrade: false\nrepo_upgrade: none\ndatasource_list: [ Ec2, None ]\n\n--==BOUNDARY==\nContent-Type: text/x-shellscript; charset=\"us-ascii\"\nMIME-Version: 1.0\n#!/bin/bash -x\n\nfunction vendor_cookbook\n{\n mkdir /tmp/cookbooks\n cd /tmp/cookbooks\n tar -xzf /etc/chef/aws-parallelcluster-cookbook.tgz\n HOME_BAK=\"${!HOME}\"\n export HOME=\"/tmp\"\n for d in /tmp/cookbooks/*; do\n cd \"$d\" || continue\n LANG=en_US.UTF-8 /opt/cinc/embedded/bin/berks vendor /etc/chef/cookbooks --delete\n done;\n export HOME=\"${!HOME_BAK}\"\n}\n\nfunction wait_for_private_ip_assignment\n{\n rc=1\n retries=10\n retry=1\n sleeptime=1\n while [ \\( $rc -eq 1 \\) -a \\( $retry -le $retries \\) ]; do\n number_of_ips=$(curl -H \"X-aws-ec2-metadata-token: $TOKEN\" -v http://169.254.169.254/latest/meta-data/network/interfaces/macs/\"$MAC\"/local-ipv4s | wc -l)\n ((number_of_ips>0))\n rc=$?\n retry=$((retry+1))\n sleep $sleeptime\n done\n return $rc\n}\n\nTOKEN=`curl -X PUT \"http://169.254.169.254/latest/api/token\" -H \"X-aws-ec2-metadata-token-ttl-seconds: 21600\"`\nMAC=$(curl -H \"X-aws-ec2-metadata-token: $TOKEN\" -v http://169.254.169.254/latest/meta-data/mac)\nENI_ID=$(curl -H \"X-aws-ec2-metadata-token: $TOKEN\" -v http://169.254.169.254/latest/meta-data/network/interfaces/macs/\"$MAC\"/interface-id)\nDEVICE_NAME=$(ls /sys/class/net | grep e)\n\naws ec2 assign-private-ip-addresses --region \"${AWS::Region}\" --network-interface-id \"${!ENI_ID}\" --private-ip-addresses ${PrivateIp} --allow-reassignment\n\nwait_for_private_ip_assignment || echo \"Assignment of private IP ${PrivateIp} was not successful.\"\n\nip addr add ${PrivateIp}/${SubnetPrefix} dev ${DEVICE_NAME}\n\nif [ \"${CustomCookbookUrl}\" != \"NONE\" ]; then\n curl --retry 3 -v -L -o /etc/chef/aws-parallelcluster-cookbook.tgz ${CustomCookbookUrl}\n vendor_cookbook\nfi\n\n# This is necessary to find the cfn-init application\nexport PATH=/opt/aws/bin:${!PATH}\n[ -f /etc/parallelcluster/pcluster_cookbook_environment.sh ] && . /etc/parallelcluster/pcluster_cookbook_environment.sh\n\n$CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -s ${AWS::StackName} -v -c default -r LaunchTemplate --region \"${AWS::Region}\"\n", + { + "CustomCookbookUrl": { + "Ref": "CustomCookbookUrl" + }, + "StackName": { + "Ref": "AWS::StackName" + }, + "Region": { + "Ref": "AWS::Region" + }, + "PrivateIp": { + "Ref": "PrivateIp" + }, + "SubnetPrefix": { + "Ref": "PrivatePrefix" + } } - }, - "Type": "AWS::EC2::SecurityGroup" - }, - "SlurmdbdInstanceRole": { - "Metadata": { - "aws:cdk:path": "ExternalSlurmdbdStack/SlurmdbdInstanceRole" - }, - "Properties": { - "AssumeRolePolicyDocument": { - "Statement": [ - { - "Action": "sts:AssumeRole", - "Effect": "Allow", - "Principal": { - "Service": { - "Fn::Join": [ - "", - [ - "ec2.", - { - "Ref": "AWS::URLSuffix" - } - ] - ] - } - } - } - ], - "Version": "2012-10-17" - }, - "Description": "Role for Slurmdbd EC2 instance to access necessary AWS resources" - }, - "Type": "AWS::IAM::Role" - }, - "SlurmdbdLogGroup7510BC34": { - "DeletionPolicy": "Retain", - "Metadata": { - "aws:cdk:path": "ExternalSlurmdbdStack/SlurmdbdLogGroup/Resource" - }, - "Properties": { - "LogGroupName": { - "Fn::Join": [ - "-", - [ - { - "Fn::Join": [ - "", - [ - "/aws/parallelcluster/external-slurmdbd/", - { - "Ref": "AWS::StackName" - } - ] - ] - }, - { - "Fn::Select": [ - 4, - { - "Fn::Split": [ - "-", - { - "Fn::Select": [ - 2, - { - "Fn::Split": [ - "/", - { - "Ref": "AWS::StackId" - } - ] - } - ] - } - ] - } - ] - } - ] + ] + } + } + } + }, + "Metadata": { + "aws:cdk:path": "ExternalSlurmdbdStack/LaunchTemplate", + "AWS::CloudFormation::Init": { + "configSets": { + "default": [ + "setup" + ] + }, + "setup": { + "files": { + "/etc/chef/dna.json": { + "content": { + "Fn::Join": [ + "", + [ + "{\"slurmdbd_ip\": \"", + { + "Ref": "PrivateIp" + }, + "\", \"slurmdbd_port\": ", + { + "Ref": "SlurmdbdPort" + }, + ", \"dbms_uri\": \"", + { + "Ref": "DBMSUri" + }, + "\", \"dbms_username\": \"", + { + "Ref": "DBMSUsername" + }, + "\", \"dbms_database_name\": \"", + { + "Ref": "DBMSDatabaseName" + }, + "\", \"dbms_password_secret_arn\": \"", + { + "Ref": "DBMSPasswordSecretArn" + }, + "\", \"munge_key_secret_arn\": \"", + { + "Ref": "MungeKeySecretArn" + }, + "\", \"slurmdbd_conf_bucket\": \"", + { + "Ref": "ExternalSlurmdbdS3Bucket" + }, + "\", \"cluster\": {\"region\": \"", + { + "Ref": "AWS::Region" + }, + "\", \"log_group_name\": \"", + { + "Ref": "SlurmdbdLogGroup7510BC34" + }, + "\", \"stack_name\": \"", + { + "Ref": "AWS::StackName" + }, + "\", \"node_type\": \"ExternalSlurmDbd\", \"cw_logging_enabled\": \"true\", \"slurmdbd_service_enabled\": \"", + { + "Ref": "EnableSlurmdbdSystemService" + }, + "\"}}" ] + ] }, - "RetentionInDays": 7 - }, - "Type": "AWS::Logs::LogGroup", - "UpdateReplacePolicy": "Retain" - }, - "SlurmdbdServerSecurityGroup": { - "Metadata": { - "aws:cdk:path": "ExternalSlurmdbdStack/SlurmdbdServerSecurityGroup" + "mode": "000644", + "owner": "root", + "group": "root" + } }, - "Properties": { - "GroupDescription": "Allow Slurm accounting traffic to the slurmdbd instance (server)", - "VpcId": { - "Ref": "VPCId" - } - }, - "Type": "AWS::EC2::SecurityGroup" + "commands": { + "chef": { + "command": "cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::external_slurmdbd_config", + "cwd": "/etc/chef" + } + } + } } + } + }, + "ExternalSlurmdbdASG": { + "Type": "AWS::AutoScaling::AutoScalingGroup", + "Properties": { + "DesiredCapacity": "1", + "LaunchTemplate": { + "LaunchTemplateId": { + "Ref": "LaunchTemplate" + }, + "Version": { + "Fn::GetAtt": [ + "LaunchTemplate", + "LatestVersionNumber" + ] + } + }, + "MaxSize": "1", + "MinSize": "1", + "VPCZoneIdentifier": [ + { + "Ref": "SubnetId" + } + ] + }, + "Metadata": { + "aws:cdk:path": "ExternalSlurmdbdStack/External-Slurmdbd-ASG" + } + } + }, + "Outputs": { + "SlurmdbdPrivateIp": { + "Description": "Secondary Private IP Address of the slurmdbd instance", + "Value": { + "Ref": "PrivateIp" + } + }, + "SlurmdbdPort": { + "Description": "Port used to connect to slurmdbd service", + "Value": { + "Ref": "SlurmdbdPort" + } + }, + "AccountingClientSecurityGroup": { + "Description": "Security Group ID that allows traffic from the slurmctld to slurmdbd", + "Value": { + "Ref": "SlurmdbdClientSecurityGroup" + } + }, + "SshClientSecurityGroup": { + "Description": "Security Group ID that allows SSH traffic from the HeadNode to slurmdbd instance", + "Value": { + "Ref": "SSHClientSecurityGroup" + } + }, + "SlurmdbdConfigS3BucketName": { + "Description": "S3 Bucket where a copy of the slurmdbd configuration files can be stored and re-used when re-provisioning the slurmdbd instance", + "Value": { + "Ref": "ExternalSlurmdbdS3Bucket" + } } -} + } +} \ No newline at end of file diff --git a/cloudformation/external-slurmdbd/resources/user_data.sh b/cloudformation/external-slurmdbd/resources/user_data.sh index de32060064..516f797612 100644 --- a/cloudformation/external-slurmdbd/resources/user_data.sh +++ b/cloudformation/external-slurmdbd/resources/user_data.sh @@ -48,12 +48,13 @@ function wait_for_private_ip_assignment TOKEN=`curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600"` MAC=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/mac) ENI_ID=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/network/interfaces/macs/"$MAC"/interface-id) +DEVICE_NAME=$(ls /sys/class/net | grep e) aws ec2 assign-private-ip-addresses --region "${AWS::Region}" --network-interface-id "${!ENI_ID}" --private-ip-addresses ${PrivateIp} --allow-reassignment wait_for_private_ip_assignment || echo "Assignment of private IP ${PrivateIp} was not successful." -ip addr add ${PrivateIp}/${SubnetPrefix} dev eth0 +ip addr add ${PrivateIp}/${SubnetPrefix} dev ${DEVICE_NAME} if [ "${CustomCookbookUrl}" != "NONE" ]; then curl --retry 3 -v -L -o /etc/chef/aws-parallelcluster-cookbook.tgz ${CustomCookbookUrl}