From a834ac9cb6ef7a0030a907e11a735ab5cab7b7fb Mon Sep 17 00:00:00 2001 From: Marco Dinis Date: Mon, 13 May 2024 15:17:04 +0100 Subject: [PATCH 1/3] Improve unavailable messages when EC2 Auto Discover with SSM fails EC2 Auto Discover calls ssm:SendCommand to install teleport in a set of EC2 Instances. This requires that the SSM Agent to be running and reporting back to the AWS SSM Service. This PR adds a new API call which is used to query the current status of the SSM agent in the target EC2 instance. If the agent did not register, is not currently online or the EC2 instance is running an unsupported operating system, an error is reported. The specific error is returned and the user can see this in the Audit Log. As an example, let's say we have 3 instances: - i-A: missing IAM permissions to connect to SSM - i-B: SSM ran but is now unhealthy - i-C: instance is running Windows Previously we had the following observable output after running the Discovery Service: i-A (missing iam permissions) Log message with stack trace indicating that "instance is not valid for account" with link for further troubleshoot. No audit log was emitted i-B (SSM is unhealthy) No app log, but audit log with status:failed and exit_code:-1 i-C (windows instance) No app log, but audit log with status:success and exit_code:0 After this PR, the following is reported: i-A (missing iam permissions) No app log Audit log with a clear status message (see code/tests) i-B (SSM is unhealthy) No app log Audit log with a clear status message (see code/tests) i-C (windows instance) No app log Audit log with a clear status message (see code/tests) If any other error happens, it will still be reported in the generic handler for the SendCommand API call. Given this is a new API call, if the Role does not allow it, a log warning is emitted and the behavior is the same as before. --- .../auto-discovery/servers/ec2-discovery.mdx | 2 + lib/cloud/aws/policy_statements.go | 3 +- lib/configurators/aws/aws_test.go | 8 +- .../awsoidc/ec2_ssm_iam_config.go | 5 +- lib/srv/discovery/discovery.go | 7 +- lib/srv/server/ssm_install.go | 164 +++++++++++++++++- lib/srv/server/ssm_install_test.go | 120 ++++++++++++- 7 files changed, 295 insertions(+), 14 deletions(-) diff --git a/docs/pages/auto-discovery/servers/ec2-discovery.mdx b/docs/pages/auto-discovery/servers/ec2-discovery.mdx index d58dc540aaf9d..fa421206e8865 100644 --- a/docs/pages/auto-discovery/servers/ec2-discovery.mdx +++ b/docs/pages/auto-discovery/servers/ec2-discovery.mdx @@ -165,6 +165,7 @@ AWS "Effect": "Allow", "Action": [ "ec2:DescribeInstances", + "ssm:DescribeInstanceInformation", "ssm:GetCommandInvocation", "ssm:SendCommand" ], @@ -183,6 +184,7 @@ AWS "Effect": "Allow", "Action": [ "ec2:DescribeInstances", + "ssm:DescribeInstanceInformation", "ssm:GetCommandInvocation", "ssm:SendCommand" ], diff --git a/lib/cloud/aws/policy_statements.go b/lib/cloud/aws/policy_statements.go index 557ae72839430..29fc489ea238b 100644 --- a/lib/cloud/aws/policy_statements.go +++ b/lib/cloud/aws/policy_statements.go @@ -145,8 +145,9 @@ func StatementForEC2SSMAutoDiscover() *Statement { Effect: EffectAllow, Actions: []string{ "ec2:DescribeInstances", - "ssm:SendCommand", + "ssm:DescribeInstanceInformation", "ssm:GetCommandInvocation", + "ssm:SendCommand", }, Resources: allResources, } diff --git a/lib/configurators/aws/aws_test.go b/lib/configurators/aws/aws_test.go index af0590d8362d8..c61609aa44601 100644 --- a/lib/configurators/aws/aws_test.go +++ b/lib/configurators/aws/aws_test.go @@ -582,8 +582,10 @@ func TestAWSIAMDocuments(t *testing.T) { Effect: "Allow", Actions: []string{ "ec2:DescribeInstances", + "ssm:DescribeInstanceInformation", "ssm:GetCommandInvocation", - "ssm:SendCommand"}, + "ssm:SendCommand", + }, Resources: []string{"*"}, }, }, @@ -592,8 +594,10 @@ func TestAWSIAMDocuments(t *testing.T) { Effect: "Allow", Actions: []string{ "ec2:DescribeInstances", + "ssm:DescribeInstanceInformation", "ssm:GetCommandInvocation", - "ssm:SendCommand"}, + "ssm:SendCommand", + }, Resources: []string{"*"}, }, }, diff --git a/lib/integrations/awsoidc/ec2_ssm_iam_config.go b/lib/integrations/awsoidc/ec2_ssm_iam_config.go index 4dc82d3cd03d5..95f5cab739f9e 100644 --- a/lib/integrations/awsoidc/ec2_ssm_iam_config.go +++ b/lib/integrations/awsoidc/ec2_ssm_iam_config.go @@ -124,11 +124,14 @@ func NewEC2SSMConfigureClient(ctx context.Context, region string) (EC2SSMConfigu } // ConfigureEC2SSM creates the required resources in AWS to enable EC2 Auto Discover using script mode.. -// It creates an embedded policy with the following permissions: +// It creates an inline policy with the following permissions: // // Action: List EC2 instances where teleport is going to be installed. // - ec2:DescribeInstances // +// Action: Get SSM Agent Status +// - ssm:DescribeInstanceInformation +// // Action: Run a command and get its output. // - ssm:SendCommand // - ssm:GetCommandInvocation diff --git a/lib/srv/discovery/discovery.go b/lib/srv/discovery/discovery.go index b4742cb54721e..6fec63acd59de 100644 --- a/lib/srv/discovery/discovery.go +++ b/lib/srv/discovery/discovery.go @@ -460,9 +460,14 @@ func (s *Server) initAWSWatchers(matchers []types.AWSMatcher) error { s.caRotationCh = make(chan []types.Server) if s.ec2Installer == nil { - s.ec2Installer = server.NewSSMInstaller(server.SSMInstallerConfig{ + ec2installer, err := server.NewSSMInstaller(server.SSMInstallerConfig{ Emitter: s.Emitter, }) + if err != nil { + return trace.Wrap(err) + } + + s.ec2Installer = ec2installer } lr, err := newLabelReconciler(&labelReconcilerConfig{ diff --git a/lib/srv/server/ssm_install.go b/lib/srv/server/ssm_install.go index 7f94c6113fec7..0baf22d4cfaac 100644 --- a/lib/srv/server/ssm_install.go +++ b/lib/srv/server/ssm_install.go @@ -21,6 +21,7 @@ package server import ( "context" "errors" + "log/slog" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/awserr" @@ -30,7 +31,9 @@ import ( "github.com/gravitational/trace" "golang.org/x/sync/errgroup" + "github.com/gravitational/teleport" apievents "github.com/gravitational/teleport/api/types/events" + awslib "github.com/gravitational/teleport/lib/cloud/aws" libevents "github.com/gravitational/teleport/lib/events" ) @@ -39,6 +42,9 @@ import ( type SSMInstallerConfig struct { // Emitter is an events emitter. Emitter apievents.Emitter + // Logger is used to log messages. + // Optional. A logger is created if one not supplied. + Logger *slog.Logger } // SSMInstaller handles running SSM commands that install Teleport on EC2 instances. @@ -65,11 +71,27 @@ type SSMRunRequest struct { AccountID string } +// CheckAndSetDefaults ensures the emitter is present and creates a default logger if one is not provided. +func (c *SSMInstallerConfig) CheckAndSetDefaults() error { + if c.Emitter == nil { + return trace.BadParameter("missing audit event emitter") + } + + if c.Logger == nil { + c.Logger = slog.Default().With(teleport.ComponentKey, "ssminstaller") + } + + return nil +} + // NewSSMInstaller returns a new instance of the SSM installer that installs Teleport on EC2 instances. -func NewSSMInstaller(cfg SSMInstallerConfig) *SSMInstaller { +func NewSSMInstaller(cfg SSMInstallerConfig) (*SSMInstaller, error) { + if err := cfg.CheckAndSetDefaults(); err != nil { + return nil, trace.Wrap(err) + } return &SSMInstaller{ SSMInstallerConfig: cfg, - } + }, nil } // Run executes the SSM document and then blocks until the command has completed. @@ -84,9 +106,30 @@ func (si *SSMInstaller) Run(ctx context.Context, req SSMRunRequest) error { params[k] = []*string{aws.String(v)} } + validInstances := ids + instancesState, err := si.ssmAgentState(ctx, req, ids) + switch { + case err != nil: + // ssmAgentState uses `ssm:DescribeInstanceInformation` to gather all the instances information. + // Previous Docs versions (pre-v16) did not ask for that permission. + // If the IAM role does not have access to that action, an Access Denied is returned here. + // The process continues but the user is warned that they should add that permission to get better diagnostics. + if !trace.IsAccessDenied(err) { + return trace.Wrap(err) + } + + si.Logger.WarnContext(ctx, "Add ssm:DescribeInstanceInformation action to IAM Role to improve diagnostics of EC2 Teleport installation failures") + + default: + if err := si.emitInvalidInstanceEvents(ctx, req, instancesState); err != nil { + return trace.Wrap(err) + } + validInstances = instancesState.valid + } + output, err := req.SSM.SendCommandWithContext(ctx, &ssm.SendCommandInput{ DocumentName: aws.String(req.DocumentName), - InstanceIds: aws.StringSlice(ids), + InstanceIds: aws.StringSlice(validInstances), Parameters: params, }) if err != nil { @@ -95,7 +138,7 @@ func (si *SSMInstaller) Run(ctx context.Context, req SSMRunRequest) error { g, ctx := errgroup.WithContext(ctx) g.SetLimit(10) - for _, inst := range ids { + for _, inst := range validInstances { inst := inst g.Go(func() error { return trace.Wrap(si.checkCommand(ctx, req, output.Command.CommandId, &inst)) @@ -104,6 +147,119 @@ func (si *SSMInstaller) Run(ctx context.Context, req SSMRunRequest) error { return trace.Wrap(g.Wait()) } +func invalidSSMInstanceEvent(accountID, region, instanceID, status string) apievents.SSMRun { + return apievents.SSMRun{ + Metadata: apievents.Metadata{ + Type: libevents.SSMRunEvent, + Code: libevents.SSMRunFailCode, + }, + CommandID: "no-command", + AccountID: accountID, + Region: region, + ExitCode: -1, + InstanceID: instanceID, + Status: status, + } +} + +/* +SSM SendCommand failed with ErrCodeInvalidInstanceId. Make sure that the instances have AmazonSSMManagedInstanceCore policy assigned. +Also check that SSM agent is running and registered with the SSM endpoint on that instance and try restarting or reinstalling it in case of issues. +See https://docs.aws.amazon.com/systems-manager/latest/APIReference/API_SendCommand.html#API_SendCommand_Errors for more details. +*/ + +func (si *SSMInstaller) emitInvalidInstanceEvents(ctx context.Context, req SSMRunRequest, instancesState *instancesSSMState) error { + for _, instanceID := range instancesState.missing { + event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID, + "EC2 Instance is not registered in SSM. Make sure that the instance has AmazonSSMManagedInstanceCore policy assigned.", + ) + if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil { + return trace.Wrap(err) + } + } + + for _, instanceID := range instancesState.connectionLost { + event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID, + "SSM Agent in EC2 Instance is not connecting to SSM Service. Restart or reinstall the SSM service. See https://docs.aws.amazon.com/systems-manager/latest/userguide/ami-preinstalled-agent.html#verify-ssm-agent-status for more details.", + ) + if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil { + return trace.Wrap(err) + } + } + + for _, instanceID := range instancesState.unsupportedOS { + event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID, + "EC2 instance is running an unsupported Operating System. Only Linux is supported.", + ) + if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil { + return trace.Wrap(err) + } + } + + return nil +} + +type instancesSSMState struct { + valid []string + missing []string + connectionLost []string + unsupportedOS []string +} + +// ssmAgentState returns the instancesSSMState for all the instances. +func (si *SSMInstaller) ssmAgentState(ctx context.Context, req SSMRunRequest, allInstanceIDs []string) (*instancesSSMState, error) { + ret := &instancesSSMState{} + + // Default is 10, but AWS returns an error if less than 5. + maxResults := aws.Int64(int64(10)) + if len(allInstanceIDs) > 10 { + maxResults = aws.Int64(int64(len(allInstanceIDs))) + } + + ssmInstancesInfo, err := req.SSM.DescribeInstanceInformationWithContext(ctx, &ssm.DescribeInstanceInformationInput{ + Filters: []*ssm.InstanceInformationStringFilter{ + {Key: aws.String(ssm.InstanceInformationFilterKeyInstanceIds), Values: aws.StringSlice(allInstanceIDs)}, + }, + MaxResults: maxResults, + }) + if err != nil { + // Ignore AccessDeniedException error because users might not have granted `ssm:DescribeInstanceInformation` policy. + // Previous docs didn't require this Policy. + awsErr := awslib.ConvertRequestFailureError(err) + if trace.IsAccessDenied(awsErr) { + return nil, trace.Wrap(awsErr) + } + return nil, trace.Wrap(awsErr) + } + + instanceStateByInstanceID := make(map[string]*ssm.InstanceInformation, len(ssmInstancesInfo.InstanceInformationList)) + for _, instanceState := range ssmInstancesInfo.InstanceInformationList { + instanceStateByInstanceID[aws.StringValue(instanceState.InstanceId)] = instanceState + } + + for _, instanceID := range allInstanceIDs { + instanceState, found := instanceStateByInstanceID[instanceID] + if !found { + ret.missing = append(ret.missing, instanceID) + continue + } + + if aws.StringValue(instanceState.PingStatus) == ssm.PingStatusConnectionLost { + ret.connectionLost = append(ret.connectionLost, instanceID) + continue + } + + if aws.StringValue(instanceState.PlatformType) != ssm.PlatformTypeLinux { + ret.unsupportedOS = append(ret.unsupportedOS, instanceID) + continue + } + + ret.valid = append(ret.valid, instanceID) + } + + return ret, nil +} + // skipAWSWaitErr is used to ignore the error returned from // WaitUntilCommandExecutedWithContext if it is a resource not ready // code as this can represent one of several different errors which diff --git a/lib/srv/server/ssm_install_test.go b/lib/srv/server/ssm_install_test.go index 3114608c5e5cd..ae0cdd2ce8d0a 100644 --- a/lib/srv/server/ssm_install_test.go +++ b/lib/srv/server/ssm_install_test.go @@ -20,6 +20,7 @@ package server import ( "context" + "net/http" "testing" "github.com/aws/aws-sdk-go/aws" @@ -27,6 +28,7 @@ import ( "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/service/ssm" "github.com/aws/aws-sdk-go/service/ssm/ssmiface" + "github.com/google/uuid" "github.com/stretchr/testify/require" "github.com/gravitational/teleport/api/types/events" @@ -35,8 +37,9 @@ import ( type mockSSMClient struct { ssmiface.SSMAPI - commandOutput *ssm.SendCommandOutput - invokeOutput *ssm.GetCommandInvocationOutput + commandOutput *ssm.SendCommandOutput + invokeOutput *ssm.GetCommandInvocationOutput + describeOutput *ssm.DescribeInstanceInformationOutput } func (sm *mockSSMClient) SendCommandWithContext(_ context.Context, input *ssm.SendCommandInput, _ ...request.Option) (*ssm.SendCommandOutput, error) { @@ -47,6 +50,13 @@ func (sm *mockSSMClient) GetCommandInvocationWithContext(_ context.Context, inpu return sm.invokeOutput, nil } +func (sm *mockSSMClient) DescribeInstanceInformationWithContext(_ context.Context, input *ssm.DescribeInstanceInformationInput, _ ...request.Option) (*ssm.DescribeInstanceInformationOutput, error) { + if sm.describeOutput == nil { + return nil, awserr.NewRequestFailure(awserr.New("AccessDeniedException", "message", nil), http.StatusBadRequest, uuid.NewString()) + } + return sm.describeOutput, nil +} + func (sm *mockSSMClient) WaitUntilCommandExecutedWithContext(aws.Context, *ssm.GetCommandInvocationInput, ...request.WaiterOption) error { if aws.StringValue(sm.commandOutput.Command.Status) == ssm.CommandStatusFailed { return awserr.New(request.WaiterResourceNotReadyErrorCode, "err", nil) @@ -152,17 +162,117 @@ func TestSSMInstaller(t *testing.T) { }, }, }, + { + name: "detailed events if ssm:DescribeInstanceInformation is available", + req: SSMRunRequest{ + Instances: []EC2Instance{ + {InstanceID: "instance-id-1"}, + {InstanceID: "instance-id-2"}, + {InstanceID: "instance-id-3"}, + {InstanceID: "instance-id-4"}, + }, + DocumentName: document, + Params: map[string]string{"token": "abcdefg"}, + SSM: &mockSSMClient{ + commandOutput: &ssm.SendCommandOutput{ + Command: &ssm.Command{ + CommandId: aws.String("command-id-1"), + }, + }, + invokeOutput: &ssm.GetCommandInvocationOutput{ + Status: aws.String(ssm.CommandStatusSuccess), + ResponseCode: aws.Int64(0), + }, + describeOutput: &ssm.DescribeInstanceInformationOutput{ + InstanceInformationList: []*ssm.InstanceInformation{ + { + InstanceId: aws.String("instance-id-1"), + PingStatus: aws.String("Online"), + PlatformType: aws.String("Linux"), + }, + { + InstanceId: aws.String("instance-id-2"), + PingStatus: aws.String("ConnectionLost"), + PlatformType: aws.String("Linux"), + }, + { + InstanceId: aws.String("instance-id-3"), + PingStatus: aws.String("Online"), + PlatformType: aws.String("Windows"), + }, + }, + }, + }, + Region: "eu-central-1", + AccountID: "account-id", + }, + conf: SSMInstallerConfig{ + Emitter: &mockEmitter{}, + }, + expectedEvents: []events.AuditEvent{ + &events.SSMRun{ + Metadata: events.Metadata{ + Type: libevent.SSMRunEvent, + Code: libevent.SSMRunSuccessCode, + }, + CommandID: "command-id-1", + InstanceID: "instance-id-1", + AccountID: "account-id", + Region: "eu-central-1", + ExitCode: 0, + Status: ssm.CommandStatusSuccess, + }, + &events.SSMRun{ + Metadata: events.Metadata{ + Type: libevent.SSMRunEvent, + Code: libevent.SSMRunFailCode, + }, + CommandID: "no-command", + InstanceID: "instance-id-2", + AccountID: "account-id", + Region: "eu-central-1", + ExitCode: -1, + Status: "SSM Agent in EC2 Instance is not connecting to SSM Service. Restart or reinstall the SSM service. See https://docs.aws.amazon.com/systems-manager/latest/userguide/ami-preinstalled-agent.html#verify-ssm-agent-status for more details.", + }, + &events.SSMRun{ + Metadata: events.Metadata{ + Type: libevent.SSMRunEvent, + Code: libevent.SSMRunFailCode, + }, + CommandID: "no-command", + InstanceID: "instance-id-3", + AccountID: "account-id", + Region: "eu-central-1", + ExitCode: -1, + Status: "EC2 instance is running an unsupported Operating System. Only Linux is supported.", + }, + &events.SSMRun{ + Metadata: events.Metadata{ + Type: libevent.SSMRunEvent, + Code: libevent.SSMRunFailCode, + }, + CommandID: "no-command", + InstanceID: "instance-id-4", + AccountID: "account-id", + Region: "eu-central-1", + ExitCode: -1, + Status: "EC2 Instance is not registered in SSM. Make sure that the instance has AmazonSSMManagedInstanceCore policy assigned.", + }, + }, + }, // todo(amk): test that incomplete commands eventually return // an event once completed } { t.Run(tc.name, func(t *testing.T) { ctx := context.Background() - inst := NewSSMInstaller(tc.conf) - err := inst.Run(ctx, tc.req) + inst, err := NewSSMInstaller(tc.conf) + require.NoError(t, err) + + err = inst.Run(ctx, tc.req) require.NoError(t, err) emitter := inst.Emitter.(*mockEmitter) - require.Equal(t, tc.expectedEvents, emitter.events) + require.ElementsMatch(t, tc.expectedEvents, emitter.events) }) } } From ca3da6fa965aba9035e89a3a0945b306a6a1405c Mon Sep 17 00:00:00 2001 From: Marco Dinis Date: Tue, 14 May 2024 09:39:27 +0100 Subject: [PATCH 2/3] best effort on emitting events --- lib/srv/server/ssm_install.go | 74 ++++++++++++++++------------------- 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/lib/srv/server/ssm_install.go b/lib/srv/server/ssm_install.go index 0baf22d4cfaac..137a8b0573c89 100644 --- a/lib/srv/server/ssm_install.go +++ b/lib/srv/server/ssm_install.go @@ -72,7 +72,7 @@ type SSMRunRequest struct { } // CheckAndSetDefaults ensures the emitter is present and creates a default logger if one is not provided. -func (c *SSMInstallerConfig) CheckAndSetDefaults() error { +func (c *SSMInstallerConfig) checkAndSetDefaults() error { if c.Emitter == nil { return trace.BadParameter("missing audit event emitter") } @@ -86,7 +86,7 @@ func (c *SSMInstallerConfig) CheckAndSetDefaults() error { // NewSSMInstaller returns a new instance of the SSM installer that installs Teleport on EC2 instances. func NewSSMInstaller(cfg SSMInstallerConfig) (*SSMInstaller, error) { - if err := cfg.CheckAndSetDefaults(); err != nil { + if err := cfg.checkAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } return &SSMInstaller{ @@ -107,22 +107,26 @@ func (si *SSMInstaller) Run(ctx context.Context, req SSMRunRequest) error { } validInstances := ids - instancesState, err := si.ssmAgentState(ctx, req, ids) + instancesState, err := si.describeSSMAgentState(ctx, req, ids) switch { - case err != nil: - // ssmAgentState uses `ssm:DescribeInstanceInformation` to gather all the instances information. + case trace.IsAccessDenied(err): + // describeSSMAgentState uses `ssm:DescribeInstanceInformation` to gather all the instances information. // Previous Docs versions (pre-v16) did not ask for that permission. // If the IAM role does not have access to that action, an Access Denied is returned here. // The process continues but the user is warned that they should add that permission to get better diagnostics. - if !trace.IsAccessDenied(err) { - return trace.Wrap(err) - } + si.Logger.WarnContext(ctx, + "Add ssm:DescribeInstanceInformation action to IAM Role to improve diagnostics of EC2 Teleport installation failures", + "error", err) - si.Logger.WarnContext(ctx, "Add ssm:DescribeInstanceInformation action to IAM Role to improve diagnostics of EC2 Teleport installation failures") + case err != nil: + return trace.Wrap(err) default: if err := si.emitInvalidInstanceEvents(ctx, req, instancesState); err != nil { - return trace.Wrap(err) + si.Logger.ErrorContext(ctx, + "Failed to emit invalid instances", + "instances", instancesState, + "error", err) } validInstances = instancesState.valid } @@ -162,78 +166,66 @@ func invalidSSMInstanceEvent(accountID, region, instanceID, status string) apiev } } -/* -SSM SendCommand failed with ErrCodeInvalidInstanceId. Make sure that the instances have AmazonSSMManagedInstanceCore policy assigned. -Also check that SSM agent is running and registered with the SSM endpoint on that instance and try restarting or reinstalling it in case of issues. -See https://docs.aws.amazon.com/systems-manager/latest/APIReference/API_SendCommand.html#API_SendCommand_Errors for more details. -*/ - -func (si *SSMInstaller) emitInvalidInstanceEvents(ctx context.Context, req SSMRunRequest, instancesState *instancesSSMState) error { - for _, instanceID := range instancesState.missing { +func (si *SSMInstaller) emitInvalidInstanceEvents(ctx context.Context, req SSMRunRequest, instanceIDsState *instanceIDsSSMState) error { + var errs []error + for _, instanceID := range instanceIDsState.missing { event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID, "EC2 Instance is not registered in SSM. Make sure that the instance has AmazonSSMManagedInstanceCore policy assigned.", ) if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil { - return trace.Wrap(err) + errs = append(errs, trace.Wrap(err)) } } - for _, instanceID := range instancesState.connectionLost { + for _, instanceID := range instanceIDsState.connectionLost { event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID, "SSM Agent in EC2 Instance is not connecting to SSM Service. Restart or reinstall the SSM service. See https://docs.aws.amazon.com/systems-manager/latest/userguide/ami-preinstalled-agent.html#verify-ssm-agent-status for more details.", ) if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil { - return trace.Wrap(err) + errs = append(errs, trace.Wrap(err)) } } - for _, instanceID := range instancesState.unsupportedOS { + for _, instanceID := range instanceIDsState.unsupportedOS { event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID, "EC2 instance is running an unsupported Operating System. Only Linux is supported.", ) if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil { - return trace.Wrap(err) + errs = append(errs, trace.Wrap(err)) } } - return nil + return errors.Join(errs...) } -type instancesSSMState struct { +// instanceIDsSSMState contains a list of EC2 Instance IDs for a given state. +type instanceIDsSSMState struct { valid []string missing []string connectionLost []string unsupportedOS []string } -// ssmAgentState returns the instancesSSMState for all the instances. -func (si *SSMInstaller) ssmAgentState(ctx context.Context, req SSMRunRequest, allInstanceIDs []string) (*instancesSSMState, error) { - ret := &instancesSSMState{} +// describeSSMAgentState returns the instanceIDsSSMState for all the instances. +func (si *SSMInstaller) describeSSMAgentState(ctx context.Context, req SSMRunRequest, allInstanceIDs []string) (*instanceIDsSSMState, error) { + ret := &instanceIDsSSMState{} - // Default is 10, but AWS returns an error if less than 5. - maxResults := aws.Int64(int64(10)) - if len(allInstanceIDs) > 10 { - maxResults = aws.Int64(int64(len(allInstanceIDs))) - } + const defaultMaxResults = 10 ssmInstancesInfo, err := req.SSM.DescribeInstanceInformationWithContext(ctx, &ssm.DescribeInstanceInformationInput{ Filters: []*ssm.InstanceInformationStringFilter{ {Key: aws.String(ssm.InstanceInformationFilterKeyInstanceIds), Values: aws.StringSlice(allInstanceIDs)}, }, - MaxResults: maxResults, + // AWS returns an error if MaxResults is less than 5. + MaxResults: aws.Int64(max(defaultMaxResults, int64(len(allInstanceIDs)))), }) if err != nil { - // Ignore AccessDeniedException error because users might not have granted `ssm:DescribeInstanceInformation` policy. - // Previous docs didn't require this Policy. - awsErr := awslib.ConvertRequestFailureError(err) - if trace.IsAccessDenied(awsErr) { - return nil, trace.Wrap(awsErr) - } - return nil, trace.Wrap(awsErr) + return nil, trace.Wrap(awslib.ConvertRequestFailureError(err)) } instanceStateByInstanceID := make(map[string]*ssm.InstanceInformation, len(ssmInstancesInfo.InstanceInformationList)) for _, instanceState := range ssmInstancesInfo.InstanceInformationList { + // instanceState.InstanceId always has the InstanceID value according to AWS Docs. instanceStateByInstanceID[aws.StringValue(instanceState.InstanceId)] = instanceState } From 6b59009a058b2812dc6fc5cf704a65ff06c75e71 Mon Sep 17 00:00:00 2001 From: Marco Dinis Date: Thu, 16 May 2024 17:03:38 +0100 Subject: [PATCH 3/3] improve maxresults param --- lib/srv/server/ec2_watcher.go | 3 +++ lib/srv/server/ssm_install.go | 5 +---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/srv/server/ec2_watcher.go b/lib/srv/server/ec2_watcher.go index 3d2e7679d8dc0..c096393ac832c 100644 --- a/lib/srv/server/ec2_watcher.go +++ b/lib/srv/server/ec2_watcher.go @@ -272,6 +272,9 @@ const ( ) // awsEC2APIChunkSize is the max number of instances SSM will send commands to at a time +// This is used for limiting the number of instances for API Calls: +// ssm:SendCommand only accepts between 0 and 50. +// ssm:DescribeInstanceInformation only accepts between 5 and 50. const awsEC2APIChunkSize = 50 func newEC2InstanceFetcher(cfg ec2FetcherConfig) *ec2InstanceFetcher { diff --git a/lib/srv/server/ssm_install.go b/lib/srv/server/ssm_install.go index 137a8b0573c89..395a7c2874c1a 100644 --- a/lib/srv/server/ssm_install.go +++ b/lib/srv/server/ssm_install.go @@ -210,14 +210,11 @@ type instanceIDsSSMState struct { func (si *SSMInstaller) describeSSMAgentState(ctx context.Context, req SSMRunRequest, allInstanceIDs []string) (*instanceIDsSSMState, error) { ret := &instanceIDsSSMState{} - const defaultMaxResults = 10 - ssmInstancesInfo, err := req.SSM.DescribeInstanceInformationWithContext(ctx, &ssm.DescribeInstanceInformationInput{ Filters: []*ssm.InstanceInformationStringFilter{ {Key: aws.String(ssm.InstanceInformationFilterKeyInstanceIds), Values: aws.StringSlice(allInstanceIDs)}, }, - // AWS returns an error if MaxResults is less than 5. - MaxResults: aws.Int64(max(defaultMaxResults, int64(len(allInstanceIDs)))), + MaxResults: aws.Int64(awsEC2APIChunkSize), }) if err != nil { return nil, trace.Wrap(awslib.ConvertRequestFailureError(err))