From cb6b76959cab75b81f625c27647276cca0214073 Mon Sep 17 00:00:00 2001 From: Marco Dinis Date: Thu, 16 May 2024 17:24:53 +0100 Subject: [PATCH] Improve messages when EC2 Auto Discover with SSM fails (#41465) * Improve unavailable messages when EC2 Auto Discover with SSM fails EC2 Auto Discover calls ssm:SendCommand to install teleport in a set of EC2 Instances. This requires that the SSM Agent to be running and reporting back to the AWS SSM Service. This PR adds a new API call which is used to query the current status of the SSM agent in the target EC2 instance. If the agent did not register, is not currently online or the EC2 instance is running an unsupported operating system, an error is reported. The specific error is returned and the user can see this in the Audit Log. As an example, let's say we have 3 instances: - i-A: missing IAM permissions to connect to SSM - i-B: SSM ran but is now unhealthy - i-C: instance is running Windows Previously we had the following observable output after running the Discovery Service: i-A (missing iam permissions) Log message with stack trace indicating that "instance is not valid for account" with link for further troubleshoot. No audit log was emitted i-B (SSM is unhealthy) No app log, but audit log with status:failed and exit_code:-1 i-C (windows instance) No app log, but audit log with status:success and exit_code:0 After this PR, the following is reported: i-A (missing iam permissions) No app log Audit log with a clear status message (see code/tests) i-B (SSM is unhealthy) No app log Audit log with a clear status message (see code/tests) i-C (windows instance) No app log Audit log with a clear status message (see code/tests) If any other error happens, it will still be reported in the generic handler for the SendCommand API call. Given this is a new API call, if the Role does not allow it, a log warning is emitted and the behavior is the same as before. * best effort on emitting events * improve maxresults param --- .../auto-discovery/servers/ec2-discovery.mdx | 2 + lib/cloud/aws/policy_statements.go | 3 +- lib/configurators/aws/aws_test.go | 8 +- .../awsoidc/ec2_ssm_iam_config.go | 5 +- lib/srv/discovery/discovery.go | 7 +- lib/srv/server/ec2_watcher.go | 3 + lib/srv/server/ssm_install.go | 153 +++++++++++++++++- lib/srv/server/ssm_install_test.go | 120 +++++++++++++- 8 files changed, 287 insertions(+), 14 deletions(-) diff --git a/docs/pages/auto-discovery/servers/ec2-discovery.mdx b/docs/pages/auto-discovery/servers/ec2-discovery.mdx index d58dc540aaf9d..fa421206e8865 100644 --- a/docs/pages/auto-discovery/servers/ec2-discovery.mdx +++ b/docs/pages/auto-discovery/servers/ec2-discovery.mdx @@ -165,6 +165,7 @@ AWS "Effect": "Allow", "Action": [ "ec2:DescribeInstances", + "ssm:DescribeInstanceInformation", "ssm:GetCommandInvocation", "ssm:SendCommand" ], @@ -183,6 +184,7 @@ AWS "Effect": "Allow", "Action": [ "ec2:DescribeInstances", + "ssm:DescribeInstanceInformation", "ssm:GetCommandInvocation", "ssm:SendCommand" ], diff --git a/lib/cloud/aws/policy_statements.go b/lib/cloud/aws/policy_statements.go index 557ae72839430..29fc489ea238b 100644 --- a/lib/cloud/aws/policy_statements.go +++ b/lib/cloud/aws/policy_statements.go @@ -145,8 +145,9 @@ func StatementForEC2SSMAutoDiscover() *Statement { Effect: EffectAllow, Actions: []string{ "ec2:DescribeInstances", - "ssm:SendCommand", + "ssm:DescribeInstanceInformation", "ssm:GetCommandInvocation", + "ssm:SendCommand", }, Resources: allResources, } diff --git a/lib/configurators/aws/aws_test.go b/lib/configurators/aws/aws_test.go index af0590d8362d8..c61609aa44601 100644 --- a/lib/configurators/aws/aws_test.go +++ b/lib/configurators/aws/aws_test.go @@ -582,8 +582,10 @@ func TestAWSIAMDocuments(t *testing.T) { Effect: "Allow", Actions: []string{ "ec2:DescribeInstances", + "ssm:DescribeInstanceInformation", "ssm:GetCommandInvocation", - "ssm:SendCommand"}, + "ssm:SendCommand", + }, Resources: []string{"*"}, }, }, @@ -592,8 +594,10 @@ func TestAWSIAMDocuments(t *testing.T) { Effect: "Allow", Actions: []string{ "ec2:DescribeInstances", + "ssm:DescribeInstanceInformation", "ssm:GetCommandInvocation", - "ssm:SendCommand"}, + "ssm:SendCommand", + }, Resources: []string{"*"}, }, }, diff --git a/lib/integrations/awsoidc/ec2_ssm_iam_config.go b/lib/integrations/awsoidc/ec2_ssm_iam_config.go index 4dc82d3cd03d5..95f5cab739f9e 100644 --- a/lib/integrations/awsoidc/ec2_ssm_iam_config.go +++ b/lib/integrations/awsoidc/ec2_ssm_iam_config.go @@ -124,11 +124,14 @@ func NewEC2SSMConfigureClient(ctx context.Context, region string) (EC2SSMConfigu } // ConfigureEC2SSM creates the required resources in AWS to enable EC2 Auto Discover using script mode.. -// It creates an embedded policy with the following permissions: +// It creates an inline policy with the following permissions: // // Action: List EC2 instances where teleport is going to be installed. // - ec2:DescribeInstances // +// Action: Get SSM Agent Status +// - ssm:DescribeInstanceInformation +// // Action: Run a command and get its output. // - ssm:SendCommand // - ssm:GetCommandInvocation diff --git a/lib/srv/discovery/discovery.go b/lib/srv/discovery/discovery.go index b4742cb54721e..6fec63acd59de 100644 --- a/lib/srv/discovery/discovery.go +++ b/lib/srv/discovery/discovery.go @@ -460,9 +460,14 @@ func (s *Server) initAWSWatchers(matchers []types.AWSMatcher) error { s.caRotationCh = make(chan []types.Server) if s.ec2Installer == nil { - s.ec2Installer = server.NewSSMInstaller(server.SSMInstallerConfig{ + ec2installer, err := server.NewSSMInstaller(server.SSMInstallerConfig{ Emitter: s.Emitter, }) + if err != nil { + return trace.Wrap(err) + } + + s.ec2Installer = ec2installer } lr, err := newLabelReconciler(&labelReconcilerConfig{ diff --git a/lib/srv/server/ec2_watcher.go b/lib/srv/server/ec2_watcher.go index 3d2e7679d8dc0..c096393ac832c 100644 --- a/lib/srv/server/ec2_watcher.go +++ b/lib/srv/server/ec2_watcher.go @@ -272,6 +272,9 @@ const ( ) // awsEC2APIChunkSize is the max number of instances SSM will send commands to at a time +// This is used for limiting the number of instances for API Calls: +// ssm:SendCommand only accepts between 0 and 50. +// ssm:DescribeInstanceInformation only accepts between 5 and 50. const awsEC2APIChunkSize = 50 func newEC2InstanceFetcher(cfg ec2FetcherConfig) *ec2InstanceFetcher { diff --git a/lib/srv/server/ssm_install.go b/lib/srv/server/ssm_install.go index 7f94c6113fec7..395a7c2874c1a 100644 --- a/lib/srv/server/ssm_install.go +++ b/lib/srv/server/ssm_install.go @@ -21,6 +21,7 @@ package server import ( "context" "errors" + "log/slog" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/awserr" @@ -30,7 +31,9 @@ import ( "github.com/gravitational/trace" "golang.org/x/sync/errgroup" + "github.com/gravitational/teleport" apievents "github.com/gravitational/teleport/api/types/events" + awslib "github.com/gravitational/teleport/lib/cloud/aws" libevents "github.com/gravitational/teleport/lib/events" ) @@ -39,6 +42,9 @@ import ( type SSMInstallerConfig struct { // Emitter is an events emitter. Emitter apievents.Emitter + // Logger is used to log messages. + // Optional. A logger is created if one not supplied. + Logger *slog.Logger } // SSMInstaller handles running SSM commands that install Teleport on EC2 instances. @@ -65,11 +71,27 @@ type SSMRunRequest struct { AccountID string } +// CheckAndSetDefaults ensures the emitter is present and creates a default logger if one is not provided. +func (c *SSMInstallerConfig) checkAndSetDefaults() error { + if c.Emitter == nil { + return trace.BadParameter("missing audit event emitter") + } + + if c.Logger == nil { + c.Logger = slog.Default().With(teleport.ComponentKey, "ssminstaller") + } + + return nil +} + // NewSSMInstaller returns a new instance of the SSM installer that installs Teleport on EC2 instances. -func NewSSMInstaller(cfg SSMInstallerConfig) *SSMInstaller { +func NewSSMInstaller(cfg SSMInstallerConfig) (*SSMInstaller, error) { + if err := cfg.checkAndSetDefaults(); err != nil { + return nil, trace.Wrap(err) + } return &SSMInstaller{ SSMInstallerConfig: cfg, - } + }, nil } // Run executes the SSM document and then blocks until the command has completed. @@ -84,9 +106,34 @@ func (si *SSMInstaller) Run(ctx context.Context, req SSMRunRequest) error { params[k] = []*string{aws.String(v)} } + validInstances := ids + instancesState, err := si.describeSSMAgentState(ctx, req, ids) + switch { + case trace.IsAccessDenied(err): + // describeSSMAgentState uses `ssm:DescribeInstanceInformation` to gather all the instances information. + // Previous Docs versions (pre-v16) did not ask for that permission. + // If the IAM role does not have access to that action, an Access Denied is returned here. + // The process continues but the user is warned that they should add that permission to get better diagnostics. + si.Logger.WarnContext(ctx, + "Add ssm:DescribeInstanceInformation action to IAM Role to improve diagnostics of EC2 Teleport installation failures", + "error", err) + + case err != nil: + return trace.Wrap(err) + + default: + if err := si.emitInvalidInstanceEvents(ctx, req, instancesState); err != nil { + si.Logger.ErrorContext(ctx, + "Failed to emit invalid instances", + "instances", instancesState, + "error", err) + } + validInstances = instancesState.valid + } + output, err := req.SSM.SendCommandWithContext(ctx, &ssm.SendCommandInput{ DocumentName: aws.String(req.DocumentName), - InstanceIds: aws.StringSlice(ids), + InstanceIds: aws.StringSlice(validInstances), Parameters: params, }) if err != nil { @@ -95,7 +142,7 @@ func (si *SSMInstaller) Run(ctx context.Context, req SSMRunRequest) error { g, ctx := errgroup.WithContext(ctx) g.SetLimit(10) - for _, inst := range ids { + for _, inst := range validInstances { inst := inst g.Go(func() error { return trace.Wrap(si.checkCommand(ctx, req, output.Command.CommandId, &inst)) @@ -104,6 +151,104 @@ func (si *SSMInstaller) Run(ctx context.Context, req SSMRunRequest) error { return trace.Wrap(g.Wait()) } +func invalidSSMInstanceEvent(accountID, region, instanceID, status string) apievents.SSMRun { + return apievents.SSMRun{ + Metadata: apievents.Metadata{ + Type: libevents.SSMRunEvent, + Code: libevents.SSMRunFailCode, + }, + CommandID: "no-command", + AccountID: accountID, + Region: region, + ExitCode: -1, + InstanceID: instanceID, + Status: status, + } +} + +func (si *SSMInstaller) emitInvalidInstanceEvents(ctx context.Context, req SSMRunRequest, instanceIDsState *instanceIDsSSMState) error { + var errs []error + for _, instanceID := range instanceIDsState.missing { + event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID, + "EC2 Instance is not registered in SSM. Make sure that the instance has AmazonSSMManagedInstanceCore policy assigned.", + ) + if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil { + errs = append(errs, trace.Wrap(err)) + } + } + + for _, instanceID := range instanceIDsState.connectionLost { + event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID, + "SSM Agent in EC2 Instance is not connecting to SSM Service. Restart or reinstall the SSM service. See https://docs.aws.amazon.com/systems-manager/latest/userguide/ami-preinstalled-agent.html#verify-ssm-agent-status for more details.", + ) + if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil { + errs = append(errs, trace.Wrap(err)) + } + } + + for _, instanceID := range instanceIDsState.unsupportedOS { + event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID, + "EC2 instance is running an unsupported Operating System. Only Linux is supported.", + ) + if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil { + errs = append(errs, trace.Wrap(err)) + } + } + + return errors.Join(errs...) +} + +// instanceIDsSSMState contains a list of EC2 Instance IDs for a given state. +type instanceIDsSSMState struct { + valid []string + missing []string + connectionLost []string + unsupportedOS []string +} + +// describeSSMAgentState returns the instanceIDsSSMState for all the instances. +func (si *SSMInstaller) describeSSMAgentState(ctx context.Context, req SSMRunRequest, allInstanceIDs []string) (*instanceIDsSSMState, error) { + ret := &instanceIDsSSMState{} + + ssmInstancesInfo, err := req.SSM.DescribeInstanceInformationWithContext(ctx, &ssm.DescribeInstanceInformationInput{ + Filters: []*ssm.InstanceInformationStringFilter{ + {Key: aws.String(ssm.InstanceInformationFilterKeyInstanceIds), Values: aws.StringSlice(allInstanceIDs)}, + }, + MaxResults: aws.Int64(awsEC2APIChunkSize), + }) + if err != nil { + return nil, trace.Wrap(awslib.ConvertRequestFailureError(err)) + } + + instanceStateByInstanceID := make(map[string]*ssm.InstanceInformation, len(ssmInstancesInfo.InstanceInformationList)) + for _, instanceState := range ssmInstancesInfo.InstanceInformationList { + // instanceState.InstanceId always has the InstanceID value according to AWS Docs. + instanceStateByInstanceID[aws.StringValue(instanceState.InstanceId)] = instanceState + } + + for _, instanceID := range allInstanceIDs { + instanceState, found := instanceStateByInstanceID[instanceID] + if !found { + ret.missing = append(ret.missing, instanceID) + continue + } + + if aws.StringValue(instanceState.PingStatus) == ssm.PingStatusConnectionLost { + ret.connectionLost = append(ret.connectionLost, instanceID) + continue + } + + if aws.StringValue(instanceState.PlatformType) != ssm.PlatformTypeLinux { + ret.unsupportedOS = append(ret.unsupportedOS, instanceID) + continue + } + + ret.valid = append(ret.valid, instanceID) + } + + return ret, nil +} + // skipAWSWaitErr is used to ignore the error returned from // WaitUntilCommandExecutedWithContext if it is a resource not ready // code as this can represent one of several different errors which diff --git a/lib/srv/server/ssm_install_test.go b/lib/srv/server/ssm_install_test.go index 3114608c5e5cd..ae0cdd2ce8d0a 100644 --- a/lib/srv/server/ssm_install_test.go +++ b/lib/srv/server/ssm_install_test.go @@ -20,6 +20,7 @@ package server import ( "context" + "net/http" "testing" "github.com/aws/aws-sdk-go/aws" @@ -27,6 +28,7 @@ import ( "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/service/ssm" "github.com/aws/aws-sdk-go/service/ssm/ssmiface" + "github.com/google/uuid" "github.com/stretchr/testify/require" "github.com/gravitational/teleport/api/types/events" @@ -35,8 +37,9 @@ import ( type mockSSMClient struct { ssmiface.SSMAPI - commandOutput *ssm.SendCommandOutput - invokeOutput *ssm.GetCommandInvocationOutput + commandOutput *ssm.SendCommandOutput + invokeOutput *ssm.GetCommandInvocationOutput + describeOutput *ssm.DescribeInstanceInformationOutput } func (sm *mockSSMClient) SendCommandWithContext(_ context.Context, input *ssm.SendCommandInput, _ ...request.Option) (*ssm.SendCommandOutput, error) { @@ -47,6 +50,13 @@ func (sm *mockSSMClient) GetCommandInvocationWithContext(_ context.Context, inpu return sm.invokeOutput, nil } +func (sm *mockSSMClient) DescribeInstanceInformationWithContext(_ context.Context, input *ssm.DescribeInstanceInformationInput, _ ...request.Option) (*ssm.DescribeInstanceInformationOutput, error) { + if sm.describeOutput == nil { + return nil, awserr.NewRequestFailure(awserr.New("AccessDeniedException", "message", nil), http.StatusBadRequest, uuid.NewString()) + } + return sm.describeOutput, nil +} + func (sm *mockSSMClient) WaitUntilCommandExecutedWithContext(aws.Context, *ssm.GetCommandInvocationInput, ...request.WaiterOption) error { if aws.StringValue(sm.commandOutput.Command.Status) == ssm.CommandStatusFailed { return awserr.New(request.WaiterResourceNotReadyErrorCode, "err", nil) @@ -152,17 +162,117 @@ func TestSSMInstaller(t *testing.T) { }, }, }, + { + name: "detailed events if ssm:DescribeInstanceInformation is available", + req: SSMRunRequest{ + Instances: []EC2Instance{ + {InstanceID: "instance-id-1"}, + {InstanceID: "instance-id-2"}, + {InstanceID: "instance-id-3"}, + {InstanceID: "instance-id-4"}, + }, + DocumentName: document, + Params: map[string]string{"token": "abcdefg"}, + SSM: &mockSSMClient{ + commandOutput: &ssm.SendCommandOutput{ + Command: &ssm.Command{ + CommandId: aws.String("command-id-1"), + }, + }, + invokeOutput: &ssm.GetCommandInvocationOutput{ + Status: aws.String(ssm.CommandStatusSuccess), + ResponseCode: aws.Int64(0), + }, + describeOutput: &ssm.DescribeInstanceInformationOutput{ + InstanceInformationList: []*ssm.InstanceInformation{ + { + InstanceId: aws.String("instance-id-1"), + PingStatus: aws.String("Online"), + PlatformType: aws.String("Linux"), + }, + { + InstanceId: aws.String("instance-id-2"), + PingStatus: aws.String("ConnectionLost"), + PlatformType: aws.String("Linux"), + }, + { + InstanceId: aws.String("instance-id-3"), + PingStatus: aws.String("Online"), + PlatformType: aws.String("Windows"), + }, + }, + }, + }, + Region: "eu-central-1", + AccountID: "account-id", + }, + conf: SSMInstallerConfig{ + Emitter: &mockEmitter{}, + }, + expectedEvents: []events.AuditEvent{ + &events.SSMRun{ + Metadata: events.Metadata{ + Type: libevent.SSMRunEvent, + Code: libevent.SSMRunSuccessCode, + }, + CommandID: "command-id-1", + InstanceID: "instance-id-1", + AccountID: "account-id", + Region: "eu-central-1", + ExitCode: 0, + Status: ssm.CommandStatusSuccess, + }, + &events.SSMRun{ + Metadata: events.Metadata{ + Type: libevent.SSMRunEvent, + Code: libevent.SSMRunFailCode, + }, + CommandID: "no-command", + InstanceID: "instance-id-2", + AccountID: "account-id", + Region: "eu-central-1", + ExitCode: -1, + Status: "SSM Agent in EC2 Instance is not connecting to SSM Service. Restart or reinstall the SSM service. See https://docs.aws.amazon.com/systems-manager/latest/userguide/ami-preinstalled-agent.html#verify-ssm-agent-status for more details.", + }, + &events.SSMRun{ + Metadata: events.Metadata{ + Type: libevent.SSMRunEvent, + Code: libevent.SSMRunFailCode, + }, + CommandID: "no-command", + InstanceID: "instance-id-3", + AccountID: "account-id", + Region: "eu-central-1", + ExitCode: -1, + Status: "EC2 instance is running an unsupported Operating System. Only Linux is supported.", + }, + &events.SSMRun{ + Metadata: events.Metadata{ + Type: libevent.SSMRunEvent, + Code: libevent.SSMRunFailCode, + }, + CommandID: "no-command", + InstanceID: "instance-id-4", + AccountID: "account-id", + Region: "eu-central-1", + ExitCode: -1, + Status: "EC2 Instance is not registered in SSM. Make sure that the instance has AmazonSSMManagedInstanceCore policy assigned.", + }, + }, + }, // todo(amk): test that incomplete commands eventually return // an event once completed } { t.Run(tc.name, func(t *testing.T) { ctx := context.Background() - inst := NewSSMInstaller(tc.conf) - err := inst.Run(ctx, tc.req) + inst, err := NewSSMInstaller(tc.conf) + require.NoError(t, err) + + err = inst.Run(ctx, tc.req) require.NoError(t, err) emitter := inst.Emitter.(*mockEmitter) - require.Equal(t, tc.expectedEvents, emitter.events) + require.ElementsMatch(t, tc.expectedEvents, emitter.events) }) } }