Skip to content

Commit

Permalink
Improve messages when EC2 Auto Discover with SSM fails (#41465)
Browse files Browse the repository at this point in the history
* Improve unavailable messages when EC2 Auto Discover with SSM fails

EC2 Auto Discover calls ssm:SendCommand to install teleport in a set of
EC2 Instances.
This requires that the SSM Agent to be running and reporting back to the
AWS SSM Service.

This PR adds a new API call which is used to query the current status of
the SSM agent in the target EC2 instance.

If the agent did not register, is not currently online or the EC2
instance is running an unsupported operating system, an error is
reported.
The specific error is returned and the user can see this in the Audit
Log.

As an example, let's say we have 3 instances:
- i-A: missing IAM permissions to connect to SSM
- i-B: SSM ran but is now unhealthy
- i-C: instance is running Windows

Previously we had the following observable output after running the
Discovery Service:
i-A (missing iam permissions)
Log message with stack trace indicating that "instance is not valid for
account" with link for further troubleshoot.
No audit log was emitted

i-B (SSM is unhealthy)
No app log, but audit log with status:failed and exit_code:-1

i-C (windows instance)
No app log, but audit log with status:success and exit_code:0

After this PR, the following is reported:
i-A (missing iam permissions)
No app log
Audit log with a clear status message (see code/tests)

i-B (SSM is unhealthy)
No app log
Audit log with a clear status message (see code/tests)

i-C (windows instance)
No app log
Audit log with a clear status message (see code/tests)

If any other error happens, it will still be reported in the generic
handler for the SendCommand API call.

Given this is a new API call, if the Role does not allow it, a log
warning is emitted and the behavior is the same as before.

* best effort on emitting events

* improve maxresults param
  • Loading branch information
marcoandredinis authored May 16, 2024
1 parent 8587249 commit cb6b769
Show file tree
Hide file tree
Showing 8 changed files with 287 additions and 14 deletions.
2 changes: 2 additions & 0 deletions docs/pages/auto-discovery/servers/ec2-discovery.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ AWS
"Effect": "Allow",
"Action": [
"ec2:DescribeInstances",
"ssm:DescribeInstanceInformation",
"ssm:GetCommandInvocation",
"ssm:SendCommand"
],
Expand All @@ -183,6 +184,7 @@ AWS
"Effect": "Allow",
"Action": [
"ec2:DescribeInstances",
"ssm:DescribeInstanceInformation",
"ssm:GetCommandInvocation",
"ssm:SendCommand"
],
Expand Down
3 changes: 2 additions & 1 deletion lib/cloud/aws/policy_statements.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,9 @@ func StatementForEC2SSMAutoDiscover() *Statement {
Effect: EffectAllow,
Actions: []string{
"ec2:DescribeInstances",
"ssm:SendCommand",
"ssm:DescribeInstanceInformation",
"ssm:GetCommandInvocation",
"ssm:SendCommand",
},
Resources: allResources,
}
Expand Down
8 changes: 6 additions & 2 deletions lib/configurators/aws/aws_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -582,8 +582,10 @@ func TestAWSIAMDocuments(t *testing.T) {
Effect: "Allow",
Actions: []string{
"ec2:DescribeInstances",
"ssm:DescribeInstanceInformation",
"ssm:GetCommandInvocation",
"ssm:SendCommand"},
"ssm:SendCommand",
},
Resources: []string{"*"},
},
},
Expand All @@ -592,8 +594,10 @@ func TestAWSIAMDocuments(t *testing.T) {
Effect: "Allow",
Actions: []string{
"ec2:DescribeInstances",
"ssm:DescribeInstanceInformation",
"ssm:GetCommandInvocation",
"ssm:SendCommand"},
"ssm:SendCommand",
},
Resources: []string{"*"},
},
},
Expand Down
5 changes: 4 additions & 1 deletion lib/integrations/awsoidc/ec2_ssm_iam_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,14 @@ func NewEC2SSMConfigureClient(ctx context.Context, region string) (EC2SSMConfigu
}

// ConfigureEC2SSM creates the required resources in AWS to enable EC2 Auto Discover using script mode..
// It creates an embedded policy with the following permissions:
// It creates an inline policy with the following permissions:
//
// Action: List EC2 instances where teleport is going to be installed.
// - ec2:DescribeInstances
//
// Action: Get SSM Agent Status
// - ssm:DescribeInstanceInformation
//
// Action: Run a command and get its output.
// - ssm:SendCommand
// - ssm:GetCommandInvocation
Expand Down
7 changes: 6 additions & 1 deletion lib/srv/discovery/discovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -460,9 +460,14 @@ func (s *Server) initAWSWatchers(matchers []types.AWSMatcher) error {
s.caRotationCh = make(chan []types.Server)

if s.ec2Installer == nil {
s.ec2Installer = server.NewSSMInstaller(server.SSMInstallerConfig{
ec2installer, err := server.NewSSMInstaller(server.SSMInstallerConfig{
Emitter: s.Emitter,
})
if err != nil {
return trace.Wrap(err)
}

s.ec2Installer = ec2installer
}

lr, err := newLabelReconciler(&labelReconcilerConfig{
Expand Down
3 changes: 3 additions & 0 deletions lib/srv/server/ec2_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,9 @@ const (
)

// awsEC2APIChunkSize is the max number of instances SSM will send commands to at a time
// This is used for limiting the number of instances for API Calls:
// ssm:SendCommand only accepts between 0 and 50.
// ssm:DescribeInstanceInformation only accepts between 5 and 50.
const awsEC2APIChunkSize = 50

func newEC2InstanceFetcher(cfg ec2FetcherConfig) *ec2InstanceFetcher {
Expand Down
153 changes: 149 additions & 4 deletions lib/srv/server/ssm_install.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ package server
import (
"context"
"errors"
"log/slog"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
Expand All @@ -30,7 +31,9 @@ import (
"github.com/gravitational/trace"
"golang.org/x/sync/errgroup"

"github.com/gravitational/teleport"
apievents "github.com/gravitational/teleport/api/types/events"
awslib "github.com/gravitational/teleport/lib/cloud/aws"
libevents "github.com/gravitational/teleport/lib/events"
)

Expand All @@ -39,6 +42,9 @@ import (
type SSMInstallerConfig struct {
// Emitter is an events emitter.
Emitter apievents.Emitter
// Logger is used to log messages.
// Optional. A logger is created if one not supplied.
Logger *slog.Logger
}

// SSMInstaller handles running SSM commands that install Teleport on EC2 instances.
Expand All @@ -65,11 +71,27 @@ type SSMRunRequest struct {
AccountID string
}

// CheckAndSetDefaults ensures the emitter is present and creates a default logger if one is not provided.
func (c *SSMInstallerConfig) checkAndSetDefaults() error {
if c.Emitter == nil {
return trace.BadParameter("missing audit event emitter")
}

if c.Logger == nil {
c.Logger = slog.Default().With(teleport.ComponentKey, "ssminstaller")
}

return nil
}

// NewSSMInstaller returns a new instance of the SSM installer that installs Teleport on EC2 instances.
func NewSSMInstaller(cfg SSMInstallerConfig) *SSMInstaller {
func NewSSMInstaller(cfg SSMInstallerConfig) (*SSMInstaller, error) {
if err := cfg.checkAndSetDefaults(); err != nil {
return nil, trace.Wrap(err)
}
return &SSMInstaller{
SSMInstallerConfig: cfg,
}
}, nil
}

// Run executes the SSM document and then blocks until the command has completed.
Expand All @@ -84,9 +106,34 @@ func (si *SSMInstaller) Run(ctx context.Context, req SSMRunRequest) error {
params[k] = []*string{aws.String(v)}
}

validInstances := ids
instancesState, err := si.describeSSMAgentState(ctx, req, ids)
switch {
case trace.IsAccessDenied(err):
// describeSSMAgentState uses `ssm:DescribeInstanceInformation` to gather all the instances information.
// Previous Docs versions (pre-v16) did not ask for that permission.
// If the IAM role does not have access to that action, an Access Denied is returned here.
// The process continues but the user is warned that they should add that permission to get better diagnostics.
si.Logger.WarnContext(ctx,
"Add ssm:DescribeInstanceInformation action to IAM Role to improve diagnostics of EC2 Teleport installation failures",
"error", err)

case err != nil:
return trace.Wrap(err)

default:
if err := si.emitInvalidInstanceEvents(ctx, req, instancesState); err != nil {
si.Logger.ErrorContext(ctx,
"Failed to emit invalid instances",
"instances", instancesState,
"error", err)
}
validInstances = instancesState.valid
}

output, err := req.SSM.SendCommandWithContext(ctx, &ssm.SendCommandInput{
DocumentName: aws.String(req.DocumentName),
InstanceIds: aws.StringSlice(ids),
InstanceIds: aws.StringSlice(validInstances),
Parameters: params,
})
if err != nil {
Expand All @@ -95,7 +142,7 @@ func (si *SSMInstaller) Run(ctx context.Context, req SSMRunRequest) error {

g, ctx := errgroup.WithContext(ctx)
g.SetLimit(10)
for _, inst := range ids {
for _, inst := range validInstances {
inst := inst
g.Go(func() error {
return trace.Wrap(si.checkCommand(ctx, req, output.Command.CommandId, &inst))
Expand All @@ -104,6 +151,104 @@ func (si *SSMInstaller) Run(ctx context.Context, req SSMRunRequest) error {
return trace.Wrap(g.Wait())
}

func invalidSSMInstanceEvent(accountID, region, instanceID, status string) apievents.SSMRun {
return apievents.SSMRun{
Metadata: apievents.Metadata{
Type: libevents.SSMRunEvent,
Code: libevents.SSMRunFailCode,
},
CommandID: "no-command",
AccountID: accountID,
Region: region,
ExitCode: -1,
InstanceID: instanceID,
Status: status,
}
}

func (si *SSMInstaller) emitInvalidInstanceEvents(ctx context.Context, req SSMRunRequest, instanceIDsState *instanceIDsSSMState) error {
var errs []error
for _, instanceID := range instanceIDsState.missing {
event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID,
"EC2 Instance is not registered in SSM. Make sure that the instance has AmazonSSMManagedInstanceCore policy assigned.",
)
if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil {
errs = append(errs, trace.Wrap(err))
}
}

for _, instanceID := range instanceIDsState.connectionLost {
event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID,
"SSM Agent in EC2 Instance is not connecting to SSM Service. Restart or reinstall the SSM service. See https://docs.aws.amazon.com/systems-manager/latest/userguide/ami-preinstalled-agent.html#verify-ssm-agent-status for more details.",
)
if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil {
errs = append(errs, trace.Wrap(err))
}
}

for _, instanceID := range instanceIDsState.unsupportedOS {
event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID,
"EC2 instance is running an unsupported Operating System. Only Linux is supported.",
)
if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil {
errs = append(errs, trace.Wrap(err))
}
}

return errors.Join(errs...)
}

// instanceIDsSSMState contains a list of EC2 Instance IDs for a given state.
type instanceIDsSSMState struct {
valid []string
missing []string
connectionLost []string
unsupportedOS []string
}

// describeSSMAgentState returns the instanceIDsSSMState for all the instances.
func (si *SSMInstaller) describeSSMAgentState(ctx context.Context, req SSMRunRequest, allInstanceIDs []string) (*instanceIDsSSMState, error) {
ret := &instanceIDsSSMState{}

ssmInstancesInfo, err := req.SSM.DescribeInstanceInformationWithContext(ctx, &ssm.DescribeInstanceInformationInput{
Filters: []*ssm.InstanceInformationStringFilter{
{Key: aws.String(ssm.InstanceInformationFilterKeyInstanceIds), Values: aws.StringSlice(allInstanceIDs)},
},
MaxResults: aws.Int64(awsEC2APIChunkSize),
})
if err != nil {
return nil, trace.Wrap(awslib.ConvertRequestFailureError(err))
}

instanceStateByInstanceID := make(map[string]*ssm.InstanceInformation, len(ssmInstancesInfo.InstanceInformationList))
for _, instanceState := range ssmInstancesInfo.InstanceInformationList {
// instanceState.InstanceId always has the InstanceID value according to AWS Docs.
instanceStateByInstanceID[aws.StringValue(instanceState.InstanceId)] = instanceState
}

for _, instanceID := range allInstanceIDs {
instanceState, found := instanceStateByInstanceID[instanceID]
if !found {
ret.missing = append(ret.missing, instanceID)
continue
}

if aws.StringValue(instanceState.PingStatus) == ssm.PingStatusConnectionLost {
ret.connectionLost = append(ret.connectionLost, instanceID)
continue
}

if aws.StringValue(instanceState.PlatformType) != ssm.PlatformTypeLinux {
ret.unsupportedOS = append(ret.unsupportedOS, instanceID)
continue
}

ret.valid = append(ret.valid, instanceID)
}

return ret, nil
}

// skipAWSWaitErr is used to ignore the error returned from
// WaitUntilCommandExecutedWithContext if it is a resource not ready
// code as this can represent one of several different errors which
Expand Down
Loading

0 comments on commit cb6b769

Please sign in to comment.