Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve messages when EC2 Auto Discover with SSM fails #41465

Merged
merged 3 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/pages/auto-discovery/servers/ec2-discovery.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ AWS
"Effect": "Allow",
"Action": [
"ec2:DescribeInstances",
"ssm:DescribeInstanceInformation",
"ssm:GetCommandInvocation",
"ssm:SendCommand"
],
Expand All @@ -183,6 +184,7 @@ AWS
"Effect": "Allow",
"Action": [
"ec2:DescribeInstances",
"ssm:DescribeInstanceInformation",
"ssm:GetCommandInvocation",
"ssm:SendCommand"
],
Expand Down
3 changes: 2 additions & 1 deletion lib/cloud/aws/policy_statements.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,9 @@ func StatementForEC2SSMAutoDiscover() *Statement {
Effect: EffectAllow,
Actions: []string{
"ec2:DescribeInstances",
"ssm:SendCommand",
"ssm:DescribeInstanceInformation",
"ssm:GetCommandInvocation",
"ssm:SendCommand",
},
Resources: allResources,
}
Expand Down
8 changes: 6 additions & 2 deletions lib/configurators/aws/aws_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -582,8 +582,10 @@ func TestAWSIAMDocuments(t *testing.T) {
Effect: "Allow",
Actions: []string{
"ec2:DescribeInstances",
"ssm:DescribeInstanceInformation",
"ssm:GetCommandInvocation",
"ssm:SendCommand"},
"ssm:SendCommand",
},
Resources: []string{"*"},
},
},
Expand All @@ -592,8 +594,10 @@ func TestAWSIAMDocuments(t *testing.T) {
Effect: "Allow",
Actions: []string{
"ec2:DescribeInstances",
"ssm:DescribeInstanceInformation",
"ssm:GetCommandInvocation",
"ssm:SendCommand"},
"ssm:SendCommand",
},
Resources: []string{"*"},
},
},
Expand Down
5 changes: 4 additions & 1 deletion lib/integrations/awsoidc/ec2_ssm_iam_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,14 @@ func NewEC2SSMConfigureClient(ctx context.Context, region string) (EC2SSMConfigu
}

// ConfigureEC2SSM creates the required resources in AWS to enable EC2 Auto Discover using script mode..
// It creates an embedded policy with the following permissions:
// It creates an inline policy with the following permissions:
//
// Action: List EC2 instances where teleport is going to be installed.
// - ec2:DescribeInstances
//
// Action: Get SSM Agent Status
// - ssm:DescribeInstanceInformation
//
// Action: Run a command and get its output.
// - ssm:SendCommand
// - ssm:GetCommandInvocation
Expand Down
7 changes: 6 additions & 1 deletion lib/srv/discovery/discovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -460,9 +460,14 @@ func (s *Server) initAWSWatchers(matchers []types.AWSMatcher) error {
s.caRotationCh = make(chan []types.Server)

if s.ec2Installer == nil {
s.ec2Installer = server.NewSSMInstaller(server.SSMInstallerConfig{
ec2installer, err := server.NewSSMInstaller(server.SSMInstallerConfig{
Emitter: s.Emitter,
})
if err != nil {
return trace.Wrap(err)
}

s.ec2Installer = ec2installer
}

lr, err := newLabelReconciler(&labelReconcilerConfig{
Expand Down
3 changes: 3 additions & 0 deletions lib/srv/server/ec2_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,9 @@ const (
)

// awsEC2APIChunkSize is the max number of instances SSM will send commands to at a time
// This is used for limiting the number of instances for API Calls:
// ssm:SendCommand only accepts between 0 and 50.
// ssm:DescribeInstanceInformation only accepts between 5 and 50.
const awsEC2APIChunkSize = 50

func newEC2InstanceFetcher(cfg ec2FetcherConfig) *ec2InstanceFetcher {
Expand Down
153 changes: 149 additions & 4 deletions lib/srv/server/ssm_install.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ package server
import (
"context"
"errors"
"log/slog"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
Expand All @@ -30,7 +31,9 @@ import (
"github.com/gravitational/trace"
"golang.org/x/sync/errgroup"

"github.com/gravitational/teleport"
apievents "github.com/gravitational/teleport/api/types/events"
awslib "github.com/gravitational/teleport/lib/cloud/aws"
libevents "github.com/gravitational/teleport/lib/events"
)

Expand All @@ -39,6 +42,9 @@ import (
type SSMInstallerConfig struct {
// Emitter is an events emitter.
Emitter apievents.Emitter
// Logger is used to log messages.
// Optional. A logger is created if one not supplied.
Logger *slog.Logger
}

// SSMInstaller handles running SSM commands that install Teleport on EC2 instances.
Expand All @@ -65,11 +71,27 @@ type SSMRunRequest struct {
AccountID string
}

// CheckAndSetDefaults ensures the emitter is present and creates a default logger if one is not provided.
func (c *SSMInstallerConfig) checkAndSetDefaults() error {
if c.Emitter == nil {
return trace.BadParameter("missing audit event emitter")
}

if c.Logger == nil {
c.Logger = slog.Default().With(teleport.ComponentKey, "ssminstaller")
}

return nil
}

// NewSSMInstaller returns a new instance of the SSM installer that installs Teleport on EC2 instances.
func NewSSMInstaller(cfg SSMInstallerConfig) *SSMInstaller {
func NewSSMInstaller(cfg SSMInstallerConfig) (*SSMInstaller, error) {
if err := cfg.checkAndSetDefaults(); err != nil {
return nil, trace.Wrap(err)
}
return &SSMInstaller{
SSMInstallerConfig: cfg,
}
}, nil
}

// Run executes the SSM document and then blocks until the command has completed.
Expand All @@ -84,9 +106,34 @@ func (si *SSMInstaller) Run(ctx context.Context, req SSMRunRequest) error {
params[k] = []*string{aws.String(v)}
}

validInstances := ids
instancesState, err := si.describeSSMAgentState(ctx, req, ids)
switch {
case trace.IsAccessDenied(err):
// describeSSMAgentState uses `ssm:DescribeInstanceInformation` to gather all the instances information.
// Previous Docs versions (pre-v16) did not ask for that permission.
// If the IAM role does not have access to that action, an Access Denied is returned here.
// The process continues but the user is warned that they should add that permission to get better diagnostics.
si.Logger.WarnContext(ctx,
"Add ssm:DescribeInstanceInformation action to IAM Role to improve diagnostics of EC2 Teleport installation failures",
"error", err)

case err != nil:
return trace.Wrap(err)

default:
if err := si.emitInvalidInstanceEvents(ctx, req, instancesState); err != nil {
si.Logger.ErrorContext(ctx,
"Failed to emit invalid instances",
"instances", instancesState,
"error", err)
}
validInstances = instancesState.valid
}

output, err := req.SSM.SendCommandWithContext(ctx, &ssm.SendCommandInput{
DocumentName: aws.String(req.DocumentName),
InstanceIds: aws.StringSlice(ids),
InstanceIds: aws.StringSlice(validInstances),
Parameters: params,
})
if err != nil {
Expand All @@ -95,7 +142,7 @@ func (si *SSMInstaller) Run(ctx context.Context, req SSMRunRequest) error {

g, ctx := errgroup.WithContext(ctx)
g.SetLimit(10)
for _, inst := range ids {
for _, inst := range validInstances {
inst := inst
g.Go(func() error {
return trace.Wrap(si.checkCommand(ctx, req, output.Command.CommandId, &inst))
Expand All @@ -104,6 +151,104 @@ func (si *SSMInstaller) Run(ctx context.Context, req SSMRunRequest) error {
return trace.Wrap(g.Wait())
}

func invalidSSMInstanceEvent(accountID, region, instanceID, status string) apievents.SSMRun {
return apievents.SSMRun{
Metadata: apievents.Metadata{
Type: libevents.SSMRunEvent,
Code: libevents.SSMRunFailCode,
},
CommandID: "no-command",
AccountID: accountID,
Region: region,
ExitCode: -1,
InstanceID: instanceID,
Status: status,
}
}

func (si *SSMInstaller) emitInvalidInstanceEvents(ctx context.Context, req SSMRunRequest, instanceIDsState *instanceIDsSSMState) error {
var errs []error
for _, instanceID := range instanceIDsState.missing {
event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID,
"EC2 Instance is not registered in SSM. Make sure that the instance has AmazonSSMManagedInstanceCore policy assigned.",
)
if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil {
errs = append(errs, trace.Wrap(err))
}
}

for _, instanceID := range instanceIDsState.connectionLost {
event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID,
"SSM Agent in EC2 Instance is not connecting to SSM Service. Restart or reinstall the SSM service. See https://docs.aws.amazon.com/systems-manager/latest/userguide/ami-preinstalled-agent.html#verify-ssm-agent-status for more details.",
)
if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil {
errs = append(errs, trace.Wrap(err))
}
}

for _, instanceID := range instanceIDsState.unsupportedOS {
event := invalidSSMInstanceEvent(req.AccountID, req.Region, instanceID,
"EC2 instance is running an unsupported Operating System. Only Linux is supported.",
)
if err := si.Emitter.EmitAuditEvent(ctx, &event); err != nil {
errs = append(errs, trace.Wrap(err))
}
}

return errors.Join(errs...)
}

// instanceIDsSSMState contains a list of EC2 Instance IDs for a given state.
type instanceIDsSSMState struct {
valid []string
codingllama marked this conversation as resolved.
Show resolved Hide resolved
missing []string
connectionLost []string
unsupportedOS []string
}

// describeSSMAgentState returns the instanceIDsSSMState for all the instances.
func (si *SSMInstaller) describeSSMAgentState(ctx context.Context, req SSMRunRequest, allInstanceIDs []string) (*instanceIDsSSMState, error) {
ret := &instanceIDsSSMState{}

ssmInstancesInfo, err := req.SSM.DescribeInstanceInformationWithContext(ctx, &ssm.DescribeInstanceInformationInput{
Filters: []*ssm.InstanceInformationStringFilter{
{Key: aws.String(ssm.InstanceInformationFilterKeyInstanceIds), Values: aws.StringSlice(allInstanceIDs)},
},
MaxResults: aws.Int64(awsEC2APIChunkSize),
})
if err != nil {
return nil, trace.Wrap(awslib.ConvertRequestFailureError(err))
}

instanceStateByInstanceID := make(map[string]*ssm.InstanceInformation, len(ssmInstancesInfo.InstanceInformationList))
for _, instanceState := range ssmInstancesInfo.InstanceInformationList {
// instanceState.InstanceId always has the InstanceID value according to AWS Docs.
instanceStateByInstanceID[aws.StringValue(instanceState.InstanceId)] = instanceState
marcoandredinis marked this conversation as resolved.
Show resolved Hide resolved
}

for _, instanceID := range allInstanceIDs {
instanceState, found := instanceStateByInstanceID[instanceID]
if !found {
ret.missing = append(ret.missing, instanceID)
continue
}

if aws.StringValue(instanceState.PingStatus) == ssm.PingStatusConnectionLost {
ret.connectionLost = append(ret.connectionLost, instanceID)
continue
}

if aws.StringValue(instanceState.PlatformType) != ssm.PlatformTypeLinux {
ret.unsupportedOS = append(ret.unsupportedOS, instanceID)
continue
}

ret.valid = append(ret.valid, instanceID)
}

return ret, nil
}

// skipAWSWaitErr is used to ignore the error returned from
// WaitUntilCommandExecutedWithContext if it is a resource not ready
// code as this can represent one of several different errors which
Expand Down
Loading
Loading