Skip to content

Commit

Permalink
Increase ingest/publish rate to 1K granules/min
Browse files Browse the repository at this point in the history
In conjunction, decrease discover/queue rate to try to make it roughly
the same rate as ingest/publish so that messages are never in jeopardy
of reaching the retention period of 4 days (AWS allows up to 14 days,
but Cumulus does not allow this to be configured for the background job
queue), no matter how large the collection is.  If we can manage to
have the ingest/publish rate equal to the discover/queue rate, we can
ingest a collection of any size without concern because no messages will
ever be in the queue for more than perhaps a few minutes.

However, since getting both rates to be identical is impossible, it
would be better to err in favor of a slightly greater rate for
ingest/publish because this would never allow messages to remain on the
queue for more than a few moments.  If we were to err slightly on the
other side, where discover/queue is slightly faster, we would ever so
slowly grow the queue.  Given a large enough collection, even this slow
growth would eventually lead to messages exceeding the retention period,
but this would likely require a collection containing several million
granules, perhaps at least 10M.

Also, make error handling a bit more robust to make sure we do
our utmost to retry and if all else fails we make sure we record
the error for Athena queries. There have been recent discrepancies
between the number of errors we see in Athena and the number
of granules with status "failed", where Athena appears to be missing
failures. This may be due to the RecordFailure step not reliably
capturing and writing failures to S3.

Fixes #337
  • Loading branch information
chuckwondo committed Jan 26, 2024
1 parent 8e38c52 commit b747a6a
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 38 deletions.
6 changes: 3 additions & 3 deletions app/stacks/cumulus/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,9 @@ resource "aws_cloudwatch_event_target" "background_job_queue_watcher" {
arn = module.cumulus.sqs2sfThrottle_lambda_function_arn
input = jsonencode(
{
messageLimit = 300
messageLimit = 1000
queueUrl = aws_sqs_queue.background_job_queue.id
timeLimit = 30
timeLimit = 60
}
)
}
Expand Down Expand Up @@ -576,7 +576,7 @@ module "cumulus" {
{
id = "backgroundJobQueue",
url = aws_sqs_queue.background_job_queue.id,
execution_limit = 500
execution_limit = 1000
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
"DiscoverGranulesMap": {
"Type": "Map",
"End": true,
"MaxConcurrency": 7,
"MaxConcurrency": 3,
"ToleratedFailurePercentage": 1,
"ItemReader": {
"Resource": "arn:aws:states:::s3:getObject",
Expand Down Expand Up @@ -157,7 +157,7 @@
},
"QueueGranulesMap": {
"Type": "Map",
"MaxConcurrency": 2,
"MaxConcurrency": 1,
"ToleratedFailurePercentage": 0,
"ItemsPath": "$",
"ResultWriter": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,26 @@
"Retry": [
{
"ErrorEquals": [
"Lambda.Unknown",
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"IntervalSeconds": 4,
"MaxAttempts": 6,
"BackoffRate": 2
"BackoffRate": 2,
"JitterStrategy": "FULL"
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 8,
"MaxAttempts": 3,
"BackoffRate": 2,
"JitterStrategy": "FULL"
}
]
},
Expand Down Expand Up @@ -95,23 +106,26 @@
"Retry": [
{
"ErrorEquals": [
"Lambda.Unknown",
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"IntervalSeconds": 4,
"MaxAttempts": 6,
"BackoffRate": 2
"BackoffRate": 2,
"JitterStrategy": "FULL"
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 2,
"IntervalSeconds": 8,
"MaxAttempts": 3,
"BackoffRate": 2,
"MaxAttempts": 3
"JitterStrategy": "FULL"
}
]
},
Expand Down Expand Up @@ -147,23 +161,26 @@
"Retry": [
{
"ErrorEquals": [
"Lambda.Unknown",
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"IntervalSeconds": 4,
"MaxAttempts": 6,
"BackoffRate": 2
"BackoffRate": 2,
"JitterStrategy": "FULL"
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 2,
"IntervalSeconds": 8,
"MaxAttempts": 3,
"BackoffRate": 2,
"MaxAttempts": 3
"JitterStrategy": "FULL"
}
]
},
Expand Down Expand Up @@ -200,23 +217,26 @@
"Retry": [
{
"ErrorEquals": [
"Lambda.Unknown",
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"IntervalSeconds": 4,
"MaxAttempts": 6,
"BackoffRate": 2
"BackoffRate": 2,
"JitterStrategy": "FULL"
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 2,
"IntervalSeconds": 8,
"MaxAttempts": 3,
"BackoffRate": 2,
"MaxAttempts": 3
"JitterStrategy": "FULL"
}
]
},
Expand Down Expand Up @@ -244,23 +264,26 @@
"Retry": [
{
"ErrorEquals": [
"Lambda.Unknown",
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"IntervalSeconds": 4,
"MaxAttempts": 6,
"BackoffRate": 2
"BackoffRate": 2,
"JitterStrategy": "FULL"
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 2,
"IntervalSeconds": 8,
"MaxAttempts": 3,
"BackoffRate": 2,
"MaxAttempts": 3
"JitterStrategy": "FULL"
}
]
},
Expand Down Expand Up @@ -297,23 +320,26 @@
"Retry": [
{
"ErrorEquals": [
"Lambda.Unknown",
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"IntervalSeconds": 4,
"MaxAttempts": 6,
"BackoffRate": 2
"BackoffRate": 2,
"JitterStrategy": "FULL"
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 2,
"IntervalSeconds": 8,
"MaxAttempts": 3,
"BackoffRate": 2,
"MaxAttempts": 3
"JitterStrategy": "FULL"
}
]
},
Expand All @@ -339,23 +365,26 @@
"Retry": [
{
"ErrorEquals": [
"Lambda.Unknown",
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"IntervalSeconds": 4,
"MaxAttempts": 6,
"BackoffRate": 2
"BackoffRate": 2,
"JitterStrategy": "FULL"
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 2,
"IntervalSeconds": 8,
"MaxAttempts": 3,
"BackoffRate": 2,
"MaxAttempts": 3
"JitterStrategy": "FULL"
}
]
},
Expand All @@ -382,23 +411,26 @@
"Retry": [
{
"ErrorEquals": [
"Lambda.Unknown",
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"IntervalSeconds": 4,
"MaxAttempts": 6,
"BackoffRate": 2
"BackoffRate": 2,
"JitterStrategy": "FULL"
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 2,
"IntervalSeconds": 8,
"MaxAttempts": 3,
"BackoffRate": 2,
"MaxAttempts": 3
"JitterStrategy": "FULL"
}
],
"End": true
Expand All @@ -407,6 +439,18 @@
}
],
"Catch": [
{
"ErrorEquals": [
"Lambda.Unknown",
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"Next": "RecordFailure",
"ResultPath": "$.exception"
},
{
"ErrorEquals": [
"States.ALL"
Expand All @@ -425,13 +469,26 @@
"Retry": [
{
"ErrorEquals": [
"Lambda.Unknown",
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"IntervalSeconds": 4,
"MaxAttempts": 6,
"BackoffRate": 2
"BackoffRate": 2,
"JitterStrategy": "FULL"
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 8,
"MaxAttempts": 3,
"BackoffRate": 2,
"JitterStrategy": "FULL"
}
]
},
Expand Down

0 comments on commit b747a6a

Please sign in to comment.