From b747a6aef73c334c8f9ab139ad3e64ad60d2eb3f Mon Sep 17 00:00:00 2001 From: Chuck Daniels Date: Thu, 18 Jan 2024 16:02:50 -0500 Subject: [PATCH] Increase ingest/publish rate to 1K granules/min In conjunction, decrease discover/queue rate to try to make it roughly the same rate as ingest/publish so that messages are never in jeopardy of reaching the retention period of 4 days (AWS allows up to 14 days, but Cumulus does not allow this to be configured for the background job queue), no matter how large the collection is. If we can manage to have the ingest/publish rate equal to the discover/queue rate, we can ingest a collection of any size without concern because no messages will ever be in the queue for more than perhaps a few minutes. However, since getting both rates to be identical is impossible, it would be better to err in favor of a slightly greater rate for ingest/publish because this would never allow messages to remain on the queue for more than a few moments. If we were to err slightly on the other side, where discover/queue is slightly faster, we would ever so slowly grow the queue. Given a large enough collection, even this slow growth would eventually lead to messages exceeding the retention period, but this would likely require a collection containing several million granules, perhaps at least 10M. Also, make error handling a bit more robust so that we do our utmost to retry and, if all else fails, we still record the error for Athena queries. There have been recent discrepancies between the number of errors we see in Athena and the number of granules with status "failed", where Athena appears to be missing failures. This may be due to the RecordFailure step not reliably capturing and writing failures to S3. 
Fixes #337 --- app/stacks/cumulus/main.tf | 6 +- .../discover-granules-workflow.asl.json | 4 +- ...gest-and-publish-granule-workflow.asl.json | 123 +++++++++++++----- 3 files changed, 95 insertions(+), 38 deletions(-) diff --git a/app/stacks/cumulus/main.tf b/app/stacks/cumulus/main.tf index 2b16e03..1b22ffa 100644 --- a/app/stacks/cumulus/main.tf +++ b/app/stacks/cumulus/main.tf @@ -161,9 +161,9 @@ resource "aws_cloudwatch_event_target" "background_job_queue_watcher" { arn = module.cumulus.sqs2sfThrottle_lambda_function_arn input = jsonencode( { - messageLimit = 300 + messageLimit = 1000 queueUrl = aws_sqs_queue.background_job_queue.id - timeLimit = 30 + timeLimit = 60 } ) } @@ -576,7 +576,7 @@ module "cumulus" { { id = "backgroundJobQueue", url = aws_sqs_queue.background_job_queue.id, - execution_limit = 500 + execution_limit = 1000 } ] } diff --git a/app/stacks/cumulus/templates/discover-granules-workflow.asl.json b/app/stacks/cumulus/templates/discover-granules-workflow.asl.json index 186696c..84cbaba 100644 --- a/app/stacks/cumulus/templates/discover-granules-workflow.asl.json +++ b/app/stacks/cumulus/templates/discover-granules-workflow.asl.json @@ -30,7 +30,7 @@ "DiscoverGranulesMap": { "Type": "Map", "End": true, - "MaxConcurrency": 7, + "MaxConcurrency": 3, "ToleratedFailurePercentage": 1, "ItemReader": { "Resource": "arn:aws:states:::s3:getObject", @@ -157,7 +157,7 @@ }, "QueueGranulesMap": { "Type": "Map", - "MaxConcurrency": 2, + "MaxConcurrency": 1, "ToleratedFailurePercentage": 0, "ItemsPath": "$", "ResultWriter": { diff --git a/app/stacks/cumulus/templates/ingest-and-publish-granule-workflow.asl.json b/app/stacks/cumulus/templates/ingest-and-publish-granule-workflow.asl.json index b42fb59..ddeb7af 100644 --- a/app/stacks/cumulus/templates/ingest-and-publish-granule-workflow.asl.json +++ b/app/stacks/cumulus/templates/ingest-and-publish-granule-workflow.asl.json @@ -40,15 +40,26 @@ "Retry": [ { "ErrorEquals": [ + "Lambda.Unknown", 
"Lambda.ClientExecutionTimeoutException", "Lambda.ServiceException", "Lambda.AWSLambdaException", "Lambda.SdkClientException", "Lambda.TooManyRequestsException" ], - "IntervalSeconds": 2, + "IntervalSeconds": 4, "MaxAttempts": 6, - "BackoffRate": 2 + "BackoffRate": 2, + "JitterStrategy": "FULL" + }, + { + "ErrorEquals": [ + "States.ALL" + ], + "IntervalSeconds": 8, + "MaxAttempts": 3, + "BackoffRate": 2, + "JitterStrategy": "FULL" } ] }, @@ -95,23 +106,26 @@ "Retry": [ { "ErrorEquals": [ + "Lambda.Unknown", "Lambda.ClientExecutionTimeoutException", "Lambda.ServiceException", "Lambda.AWSLambdaException", "Lambda.SdkClientException", "Lambda.TooManyRequestsException" ], - "IntervalSeconds": 2, + "IntervalSeconds": 4, "MaxAttempts": 6, - "BackoffRate": 2 + "BackoffRate": 2, + "JitterStrategy": "FULL" }, { "ErrorEquals": [ "States.ALL" ], - "IntervalSeconds": 2, + "IntervalSeconds": 8, + "MaxAttempts": 3, "BackoffRate": 2, - "MaxAttempts": 3 + "JitterStrategy": "FULL" } ] }, @@ -147,23 +161,26 @@ "Retry": [ { "ErrorEquals": [ + "Lambda.Unknown", "Lambda.ClientExecutionTimeoutException", "Lambda.ServiceException", "Lambda.AWSLambdaException", "Lambda.SdkClientException", "Lambda.TooManyRequestsException" ], - "IntervalSeconds": 2, + "IntervalSeconds": 4, "MaxAttempts": 6, - "BackoffRate": 2 + "BackoffRate": 2, + "JitterStrategy": "FULL" }, { "ErrorEquals": [ "States.ALL" ], - "IntervalSeconds": 2, + "IntervalSeconds": 8, + "MaxAttempts": 3, "BackoffRate": 2, - "MaxAttempts": 3 + "JitterStrategy": "FULL" } ] }, @@ -200,23 +217,26 @@ "Retry": [ { "ErrorEquals": [ + "Lambda.Unknown", "Lambda.ClientExecutionTimeoutException", "Lambda.ServiceException", "Lambda.AWSLambdaException", "Lambda.SdkClientException", "Lambda.TooManyRequestsException" ], - "IntervalSeconds": 2, + "IntervalSeconds": 4, "MaxAttempts": 6, - "BackoffRate": 2 + "BackoffRate": 2, + "JitterStrategy": "FULL" }, { "ErrorEquals": [ "States.ALL" ], - "IntervalSeconds": 2, + "IntervalSeconds": 8, + 
"MaxAttempts": 3, "BackoffRate": 2, - "MaxAttempts": 3 + "JitterStrategy": "FULL" } ] }, @@ -244,23 +264,26 @@ "Retry": [ { "ErrorEquals": [ + "Lambda.Unknown", "Lambda.ClientExecutionTimeoutException", "Lambda.ServiceException", "Lambda.AWSLambdaException", "Lambda.SdkClientException", "Lambda.TooManyRequestsException" ], - "IntervalSeconds": 2, + "IntervalSeconds": 4, "MaxAttempts": 6, - "BackoffRate": 2 + "BackoffRate": 2, + "JitterStrategy": "FULL" }, { "ErrorEquals": [ "States.ALL" ], - "IntervalSeconds": 2, + "IntervalSeconds": 8, + "MaxAttempts": 3, "BackoffRate": 2, - "MaxAttempts": 3 + "JitterStrategy": "FULL" } ] }, @@ -297,23 +320,26 @@ "Retry": [ { "ErrorEquals": [ + "Lambda.Unknown", "Lambda.ClientExecutionTimeoutException", "Lambda.ServiceException", "Lambda.AWSLambdaException", "Lambda.SdkClientException", "Lambda.TooManyRequestsException" ], - "IntervalSeconds": 2, + "IntervalSeconds": 4, "MaxAttempts": 6, - "BackoffRate": 2 + "BackoffRate": 2, + "JitterStrategy": "FULL" }, { "ErrorEquals": [ "States.ALL" ], - "IntervalSeconds": 2, + "IntervalSeconds": 8, + "MaxAttempts": 3, "BackoffRate": 2, - "MaxAttempts": 3 + "JitterStrategy": "FULL" } ] }, @@ -339,23 +365,26 @@ "Retry": [ { "ErrorEquals": [ + "Lambda.Unknown", "Lambda.ClientExecutionTimeoutException", "Lambda.ServiceException", "Lambda.AWSLambdaException", "Lambda.SdkClientException", "Lambda.TooManyRequestsException" ], - "IntervalSeconds": 2, + "IntervalSeconds": 4, "MaxAttempts": 6, - "BackoffRate": 2 + "BackoffRate": 2, + "JitterStrategy": "FULL" }, { "ErrorEquals": [ "States.ALL" ], - "IntervalSeconds": 2, + "IntervalSeconds": 8, + "MaxAttempts": 3, "BackoffRate": 2, - "MaxAttempts": 3 + "JitterStrategy": "FULL" } ] }, @@ -382,23 +411,26 @@ "Retry": [ { "ErrorEquals": [ + "Lambda.Unknown", "Lambda.ClientExecutionTimeoutException", "Lambda.ServiceException", "Lambda.AWSLambdaException", "Lambda.SdkClientException", "Lambda.TooManyRequestsException" ], - "IntervalSeconds": 2, + 
"IntervalSeconds": 4, "MaxAttempts": 6, - "BackoffRate": 2 + "BackoffRate": 2, + "JitterStrategy": "FULL" }, { "ErrorEquals": [ "States.ALL" ], - "IntervalSeconds": 2, + "IntervalSeconds": 8, + "MaxAttempts": 3, "BackoffRate": 2, - "MaxAttempts": 3 + "JitterStrategy": "FULL" } ], "End": true @@ -407,6 +439,18 @@ } ], "Catch": [ + { + "ErrorEquals": [ + "Lambda.Unknown", + "Lambda.ClientExecutionTimeoutException", + "Lambda.ServiceException", + "Lambda.AWSLambdaException", + "Lambda.SdkClientException", + "Lambda.TooManyRequestsException" + ], + "Next": "RecordFailure", + "ResultPath": "$.exception" + }, { "ErrorEquals": [ "States.ALL" @@ -425,13 +469,26 @@ "Retry": [ { "ErrorEquals": [ + "Lambda.Unknown", + "Lambda.ClientExecutionTimeoutException", "Lambda.ServiceException", "Lambda.AWSLambdaException", - "Lambda.SdkClientException" + "Lambda.SdkClientException", + "Lambda.TooManyRequestsException" ], - "IntervalSeconds": 2, + "IntervalSeconds": 4, "MaxAttempts": 6, - "BackoffRate": 2 + "BackoffRate": 2, + "JitterStrategy": "FULL" + }, + { + "ErrorEquals": [ + "States.ALL" + ], + "IntervalSeconds": 8, + "MaxAttempts": 3, + "BackoffRate": 2, + "JitterStrategy": "FULL" } ] },