Skip to content

Commit

Permalink
Add error reporting via Athena
Browse files Browse the repository at this point in the history
Fixes #315
  • Loading branch information
chuckwondo committed Jan 3, 2024
1 parent 222ebd7 commit daf7229
Show file tree
Hide file tree
Showing 14 changed files with 2,556 additions and 1,779 deletions.
205 changes: 205 additions & 0 deletions app/stacks/cumulus/athena.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
resource "aws_athena_database" "cumulus" {
name = lower(replace("${var.prefix}_failures", "-", "_"))
bucket = var.system_bucket
}

resource "aws_glue_catalog_table" "ingest_and_publish_workflow_failures" {
database_name = aws_athena_database.cumulus.name
name = "ingest_and_publish_workflow_failures"

table_type = "EXTERNAL_TABLE"

parameters = {
EXTERNAL = "TRUE"
classification = "json"
}

storage_descriptor {
location = "s3://${var.system_bucket}/failures/${module.ingest_and_publish_granule_workflow.name}/"
input_format = "org.apache.hadoop.mapred.TextInputFormat"
output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"

ser_de_info {
serialization_library = "org.openx.data.jsonserde.JsonSerDe"
}

columns {
name = "stack"
type = "string"
comment = "Name of the stack in which the failed workflow execution ran (e.g., prod, uat, dev1, dev2, etc.)"
}
columns {
name = "cumulus_version"
type = "string"
comment = "Version of Cumulus deployed during workflow execution"
}
columns {
name = "state_machine_arn"
type = "string"
comment = "ARN of the workflow (step function)"
}
columns {
name = "state_machine_name"
type = "string"
comment = "Name of the workflow (step function)"
}
columns {
name = "execution_name"
type = "string"
comment = "UUID of the workflow execution that failed"
}
columns {
name = "start_time"
type = "timestamp"
comment = "UTC time (in seconds, floating point) when the workflow execution started"
}
columns {
name = "parent_execution_arn"
type = "string"
comment = "ARN of the parent workflow execution that triggered this failed workflow execution"
}
columns {
name = "collection_name"
type = "string"
comment = "Name of the collection that was being processed"
}
columns {
name = "collection_version"
type = "string"
comment = "Version of the collection that was being processed"
}
columns {
name = "provider_bucket"
type = "string"
comment = "Name of the provider (source) bucket from which granules were being ingested"
}
columns {
name = "granule_ids"
type = "array<string>"
comment = "IDs of the granules that were being processed (controlled by the collection's meta.preferredQueueBatchSize setting [default: 1])"
}
columns {
name = "error_type"
type = "string"
comment = "Type of error that caused the workflow to fail"
}
columns {
name = "error_message"
type = "string"
comment = "Description of the error that caused the workflow to fail"
}
columns {
name = "error_trace"
type = "array<string>"
comment = "Stack trace of the error that caused the workflow to fail"
}
}
}

#-------------------------------------------------------------------------------
# Queries across all parent executions.
#-------------------------------------------------------------------------------

resource "aws_athena_named_query" "ingestion_failure_counts_by_error_type" {
name = "ingestion_failure_counts_by_error_type"
database = aws_athena_database.cumulus.name
description = "Failure counts by error type across all ingestions"
query = <<-QUERY
SELECT error_type, count(*) as count
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
GROUP BY error_type
ORDER BY count DESC;
QUERY
}

resource "aws_athena_named_query" "ingestion_failure_counts_by_error_type_by_parent" {
name = "ingestion_failure_counts_by_error_type_by_parent"
database = aws_athena_database.cumulus.name
description = "Failure counts by ingestion and error type across all ingestions"
query = <<-QUERY
SELECT parent_execution_arn, error_type, count(*) as count
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
GROUP BY parent_execution_arn, error_type
ORDER BY parent_execution_arn, count DESC;
QUERY
}

resource "aws_athena_named_query" "ingestion_failures_for_error_type" {
name = "ingestion_failures_for_error_type"
database = aws_athena_database.cumulus.name
description = "Failures for given error type across all ingestions"
query = <<-QUERY
SELECT error_type, error_message, error_trace
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
WHERE error_type = ?;
QUERY
}

#-------------------------------------------------------------------------------
# Queries for latest parent execution ARN, which is the most recently executed
# workflow execution, regardless of current status (i.e., it may still be
# running).
#-------------------------------------------------------------------------------

resource "aws_athena_named_query" "ingestion_failure_counts_by_error_type_for_latest" {
name = "ingestion_failure_counts_by_error_type_for_latest"
database = aws_athena_database.cumulus.name
description = "Failure counts by error type for most recent ingestion"
query = <<-QUERY
SELECT error_type, count(*) as count
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
WHERE parent_execution_arn = (
SELECT parent_execution_arn
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
ORDER BY start_time DESC
LIMIT 1
)
GROUP BY error_type
ORDER BY count DESC;
QUERY
}

resource "aws_athena_named_query" "ingestion_failures_for_error_type_for_latest" {
name = "ingestion_failures_for_error_type_for_latest"
database = aws_athena_database.cumulus.name
description = "Failures for given error type for most recent ingestion"
query = <<-QUERY
SELECT parent_execution_arn, error_type, error_message, error_trace
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
WHERE parent_execution_arn = (
SELECT parent_execution_arn
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
ORDER BY start_time DESC
LIMIT 1
)
AND error_type = ?;
QUERY
}

#-------------------------------------------------------------------------------
# Parameterized queries for a given parent execution ARN
#-------------------------------------------------------------------------------

resource "aws_athena_named_query" "ingestion_failure_counts_by_error_type_for_parent" {
name = "ingestion_failure_counts_by_error_type_for_parent"
database = aws_athena_database.cumulus.name
description = "Failure counts by error type for given parent execution"
query = <<-QUERY
SELECT error_type, count(*) as count
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
WHERE parent_execution_arn = ?
GROUP BY error_type
ORDER BY count DESC;
QUERY
}

resource "aws_athena_named_query" "ingestion_failures_for_error_type_for_parent" {
name = "ingestion_failures_for_error_type_for_parent"
database = aws_athena_database.cumulus.name
description = "Failures for given parent execution and error type"
query = <<-QUERY
SELECT error_type, error_message, error_trace
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
WHERE parent_execution_arn = ? AND error_type = ?;
QUERY
}
26 changes: 25 additions & 1 deletion app/stacks/cumulus/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,29 @@ resource "aws_lambda_function" "add_ummg_checksums" {
}
}

resource "aws_lambda_function" "record_workflow_failure" {
function_name = "${var.prefix}-RecordWorkflowFailure"
filename = data.archive_file.lambda.output_path
role = module.cumulus.lambda_processing_role_arn
handler = "index.recordWorkflowFailureHandler"
runtime = local.lambda_runtime
timeout = 60
memory_size = 128

source_code_hash = data.archive_file.lambda.output_base64sha256
layers = [module.cma.lambda_layer_version_arn]

tags = local.tags

dynamic "vpc_config" {
for_each = length(module.vpc.subnets.ids) == 0 ? [] : [1]
content {
subnet_ids = module.vpc.subnets.ids
security_group_ids = [aws_security_group.egress_only.id]
}
}
}

#-------------------------------------------------------------------------------
# MODULES
#-------------------------------------------------------------------------------
Expand Down Expand Up @@ -447,7 +470,8 @@ module "ingest_and_publish_granule_workflow" {
move_granules_task_arn : module.cumulus.move_granules_task.task_arn,
update_granules_cmr_metadata_file_links_task_arn : module.cumulus.update_granules_cmr_metadata_file_links_task.task_arn,
copy_to_archive_adapter_task_arn : module.cumulus.orca_copy_to_archive_adapter_task.task_arn,
post_to_cmr_task_arn : module.cumulus.post_to_cmr_task.task_arn
post_to_cmr_task_arn : module.cumulus.post_to_cmr_task.task_arn,
record_workflow_failure_task_arn : aws_lambda_function.record_workflow_failure.arn,
})
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@
"rule": {
"type": "onetime"
},
"provider": "maxar",
"provider": "cumulus",
"collection": {
"name": "WV04_MSI_L1B",
"version": "1"
},
"workflow": "DiscoverAndQueueGranules",
"meta": {
"discoverOnly": false,
"providerPathFormat": "'css/nga/WV04/1B/'yyyy/DDD",
"providerPathFormat": "'WV04_MSI_L1B___1/'yyyy/DDD",
"ingestedPathFormat": "'WV04_MSI_L1B___1/'yyyy/DDD",
"startDate": "2017-05-04T00:00:00Z",
"endDate": "2017-05-05T00:00:00Z",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -304,13 +304,30 @@
"ErrorEquals": [
"States.ALL"
],
"Next": "Failed",
"Next": "RecordFailure",
"ResultPath": "$.exception"
}
],
"OutputPath": "$[0]",
"End": true
},
"RecordFailure": {
"Type": "Task",
"Resource": "${record_workflow_failure_task_arn}",
"Next": "Failed",
"Retry": [
{
"ErrorEquals": [
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
}
]
},
"Failed": {
"Type": "Fail",
"Cause": "Workflow failed"
Expand Down
14 changes: 7 additions & 7 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
"@cumulus/common": "18.1.0",
"@cumulus/cumulus-message-adapter-js": "2.0.4",
"@smithy/util-stream": "^2.0.17",
"date-fns": "^2.29.3",
"date-fns": "^3.0.6",
"duration-fns": "^3.0.1",
"fp-ts": "^2.11.5",
"fp-ts-contrib": "^0.1.29",
Expand All @@ -76,7 +76,7 @@
"xml2js": "^0.6.0"
},
"devDependencies": {
"@ava/typescript": "^3.0.1",
"@ava/typescript": "^4.1.0",
"@aws-sdk/client-dynamodb": "^3.370.0",
"@cumulus/api-client": "18.1.0",
"@cumulus/types": "18.1.0",
Expand All @@ -86,9 +86,9 @@
"@types/lodash": "^4.14.177",
"@types/node": "^16.11.1",
"@types/uuid": "^9.0.7",
"@typescript-eslint/eslint-plugin": "^5.1.0",
"@typescript-eslint/parser": "^5.1.0",
"ava": "^4.3.3",
"@typescript-eslint/eslint-plugin": "^5.62.0",
"@typescript-eslint/parser": "^5.62.0",
"ava": "^5.3.1",
"codecov": "^3.5.0",
"cumulus-cli": "github:NASA-IMPACT/cumulus-cli#68b8eb5",
"cz-conventional-changelog": "^3.3.0",
Expand All @@ -101,12 +101,12 @@
"npm-run-all": "^4.1.5",
"nyc": "^15.1.0",
"open-cli": "^7.0.1",
"prettier": "2.8.0",
"prettier": "^2.8.0",
"source-map-support": "^0.5.19",
"standard-version": "^9.0.0",
"ts-node": "^10.3.0",
"typedoc": "^0.25.3",
"typescript": "^4.4.4"
"typescript": "^4.9.0"
},
"resolutions": {
"@cumulus/**/@aws-sdk/client-s3": "^3.370.0",
Expand Down
6 changes: 3 additions & 3 deletions scripts/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@
"terraform-doctor": "yarn build && cd .. && node scripts/build/terraform-doctor.js"
},
"devDependencies": {
"@types/uuid": "^8.3.4",
"@types/uuid": "^9.0.7",
"env-cmd": "^10.1.0",
"typescript": "^4.5.4"
"typescript": "^4.9.0"
},
"dependencies": {
"@aws-sdk/client-sfn": "^3.370.0",
"@aws-sdk/client-sts": "^3.370.0",
"cmd-ts": "^0.10.0",
"cmd-ts": "0.13.0",
"lodash": "^4.17.21"
}
}
2 changes: 1 addition & 1 deletion scripts/src/generate-test-granule-files.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ import * as fs from 'fs';
import * as path from 'path';

import * as Cmd from 'cmd-ts';
import { Directory } from 'cmd-ts/batteries/fs';
import * as Result from 'cmd-ts/dist/cjs/Result';
import { Directory } from 'cmd-ts/dist/cjs/batteries/fs';
import { Exit } from 'cmd-ts/dist/cjs/effects';
import * as fp from 'lodash/fp';

Expand Down
Loading

0 comments on commit daf7229

Please sign in to comment.