Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add error reporting via Athena #316

Merged
merged 1 commit into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
205 changes: 205 additions & 0 deletions app/stacks/cumulus/athena.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
resource "aws_athena_database" "cumulus" {
name = lower(replace("${var.prefix}_failures", "-", "_"))
bucket = var.system_bucket
}

resource "aws_glue_catalog_table" "ingest_and_publish_workflow_failures" {
database_name = aws_athena_database.cumulus.name
name = "ingest_and_publish_workflow_failures"

table_type = "EXTERNAL_TABLE"

parameters = {
EXTERNAL = "TRUE"
classification = "json"
}

storage_descriptor {
location = "s3://${var.system_bucket}/failures/${module.ingest_and_publish_granule_workflow.name}/"
input_format = "org.apache.hadoop.mapred.TextInputFormat"
output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"

ser_de_info {
serialization_library = "org.openx.data.jsonserde.JsonSerDe"
}

columns {
name = "stack"
type = "string"
comment = "Name of the stack in which the failed workflow execution ran (e.g., prod, uat, dev1, dev2, etc.)"
}
columns {
name = "cumulus_version"
type = "string"
comment = "Version of Cumulus deployed during workflow execution"
}
columns {
name = "state_machine_arn"
type = "string"
comment = "ARN of the workflow (step function)"
}
columns {
name = "state_machine_name"
type = "string"
comment = "Name of the workflow (step function)"
}
columns {
name = "execution_name"
type = "string"
comment = "UUID of the workflow execution that failed"
}
columns {
name = "start_time"
type = "timestamp"
comment = "UTC time (in seconds, floating point) when the workflow execution started"
}
columns {
name = "parent_execution_arn"
type = "string"
comment = "ARN of the parent workflow execution that triggered this failed workflow execution"
}
columns {
name = "collection_name"
type = "string"
comment = "Name of the collection that was being processed"
}
columns {
name = "collection_version"
type = "string"
comment = "Version of the collection that was being processed"
}
columns {
name = "provider_bucket"
type = "string"
comment = "Name of the provider (source) bucket from which granules were being ingested"
}
columns {
name = "granule_ids"
type = "array<string>"
comment = "IDs of the granules that were being processed (controlled by the collection's meta.preferredQueueBatchSize setting [default: 1])"
}
columns {
name = "error_type"
type = "string"
comment = "Type of error that caused the workflow to fail"
}
columns {
name = "error_message"
type = "string"
comment = "Description of the error that caused the workflow to fail"
}
columns {
name = "error_trace"
type = "array<string>"
comment = "Stack trace of the error that caused the workflow to fail"
}
}
}

#-------------------------------------------------------------------------------
# Queries across all parent executions.
#-------------------------------------------------------------------------------

resource "aws_athena_named_query" "ingestion_failure_counts_by_error_type" {
name = "ingestion_failure_counts_by_error_type"
database = aws_athena_database.cumulus.name
description = "Failure counts by error type across all ingestions"
query = <<-QUERY
SELECT error_type, count(*) as count
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
GROUP BY error_type
ORDER BY count DESC;
QUERY
}

resource "aws_athena_named_query" "ingestion_failure_counts_by_error_type_by_parent" {
name = "ingestion_failure_counts_by_error_type_by_parent"
database = aws_athena_database.cumulus.name
description = "Failure counts by ingestion and error type across all ingestions"
query = <<-QUERY
SELECT parent_execution_arn, error_type, count(*) as count
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
GROUP BY parent_execution_arn, error_type
ORDER BY parent_execution_arn, count DESC;
QUERY
}

resource "aws_athena_named_query" "ingestion_failures_for_error_type" {
name = "ingestion_failures_for_error_type"
database = aws_athena_database.cumulus.name
description = "Failures for given error type across all ingestions"
query = <<-QUERY
SELECT error_type, error_message, error_trace
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
WHERE error_type = ?;
QUERY
}

#-------------------------------------------------------------------------------
# Queries for latest parent execution ARN, which is the most recently executed
# workflow execution, regardless of current status (i.e., it may still be
# running).
#-------------------------------------------------------------------------------

resource "aws_athena_named_query" "ingestion_failure_counts_by_error_type_for_latest" {
name = "ingestion_failure_counts_by_error_type_for_latest"
database = aws_athena_database.cumulus.name
description = "Failure counts by error type for most recent ingestion"
query = <<-QUERY
SELECT error_type, count(*) as count
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
WHERE parent_execution_arn = (
SELECT parent_execution_arn
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
ORDER BY start_time DESC
LIMIT 1
)
GROUP BY error_type
ORDER BY count DESC;
QUERY
}

resource "aws_athena_named_query" "ingestion_failures_for_error_type_for_latest" {
name = "ingestion_failures_for_error_type_for_latest"
database = aws_athena_database.cumulus.name
description = "Failures for given error type for most recent ingestion"
query = <<-QUERY
SELECT parent_execution_arn, error_type, error_message, error_trace
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
WHERE parent_execution_arn = (
SELECT parent_execution_arn
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
ORDER BY start_time DESC
LIMIT 1
)
AND error_type = ?;
QUERY
}

#-------------------------------------------------------------------------------
# Parameterized queries for a given parent execution ARN
#-------------------------------------------------------------------------------

resource "aws_athena_named_query" "ingestion_failure_counts_by_error_type_for_parent" {
name = "ingestion_failure_counts_by_error_type_for_parent"
database = aws_athena_database.cumulus.name
description = "Failure counts by error type for given parent execution"
query = <<-QUERY
SELECT error_type, count(*) as count
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
WHERE parent_execution_arn = ?
GROUP BY error_type
ORDER BY count DESC;
QUERY
}

resource "aws_athena_named_query" "ingestion_failures_for_error_type_for_parent" {
name = "ingestion_failures_for_error_type_for_parent"
database = aws_athena_database.cumulus.name
description = "Failures for given parent execution and error type"
query = <<-QUERY
SELECT error_type, error_message, error_trace
FROM ${aws_glue_catalog_table.ingest_and_publish_workflow_failures.name}
WHERE parent_execution_arn = ? AND error_type = ?;
QUERY
}
26 changes: 25 additions & 1 deletion app/stacks/cumulus/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,29 @@ resource "aws_lambda_function" "add_ummg_checksums" {
}
}

resource "aws_lambda_function" "record_workflow_failure" {
function_name = "${var.prefix}-RecordWorkflowFailure"
filename = data.archive_file.lambda.output_path
role = module.cumulus.lambda_processing_role_arn
handler = "index.recordWorkflowFailureHandler"
runtime = local.lambda_runtime
timeout = 60
memory_size = 128

source_code_hash = data.archive_file.lambda.output_base64sha256
layers = [module.cma.lambda_layer_version_arn]

tags = local.tags

dynamic "vpc_config" {
for_each = length(module.vpc.subnets.ids) == 0 ? [] : [1]
content {
subnet_ids = module.vpc.subnets.ids
security_group_ids = [aws_security_group.egress_only.id]
}
}
}

#-------------------------------------------------------------------------------
# MODULES
#-------------------------------------------------------------------------------
Expand Down Expand Up @@ -447,7 +470,8 @@ module "ingest_and_publish_granule_workflow" {
move_granules_task_arn : module.cumulus.move_granules_task.task_arn,
update_granules_cmr_metadata_file_links_task_arn : module.cumulus.update_granules_cmr_metadata_file_links_task.task_arn,
copy_to_archive_adapter_task_arn : module.cumulus.orca_copy_to_archive_adapter_task.task_arn,
post_to_cmr_task_arn : module.cumulus.post_to_cmr_task.task_arn
post_to_cmr_task_arn : module.cumulus.post_to_cmr_task.task_arn,
record_workflow_failure_task_arn : aws_lambda_function.record_workflow_failure.arn,
})
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@
"rule": {
"type": "onetime"
},
"provider": "maxar",
"provider": "cumulus",
"collection": {
"name": "WV04_MSI_L1B",
"version": "1"
},
"workflow": "DiscoverAndQueueGranules",
"meta": {
"discoverOnly": false,
"providerPathFormat": "'css/nga/WV04/1B/'yyyy/DDD",
"providerPathFormat": "'WV04_MSI_L1B___1/'yyyy/DDD",
"ingestedPathFormat": "'WV04_MSI_L1B___1/'yyyy/DDD",
"startDate": "2017-05-04T00:00:00Z",
"endDate": "2017-05-05T00:00:00Z",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -304,13 +304,30 @@
"ErrorEquals": [
"States.ALL"
],
"Next": "Failed",
"Next": "RecordFailure",
"ResultPath": "$.exception"
}
],
"OutputPath": "$[0]",
"End": true
},
"RecordFailure": {
"Type": "Task",
"Resource": "${record_workflow_failure_task_arn}",
"Next": "Failed",
"Retry": [
{
"ErrorEquals": [
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
}
]
},
"Failed": {
"Type": "Fail",
"Cause": "Workflow failed"
Expand Down
14 changes: 7 additions & 7 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
"@cumulus/common": "18.1.0",
"@cumulus/cumulus-message-adapter-js": "2.0.4",
"@smithy/util-stream": "^2.0.17",
"date-fns": "^2.29.3",
"date-fns": "^3.0.6",
"duration-fns": "^3.0.1",
"fp-ts": "^2.11.5",
"fp-ts-contrib": "^0.1.29",
Expand All @@ -76,7 +76,7 @@
"xml2js": "^0.6.0"
},
"devDependencies": {
"@ava/typescript": "^3.0.1",
"@ava/typescript": "^4.1.0",
"@aws-sdk/client-dynamodb": "^3.370.0",
"@cumulus/api-client": "18.1.0",
"@cumulus/types": "18.1.0",
Expand All @@ -86,9 +86,9 @@
"@types/lodash": "^4.14.177",
"@types/node": "^16.11.1",
"@types/uuid": "^9.0.7",
"@typescript-eslint/eslint-plugin": "^5.1.0",
"@typescript-eslint/parser": "^5.1.0",
"ava": "^4.3.3",
"@typescript-eslint/eslint-plugin": "^5.62.0",
"@typescript-eslint/parser": "^5.62.0",
"ava": "^5.3.1",
"codecov": "^3.5.0",
"cumulus-cli": "github:NASA-IMPACT/cumulus-cli#68b8eb5",
"cz-conventional-changelog": "^3.3.0",
Expand All @@ -101,12 +101,12 @@
"npm-run-all": "^4.1.5",
"nyc": "^15.1.0",
"open-cli": "^7.0.1",
"prettier": "2.8.0",
"prettier": "^2.8.0",
"source-map-support": "^0.5.19",
"standard-version": "^9.0.0",
"ts-node": "^10.3.0",
"typedoc": "^0.25.3",
"typescript": "^4.4.4"
"typescript": "^4.9.0"
},
"resolutions": {
"@cumulus/**/@aws-sdk/client-s3": "^3.370.0",
Expand Down
6 changes: 3 additions & 3 deletions scripts/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@
"terraform-doctor": "yarn build && cd .. && node scripts/build/terraform-doctor.js"
},
"devDependencies": {
"@types/uuid": "^8.3.4",
"@types/uuid": "^9.0.7",
"env-cmd": "^10.1.0",
"typescript": "^4.5.4"
"typescript": "^4.9.0"
},
"dependencies": {
"@aws-sdk/client-sfn": "^3.370.0",
"@aws-sdk/client-sts": "^3.370.0",
"cmd-ts": "^0.10.0",
"cmd-ts": "0.13.0",
"lodash": "^4.17.21"
}
}
2 changes: 1 addition & 1 deletion scripts/src/generate-test-granule-files.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ import * as fs from 'fs';
import * as path from 'path';

import * as Cmd from 'cmd-ts';
import { Directory } from 'cmd-ts/batteries/fs';
import * as Result from 'cmd-ts/dist/cjs/Result';
import { Directory } from 'cmd-ts/dist/cjs/batteries/fs';
import { Exit } from 'cmd-ts/dist/cjs/effects';
import * as fp from 'lodash/fp';

Expand Down
Loading
Loading