From d859c320e0680fcf25496f9f5ce4f2b744370b12 Mon Sep 17 00:00:00 2001 From: paduin Date: Tue, 28 Nov 2023 09:33:21 +0100 Subject: [PATCH 1/9] Override keepAlive time to be lower than NLB idle time (350s) --- templates/waggledance.json | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/templates/waggledance.json b/templates/waggledance.json index d7475b4..d8fa91c 100644 --- a/templates/waggledance.json +++ b/templates/waggledance.json @@ -70,6 +70,12 @@ "softLimit": 65536, "hardLimit": 65536 } - ] + ], + "systemControls": [ + { + "namespace": "net.ipv4.tcp_keepalive_time", + "value": "300" + } + ] } ] From 324f89635994667edb261e733b014dd1e6680aa3 Mon Sep 17 00:00:00 2001 From: paduin Date: Tue, 28 Nov 2023 11:39:18 +0100 Subject: [PATCH 2/9] reducing probes --- templates/waggledance.json | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/templates/waggledance.json b/templates/waggledance.json index d8fa91c..15f7fc6 100644 --- a/templates/waggledance.json +++ b/templates/waggledance.json @@ -75,7 +75,11 @@ { "namespace": "net.ipv4.tcp_keepalive_time", "value": "300" + }, + { + "namespace": "net.ipv4.tcp_keepalive_probes", + "value": "2" } - ] + ] } ] From a38643dc0f23de25fac62b98e9ab331c78e32319 Mon Sep 17 00:00:00 2001 From: paduin Date: Wed, 29 Nov 2023 09:35:31 +0100 Subject: [PATCH 3/9] setting keepAlive interval to all be below NLB 350s idle timeout --- templates/waggledance.json | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/templates/waggledance.json b/templates/waggledance.json index 15f7fc6..d8deb38 100644 --- a/templates/waggledance.json +++ b/templates/waggledance.json @@ -74,7 +74,11 @@ "systemControls": [ { "namespace": "net.ipv4.tcp_keepalive_time", - "value": "300" + "value": "200" + }, + { + "namespacer": "tcp_keepalive_intvl", + "value": "30" }, { "namespace": "net.ipv4.tcp_keepalive_probes", From 6741917fd310837855c182293aeb7af046845c35 Mon Sep 17 00:00:00 2001 From: paduin Date: 
Wed, 29 Nov 2023 09:52:15 +0100 Subject: [PATCH 4/9] fix --- templates/waggledance.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/waggledance.json b/templates/waggledance.json index d8deb38..f8b3f31 100644 --- a/templates/waggledance.json +++ b/templates/waggledance.json @@ -77,7 +77,7 @@ "value": "200" }, { - "namespacer": "tcp_keepalive_intvl", + "namespace": "net.ipv4.tcp_keepalive_intvl", "value": "30" }, { From 8c05033fbfbb1750d4e224f61f960c64674eaef5 Mon Sep 17 00:00:00 2001 From: paduin Date: Thu, 30 Nov 2023 11:05:31 +0100 Subject: [PATCH 5/9] Added changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cdaf086..d9ba0e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## [4.1.5] - 2023-11-30 +### Fixed +- Issue in where request can hit 10min connection timeout, TCP keepalive prevents NLB closing idle connections. + ## [4.1.4] - 2023-11-08 ### Fixed - Added tags to ECS service and tasks. From dd28a8d1a48a094882a0fdde947c8bba4c730a02 Mon Sep 17 00:00:00 2001 From: paduin Date: Thu, 30 Nov 2023 12:14:53 +0100 Subject: [PATCH 6/9] changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d9ba0e0..0bcf1e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a ## [4.1.5] - 2023-11-30 ### Fixed -- Issue in where request can hit 10min connection timeout, TCP keepalive prevents NLB closing idle connections. +- Issue where requests can hit 10min connection timeout, TCP keepalive prevents NLB closing idle connections. 
Similar to the issue explained here: https://paramount.tech/blog/2021/07/26/mitigation-of-connection-reset-in-aws.html ## [4.1.4] - 2023-11-08 ### Fixed From c2ff704a95cd7a4d4f5a8d44cbce34ac272ad176 Mon Sep 17 00:00:00 2001 From: paduin Date: Thu, 30 Nov 2023 15:11:27 +0100 Subject: [PATCH 7/9] added variables --- templates/waggledance.json | 6 +++--- variables.tf | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/templates/waggledance.json b/templates/waggledance.json index f8b3f31..f08d912 100644 --- a/templates/waggledance.json +++ b/templates/waggledance.json @@ -74,15 +74,15 @@ "systemControls": [ { "namespace": "net.ipv4.tcp_keepalive_time", - "value": "200" + "value": "${tcp_keepalive_time}" }, { "namespace": "net.ipv4.tcp_keepalive_intvl", - "value": "30" + "value": "${tcp_keepalive_intvl}" }, { "namespace": "net.ipv4.tcp_keepalive_probes", - "value": "2" + "value": "${tcp_keepalive_probes}" } ] } diff --git a/variables.tf b/variables.tf index 35be524..ee96c42 100644 --- a/variables.tf +++ b/variables.tf @@ -359,3 +359,21 @@ variable "datadog_metrics_enabled" { type = bool default = false } + +variable "tcp_keepalive_time" { + description = "sets net.ipv4.tcp_keepalive_time (seconds), currently only supported in ECS. " + type = string + default = 200 +} + +variable "tcp_keepalive_intvl" { + description = "sets net.ipv4.tcp_keepalive_intvl (seconds), currently only supported in ECS. " + type = string + default = 30 +} + +variable "tcp_keepalive_probes" { + description = "sets net.ipv4.tcp_keepalive_probes (number), currently only supported in ECS. 
" + type = string + default = 2 +} \ No newline at end of file From 4d41e4bc54c3b48ce8125855d638b1cca5616fd6 Mon Sep 17 00:00:00 2001 From: paduin Date: Thu, 30 Nov 2023 15:14:51 +0100 Subject: [PATCH 8/9] added variables --- templates.tf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/templates.tf b/templates.tf index 220835c..1c67e03 100644 --- a/templates.tf +++ b/templates.tf @@ -183,5 +183,8 @@ data "template_file" "waggledance" { hive_site_xml = var.alluxio_endpoints == [] ? "" : base64encode(data.template_file.hive_site_xml.rendered) bastion_ssh_key_arn = var.bastion_ssh_key_secret_name == "" ? "" : join("", data.aws_secretsmanager_secret.bastion_ssh_key.*.arn) docker_auth = var.docker_registry_auth_secret_name == "" ? "" : format("\"repositoryCredentials\" :{\n \"credentialsParameter\":\"%s\"\n},", join("\",\"", concat(data.aws_secretsmanager_secret.docker_registry.*.arn))) + tcp_keepalive_time = var.tcp_keepalive_time + tcp_keepalive_intvl = var.tcp_keepalive_intvl + tcp_keepalive_probes = var.tcp_keepalive_probes } } From 8216dac89334903a08df3c39cb000193cc0da38f Mon Sep 17 00:00:00 2001 From: paduin Date: Thu, 30 Nov 2023 15:37:34 +0100 Subject: [PATCH 9/9] added variables to readme --- README.md | 3 +++ variables.tf | 12 ++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index d928daa..2288fa7 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,9 @@ For more information please refer to the main [Apiary](https://github.com/Expedi | root_vol_type | Waggle Dance EC2 root volume type. | string | `gp2` | no | | root_vol_size | Waggle Dance EC2 root volume size. | string | `10` | no | |enable_query_functions_across_all_metastores | This controls the thrift call for `get_all_functions`. It is generally used to initialize a client and get built-in functions and registered UDF's from a metastore. Setting this to `false` is more performant as WD then only gets the functions from the `primary` metastore. 
However, setting this to `true` will collate results by calling `get_all_functions` from all configured metastores. This could be potentially slow if some of the metastores are slow to respond. If all the metastores configured are of the same version and no additional UDF's are installed, then WD gets the same functions back so it's not very useful to call this across metastores. For backwards compatibility, this property can be set to `true`. Further read: https://github.com/ExpediaGroup/waggle-dance#server | bool | false | no | +| tcp_keepalive_time | Sets net.ipv4.tcp_keepalive_time (seconds), currently only supported in ECS. | number | `200` | no | +| tcp_keepalive_intvl | Sets net.ipv4.tcp_keepalive_intvl (seconds), currently only supported in ECS. | number | `30` | no | +| tcp_keepalive_probes | Sets net.ipv4.tcp_keepalive_probes (number), currently only supported in ECS. | number | `2` | no | ## Usage diff --git a/variables.tf b/variables.tf index ee96c42..d6dd62f 100644 --- a/variables.tf +++ b/variables.tf @@ -361,19 +361,19 @@ variable "datadog_metrics_enabled" { } variable "tcp_keepalive_time" { - description = "sets net.ipv4.tcp_keepalive_time (seconds), currently only supported in ECS. " - type = string + description = "Sets net.ipv4.tcp_keepalive_time (seconds), currently only supported in ECS." + type = number default = 200 } variable "tcp_keepalive_intvl" { - description = "sets net.ipv4.tcp_keepalive_intvl (seconds), currently only supported in ECS. " - type = string + description = "Sets net.ipv4.tcp_keepalive_intvl (seconds), currently only supported in ECS." + type = number default = 30 } variable "tcp_keepalive_probes" { - description = "sets net.ipv4.tcp_keepalive_probes (number), currently only supported in ECS. " - type = string + description = "Sets net.ipv4.tcp_keepalive_probes (number), currently only supported in ECS." + type = number default = 2 } \ No newline at end of file