diff --git a/infrastructure/application/Pulumi.production.yaml b/infrastructure/application/Pulumi.production.yaml
index 87d073744c..37b51f211c 100644
--- a/infrastructure/application/Pulumi.production.yaml
+++ b/infrastructure/application/Pulumi.production.yaml
@@ -29,7 +29,7 @@ config:
     secure: AAABADo05EPv/HWj7Rkf19nBeTcPJd4pEcRi2/uhyB3agraFODpLvNMx2bXfISf5pZ4HA41GYCE4f7OLcJN6hIV6ZMWUlEriPzvkoUAixbLlz1LIERiyk73R8E4F2bV65/9aFqi4l7caLS5c8iDJrE+JAvu2i7oS
   application:hasura-admin-secret:
     secure: AAABAHfDtVpAD8w32yINWTjgvuRQixWXYFf3/rEcyh59/pRSz+J4ZYCXNq5jqBiIXM2emB+7zOY=
-  application:hasura-cpu: "512"
+  application:hasura-cpu: "1024"
   application:hasura-memory: "2048"
   application:hasura-planx-api-key:
     secure: AAABAExsXFL7HabeK0Z1oSUJzI2NqVqEmKJ1ojYXyX4Hi8Sbt1Ht9QJc/Yn3cPBAB2r32HKa4HtqqLmfGjS+04lFB/I=
diff --git a/infrastructure/application/Pulumi.staging.yaml b/infrastructure/application/Pulumi.staging.yaml
index 06ee49d150..5cc9b010c9 100644
--- a/infrastructure/application/Pulumi.staging.yaml
+++ b/infrastructure/application/Pulumi.staging.yaml
@@ -30,7 +30,7 @@ config:
     secure: AAABACgwjEmlLmE19ofRO8e/JpD8sHDV2lcDmSXbU/Mw8ZRh5gTgll8DZ3BVjpDWfQfIecBAIf2TFgeo9CsBSLjfaRJ7eJyKDSWm7i8LlMC2JN/PN+Ig8oeI0H0oLkqJIziNKKjx+e97zDiXO9LZ1CVzrywR
   application:hasura-admin-secret:
     secure: AAABAHsoh7ZNkr6ep3xXsUZpp/JIjshBX+tJ0KOFgGnJ4wxR0oIcB6VewVDuwSyFJRVix72YahM=
-  application:hasura-cpu: "512"
+  application:hasura-cpu: "1024"
   application:hasura-memory: "2048"
   application:hasura-planx-api-key:
     secure: AAABANHLs3ItPxkteh0chwMP2bKuHO3ovuRLi4FsIrCqerzXVIaTLFDqNR+4KBTeMPz4cnF5tCTwsrJv9GruZdXU+lg=
diff --git a/infrastructure/application/services/hasura.ts b/infrastructure/application/services/hasura.ts
index c4b3f63028..7e4b5a5756 100644
--- a/infrastructure/application/services/hasura.ts
+++ b/infrastructure/application/services/hasura.ts
@@ -35,6 +35,10 @@ export const createHasuraService = async ({
     protocol: "HTTP",
     healthCheck: {
       path: "/healthz",
+      interval: 30,
+      timeout: 10,
+      healthyThreshold: 3,
+      unhealthyThreshold: 5,
     },
   });
   const hasuraListenerHttp = targetHasura.createListener("hasura-http", { protocol: "HTTP" });
@@ -51,6 +55,15 @@
   const hasuraService = new awsx.ecs.FargateService("hasura", {
     cluster,
     subnets: networking.requireOutput("publicSubnetIds"),
+    desiredCount: 1,
+    deploymentMinimumHealthyPercent: 50,
+    deploymentMaximumPercent: 200,
+    // extend service-level health check grace period to match hasura server migrations timeout
+    healthCheckGracePeriodSeconds: 600,
+    deploymentCircuitBreaker: {
+      enable: true,
+      rollback: true,
+    },
     taskDefinitionArgs: {
       logGroup: new aws.cloudwatch.LogGroup("hasura", {
         namePrefix: "hasura",
@@ -62,15 +75,38 @@
           cpu: config.requireNumber("hasura-proxy-cpu"),
           memory: config.requireNumber("hasura-proxy-memory"),
           portMappings: [hasuraListenerHttp],
+          // hasuraProxy should wait for the hasura container to spin up before starting
+          dependsOn: [{
+            containerName: "hasura",
+            condition: "HEALTHY"
+          }],
+          healthCheck: {
+            // hasuraProxy health depends on hasura health
+            command: ["CMD-SHELL", `curl --head http://localhost:${HASURA_PROXY_PORT}/healthz || exit 1`],
+            interval: 15,
+            timeout: 3,
+            retries: 3,
+          },
           environment: [
             { name: "HASURA_PROXY_PORT", value: String(HASURA_PROXY_PORT) },
             { name: "HASURA_NETWORK_LOCATION", value: "localhost" },
           ],
         },
         hasura: {
+          // hasuraProxy dependency timeout should mirror migration timeout
+          startTimeout: 600,
+          stopTimeout: 120,
           image: repo.buildAndPushImage("../../hasura.planx.uk"),
           cpu: config.requireNumber("hasura-cpu"),
           memory: config.requireNumber("hasura-memory"),
+          healthCheck: {
+            command: ["CMD-SHELL", "curl --head http://localhost:8080/healthz || exit 1"],
+            // wait 5m before running container-level health check, using same params as docker-compose
+            startPeriod: 300,
+            interval: 15,
+            timeout: 3,
+            retries: 10,
+          },
           environment: [
             { name: "HASURA_GRAPHQL_ENABLE_CONSOLE", value: "true" },
             {
@@ -98,7 +134,6 @@
               name: "HASURA_GRAPHQL_DATABASE_URL",
               value: dbRootUrl,
             },
-            { name: "HASURA_GRAPHQL_MIGRATIONS_SERVER_TIMEOUT", value: "300" },
             {
               name: "HASURA_PLANX_API_URL",
               value: `https://api.${DOMAIN}`,
@@ -107,15 +142,68 @@
               name: "HASURA_PLANX_API_KEY",
               value: config.require("hasura-planx-api-key"),
             },
+            // extend timeout for migrations during setup to 10 mins (default is 30s)
+            {
+              name: "HASURA_GRAPHQL_MIGRATIONS_SERVER_TIMEOUT",
+              value: "600",
+            },
+            // ensure migrations run sequentially
+            {
+              name: "HASURA_GRAPHQL_MIGRATIONS_CONCURRENCY",
+              value: "1",
+            },
+            // get more detailed logs during attempted migration
+            {
+              name: "HASURA_GRAPHQL_MIGRATIONS_LOG_LEVEL",
+              value: "debug",
+            },
           ],
         },
       },
     },
-    desiredCount: 1,
-    // experiment with non-zero grace period to see if it resolves scale up failure
-    healthCheckGracePeriodSeconds: 180,
   });
-
+
+  // TODO: bump awsx to 1.x to use the FargateService scaleConfig option to replace more verbose config below
+  const hasuraScalingTarget = new aws.appautoscaling.Target("hasura-scaling-target", {
+    // start conservative, can always bump max as required
+    maxCapacity: 3,
+    minCapacity: 1,
+    resourceId: pulumi.interpolate`service/${cluster.cluster.name}/${hasuraService.service.name}`,
+    scalableDimension: "ecs:service:DesiredCount",
+    serviceNamespace: "ecs",
+  });
+
+  const hasuraCpuScaling = new aws.appautoscaling.Policy("hasura-cpu-scaling", {
+    policyType: "TargetTrackingScaling",
+    resourceId: hasuraScalingTarget.resourceId,
+    scalableDimension: hasuraScalingTarget.scalableDimension,
+    serviceNamespace: hasuraScalingTarget.serviceNamespace,
+    targetTrackingScalingPolicyConfiguration: {
+      predefinedMetricSpecification: {
+        predefinedMetricType: "ECSServiceAverageCPUUtilization",
+      },
+      // scale out quickly for responsiveness, but scale in more slowly to avoid thrashing
+      targetValue: 60.0,
+      scaleInCooldown: 300,
+      scaleOutCooldown: 60,
+    },
+  });
+
+  const hasuraMemoryScaling = new aws.appautoscaling.Policy("hasura-memory-scaling", {
+    policyType: "TargetTrackingScaling",
+    resourceId: hasuraScalingTarget.resourceId,
+    scalableDimension: hasuraScalingTarget.scalableDimension,
+    serviceNamespace: hasuraScalingTarget.serviceNamespace,
+    targetTrackingScalingPolicyConfiguration: {
+      predefinedMetricSpecification: {
+        predefinedMetricType: "ECSServiceAverageMemoryUtilization",
+      },
+      targetValue: 75.0,
+      scaleInCooldown: 300,
+      scaleOutCooldown: 60,
+    },
+  });
+
   new cloudflare.Record("hasura", {
     name: tldjs.getSubdomain(DOMAIN)
       ? `hasura.${tldjs.getSubdomain(DOMAIN)}`