Skip to content

Commit

Permalink
[infrastructure] add explicit autoscaling logic to Hasura service (#4096)
Browse files Browse the repository at this point in the history
  • Loading branch information
freemvmt authored Dec 20, 2024
1 parent b7d7465 commit 3421ae5
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 7 deletions.
2 changes: 1 addition & 1 deletion infrastructure/application/Pulumi.production.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ config:
secure: AAABADo05EPv/HWj7Rkf19nBeTcPJd4pEcRi2/uhyB3agraFODpLvNMx2bXfISf5pZ4HA41GYCE4f7OLcJN6hIV6ZMWUlEriPzvkoUAixbLlz1LIERiyk73R8E4F2bV65/9aFqi4l7caLS5c8iDJrE+JAvu2i7oS
application:hasura-admin-secret:
secure: AAABAHfDtVpAD8w32yINWTjgvuRQixWXYFf3/rEcyh59/pRSz+J4ZYCXNq5jqBiIXM2emB+7zOY=
application:hasura-cpu: "512"
application:hasura-cpu: "1024"
application:hasura-memory: "2048"
application:hasura-planx-api-key:
secure: AAABAExsXFL7HabeK0Z1oSUJzI2NqVqEmKJ1ojYXyX4Hi8Sbt1Ht9QJc/Yn3cPBAB2r32HKa4HtqqLmfGjS+04lFB/I=
Expand Down
2 changes: 1 addition & 1 deletion infrastructure/application/Pulumi.staging.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ config:
secure: AAABACgwjEmlLmE19ofRO8e/JpD8sHDV2lcDmSXbU/Mw8ZRh5gTgll8DZ3BVjpDWfQfIecBAIf2TFgeo9CsBSLjfaRJ7eJyKDSWm7i8LlMC2JN/PN+Ig8oeI0H0oLkqJIziNKKjx+e97zDiXO9LZ1CVzrywR
application:hasura-admin-secret:
secure: AAABAHsoh7ZNkr6ep3xXsUZpp/JIjshBX+tJ0KOFgGnJ4wxR0oIcB6VewVDuwSyFJRVix72YahM=
application:hasura-cpu: "512"
application:hasura-cpu: "1024"
application:hasura-memory: "2048"
application:hasura-planx-api-key:
secure: AAABANHLs3ItPxkteh0chwMP2bKuHO3ovuRLi4FsIrCqerzXVIaTLFDqNR+4KBTeMPz4cnF5tCTwsrJv9GruZdXU+lg=
Expand Down
98 changes: 93 additions & 5 deletions infrastructure/application/services/hasura.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ export const createHasuraService = async ({
protocol: "HTTP",
healthCheck: {
path: "/healthz",
interval: 30,
timeout: 10,
healthyThreshold: 3,
unhealthyThreshold: 5,
},
});
const hasuraListenerHttp = targetHasura.createListener("hasura-http", { protocol: "HTTP" });
Expand All @@ -51,6 +55,15 @@ export const createHasuraService = async ({
const hasuraService = new awsx.ecs.FargateService("hasura", {
cluster,
subnets: networking.requireOutput("publicSubnetIds"),
desiredCount: 1,
deploymentMinimumHealthyPercent: 50,
deploymentMaximumPercent: 200,
// extend service-level health check grace period to match hasura server migrations timeout
healthCheckGracePeriodSeconds: 600,
deploymentCircuitBreaker: {
enable: true,
rollback: true,
},
taskDefinitionArgs: {
logGroup: new aws.cloudwatch.LogGroup("hasura", {
namePrefix: "hasura",
Expand All @@ -62,15 +75,38 @@ export const createHasuraService = async ({
cpu: config.requireNumber("hasura-proxy-cpu"),
memory: config.requireNumber("hasura-proxy-memory"),
portMappings: [hasuraListenerHttp],
// hasuraProxy should wait for the hasura container to spin up before starting
dependsOn: [{
containerName: "hasura",
condition: "HEALTHY"
}],
healthCheck: {
// hasuraProxy health depends on hasura health
command: ["CMD-SHELL", `curl --head http://localhost:${HASURA_PROXY_PORT}/healthz || exit 1`],
interval: 15,
timeout: 3,
retries: 3,
},
environment: [
{ name: "HASURA_PROXY_PORT", value: String(HASURA_PROXY_PORT) },
{ name: "HASURA_NETWORK_LOCATION", value: "localhost" },
],
},
hasura: {
// hasuraProxy dependency timeout should mirror migration timeout
startTimeout: 600,
stopTimeout: 120,
image: repo.buildAndPushImage("../../hasura.planx.uk"),
cpu: config.requireNumber("hasura-cpu"),
memory: config.requireNumber("hasura-memory"),
healthCheck: {
command: ["CMD-SHELL", "curl --head http://localhost:8080/healthz || exit 1"],
// wait 5m before running container-level health check, using same params as docker-compose
startPeriod: 300,
interval: 15,
timeout: 3,
retries: 10,
},
environment: [
{ name: "HASURA_GRAPHQL_ENABLE_CONSOLE", value: "true" },
{
Expand Down Expand Up @@ -98,7 +134,6 @@ export const createHasuraService = async ({
name: "HASURA_GRAPHQL_DATABASE_URL",
value: dbRootUrl,
},
{ name: "HASURA_GRAPHQL_MIGRATIONS_SERVER_TIMEOUT", value: "300" },
{
name: "HASURA_PLANX_API_URL",
value: `https://api.${DOMAIN}`,
Expand All @@ -107,15 +142,68 @@ export const createHasuraService = async ({
name: "HASURA_PLANX_API_KEY",
value: config.require("hasura-planx-api-key"),
},
// extend timeout for migrations during setup to 10 mins (default is 30s)
{
name: "HASURA_GRAPHQL_MIGRATIONS_SERVER_TIMEOUT",
value: "600",
},
// ensure migrations run sequentially
{
name: "HASURA_GRAPHQL_MIGRATIONS_CONCURRENCY",
value: "1",
},
// get more detailed logs during attempted migration
{
name: "HASURA_GRAPHQL_MIGRATIONS_LOG_LEVEL",
value: "debug",
},
],
},
},
},
desiredCount: 1,
// experiment with non-zero grace period to see if it resolves scale up failure
healthCheckGracePeriodSeconds: 180,
});


// TODO: bump awsx to 1.x to use the FargateService scaleConfig option to replace more verbose config below
// Register the Hasura ECS service's desired task count with Application Auto Scaling,
// so the target-tracking policies below can scale it between the bounds given here.
const hasuraScalingTarget = new aws.appautoscaling.Target("hasura-scaling-target", {
  serviceNamespace: "ecs",
  scalableDimension: "ecs:service:DesiredCount",
  resourceId: pulumi.interpolate`service/${cluster.cluster.name}/${hasuraService.service.name}`,
  minCapacity: 1,
  // start conservative, can always bump max as required
  maxCapacity: 3,
});

// Build a target-tracking scaling policy against the Hasura scaling target.
// Both policies share the same cooldowns: scale out quickly for responsiveness,
// but scale in more slowly to avoid thrashing.
const makeHasuraScalingPolicy = (
  name: string,
  predefinedMetricType: string,
  targetValue: number,
) =>
  new aws.appautoscaling.Policy(name, {
    policyType: "TargetTrackingScaling",
    resourceId: hasuraScalingTarget.resourceId,
    scalableDimension: hasuraScalingTarget.scalableDimension,
    serviceNamespace: hasuraScalingTarget.serviceNamespace,
    targetTrackingScalingPolicyConfiguration: {
      predefinedMetricSpecification: {
        predefinedMetricType,
      },
      targetValue,
      scaleInCooldown: 300,
      scaleOutCooldown: 60,
    },
  });

// scale on average CPU utilisation across tasks (target 60%)
const hasuraCpuScaling = makeHasuraScalingPolicy(
  "hasura-cpu-scaling",
  "ECSServiceAverageCPUUtilization",
  60.0,
);

// scale on average memory utilisation across tasks (target 75%)
const hasuraMemoryScaling = makeHasuraScalingPolicy(
  "hasura-memory-scaling",
  "ECSServiceAverageMemoryUtilization",
  75.0,
);

new cloudflare.Record("hasura", {
name: tldjs.getSubdomain(DOMAIN)
? `hasura.${tldjs.getSubdomain(DOMAIN)}`
Expand Down

0 comments on commit 3421ae5

Please sign in to comment.