add aro-hpc scalability test
Signed-off-by: Wei Liu <[email protected]>
skeeey committed Sep 5, 2024
1 parent 681b895 commit 94e09a4
Showing 42 changed files with 2,937 additions and 3 deletions.
6 changes: 3 additions & 3 deletions go.mod
@@ -28,6 +28,7 @@ require (
github.com/onsi/gomega v1.32.0
github.com/openshift-online/ocm-common v0.0.0-20240620110211-2ecfa6ec5707
github.com/openshift-online/ocm-sdk-go v0.1.421
+github.com/openshift/library-go v0.0.0-20240621150525-4bb4238aef81
github.com/prometheus/client_golang v1.18.0
github.com/segmentio/ksuid v1.0.2
github.com/spf13/cobra v1.8.0
@@ -41,12 +42,14 @@ require (
gorm.io/gorm v1.24.7-0.20230306060331-85eaf9eeda11
k8s.io/api v0.30.2
k8s.io/apimachinery v0.30.2
+k8s.io/apiserver v0.30.1
k8s.io/client-go v0.30.2
k8s.io/component-base v0.30.2
k8s.io/klog/v2 v2.120.1
open-cluster-management.io/api v0.14.1-0.20240627145512-bd6f2229b53c
open-cluster-management.io/ocm v0.13.1-0.20240618054845-e2a7b9e78b33
open-cluster-management.io/sdk-go v0.14.1-0.20240829071054-7bd852f2b2a8
+sigs.k8s.io/yaml v1.4.0
)

require (
@@ -109,7 +112,6 @@ require (
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/openshift/api v0.0.0-20240527133614-ba11c1587003 // indirect
github.com/openshift/client-go v0.0.0-20240528061634-b054aa794d87 // indirect
-github.com/openshift/library-go v0.0.0-20240621150525-4bb4238aef81 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pkg/profile v1.3.0 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
@@ -155,7 +157,6 @@ require (
gopkg.in/yaml.v3 v3.0.1 // indirect
gorm.io/driver/mysql v1.4.7 // indirect
k8s.io/apiextensions-apiserver v0.30.1 // indirect
-k8s.io/apiserver v0.30.1 // indirect
k8s.io/kms v0.30.1 // indirect
k8s.io/kube-aggregator v0.30.1 // indirect
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
@@ -165,5 +166,4 @@ require (
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/kube-storage-version-migrator v0.0.6-0.20230721195810-5c8923c5ff96 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
-sigs.k8s.io/yaml v1.4.0 // indirect
)
Empty file modified go.sum
100755 → 100644
Empty file.
167 changes: 167 additions & 0 deletions test/performance/README.md
@@ -0,0 +1,167 @@
# Performance Test

## ARO HCP

### Workloads

There are 10 consumers in the Maestro server, and each consumer has 600 resource bundles:

- 300 managed cluster resource bundles; each contains one [ManagedCluster](./pkg/hub/workloads/manifests/aro-hpc/managedcluster.yaml) CR
- 300 manifestwork resource bundles; each contains two ManifestWork CRs: [namespace](./pkg/hub/workloads/manifests/aro-hpc/manifestwork.namespace.yaml) and [hypershift](./pkg/hub/workloads/manifests/aro-hpc/manifestwork.hypershift.yaml)

After the resources are applied on the consumer agent side, a status simulator adds mock status to them, so each resource ends up with both spec and status. A full sample of the managed cluster and manifestwork resources can be found [here](https://drive.google.com/file/d/1OXJX_RFsMqvHgVmroR1XiOAU6LrNYF0y/view?usp=sharing).

#### Workload Size

```
total_records=10x300x2=6000
one_managed_cluster_resource_bundles_record_size=3K (3127, spec=742, status=2067)
one_manifestworks_resource_bundles_record_size=49K (49899, spec=30771, status=18802)
total_size_records_per_consumer=300x3+300x49=15M
total_size_records=15x10=150M
```
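The workload-size figures above follow directly from the per-record sizes; a quick sanity check with plain shell arithmetic (all sizes in KB, using the measured record sizes from this run):

```sh
#!/usr/bin/env bash
# Sanity-check the workload-size figures using the measured record sizes.
consumers=10
bundles_per_kind=300   # per consumer, per bundle kind
managed_cluster_kb=3   # one managed cluster bundle record (~3K)
manifestworks_kb=49    # one manifestworks bundle record (~49K)

total_records=$((consumers * bundles_per_kind * 2))
per_consumer_kb=$((bundles_per_kind * managed_cluster_kb + bundles_per_kind * manifestworks_kb))
total_mb=$((per_consumer_kb * consumers / 1024))

echo "total_records=${total_records}"              # 6000
echo "per_consumer≈$((per_consumer_kb / 1024))MB"  # ~15MB
echo "total≈${total_mb}MB"                         # ~152MB
```

The README's `150M` total is the same quantity rounded down (300x3K + 300x49K = 15,600 KB per consumer, about 152 MB for 10 consumers).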

### Test Steps

1. Follow the [ARO-HCP doc](https://github.com/Azure/ARO-HCP/blob/38b459d9e88898d79780e6aa0eacb841828aab07/dev-infrastructure/docs/development-setup.md#maestro-infrastructure) to deploy Maestro on ARO

2. Add 10 consumers to Maestro

```sh
counts=10 test/performance/hack/aro-hcp/prepare.consumer.sh
```

3. Prepare a KinD cluster to run consumer agents

```sh
test/performance/hack/aro-hcp/prepare.kind.sh
```

4. Start 10 consumer agents

```sh
# tail -f _output/performance/aro/logs/agents.log
counts=10 test/performance/hack/aro-hcp/start-consumer-agents.sh
```

5. Start a watcher that simulates a controller updating the resource status

```sh
# tail -f _output/performance/aro/logs/watcher.log
counts=10 test/performance/hack/aro-hcp/start-spoke-watcher.sh
```

6. Create resource bundles for two consumers at a time (here, consumers 9 and 10)

```sh
index=9 test/performance/hack/aro-hcp/create-works.sh
index=10 test/performance/hack/aro-hcp/create-works.sh
```

7. Wait until the resources are updated on the spokes, then repeat step 6 for the remaining consumers

### Maestro server CPU/memory consumption

![cpu-avg](./result/aro-hpc/resource-usage/cpu-mem/svc-cpu-avg.png)

![cpu-max](./result/aro-hpc/resource-usage/cpu-mem/svc-cpu-max.png)

![mem-ws-avg](./result/aro-hpc/resource-usage/cpu-mem/svc-mem-ws-avg.png)

![mem-ws-max](./result/aro-hpc/resource-usage/cpu-mem/svc-mem-ws-max.png)

![mem-rss-avg](./result/aro-hpc/resource-usage/cpu-mem/svc-mem-avg.png)

![mem-rss-max](./result/aro-hpc/resource-usage/cpu-mem/svc-mem-max.png)

### PostgreSQL CPU/memory/storage consumption

![cpu-avg](./result/aro-hpc/resource-usage/cpu-mem/db-cpu-avg.png)

![cpu-max](./result/aro-hpc/resource-usage/cpu-mem/db-cpu-max.png)

![mem-ws-avg](./result/aro-hpc/resource-usage/cpu-mem/db-mem-ws-avg.png)

![mem-ws-max](./result/aro-hpc/resource-usage/cpu-mem/db-mem-ws-max.png)

![mem-rss-avg](./result/aro-hpc/resource-usage/cpu-mem/db-mem-avg.png)

![mem-rss-max](./result/aro-hpc/resource-usage/cpu-mem/db-mem-max.png)

```
# PostgreSQL Table Size
total | records
-------+---------
15 MB | 1200
27 MB | 2400
40 MB | 3600
52 MB | 4800
65 MB | 6000
```
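The table growth above is close to linear: every 1200 records adds 12–13 MB, i.e. roughly 11 KB per stored record — noticeably below the ~26 KB average raw record size, which suggests the payloads are compressed on disk (an inference from these numbers, not a measured fact). A quick shell check over the figures in the table:

```sh
#!/usr/bin/env bash
# Verify the resources table grows linearly, using the sizes reported above.
sizes_mb=(15 27 40 52 65)
records=(1200 2400 3600 4800 6000)

# Size delta per batch of 1200 records.
for i in 1 2 3 4; do
  echo "+${records[0]} records -> +$((sizes_mb[i] - sizes_mb[i-1]))MB"
done

# Average stored size per record at the end of the run.
echo "~$((sizes_mb[4] * 1024 / records[4]))KB per record"
```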

### Event Grid consumption

![mqtt-connections](./result/aro-hpc/resource-usage/mqtt/conns.png)

![mqtt-throughput](./result/aro-hpc/resource-usage/mqtt/throughput.png)

![mqtt-request-counts](./result/aro-hpc/resource-usage/mqtt/req-counts.png)

### Responsiveness

1. Maestro server resource creation rate: avg=54 r/s, max=93 r/s (the source client sends 6000 requests at a given QPS: avg=56, max=93)
2. Maestro server resource status update rate: avg=2 r/s, max=15 r/s (10 agents, each syncing its resource status every 10s)
3. List time consumption

```
# list all resources for one consumer (total resources 6000)
lists resources (counts=600) from consumer 1, time=3266ms
lists resources (counts=600) from consumer 2, time=3182ms
lists resources (counts=600) from consumer 3, time=3158ms
lists resources (counts=600) from consumer 4, time=3147ms
lists resources (counts=600) from consumer 5, time=3245ms
lists resources (counts=600) from consumer 6, time=3309ms
lists resources (counts=600) from consumer 7, time=3097ms
lists resources (counts=600) from consumer 8, time=4037ms
lists resources (counts=600) from consumer 9, time=3191ms
lists resources (counts=600) from consumer 10, time=3167ms
avg_time=3279ms
# only list managed cluster resource bundles for one consumer (total resources 6000)
lists resources (counts=300) from consumer 1, time=345ms
lists resources (counts=300) from consumer 2, time=275ms
lists resources (counts=300) from consumer 3, time=279ms
lists resources (counts=300) from consumer 4, time=249ms
lists resources (counts=300) from consumer 5, time=272ms
lists resources (counts=300) from consumer 6, time=335ms
lists resources (counts=300) from consumer 7, time=313ms
lists resources (counts=300) from consumer 8, time=281ms
lists resources (counts=300) from consumer 9, time=295ms
lists resources (counts=300) from consumer 10, time=298ms
avg_time=294ms
# only list hypershift resource bundles for one consumer (total resources 6000)
lists resources (counts=300) from consumer 1, time=2915ms
lists resources (counts=300) from consumer 2, time=2707ms
lists resources (counts=300) from consumer 3, time=3455ms
lists resources (counts=300) from consumer 4, time=2842ms
lists resources (counts=300) from consumer 5, time=2741ms
lists resources (counts=300) from consumer 6, time=2741ms
lists resources (counts=300) from consumer 7, time=2820ms
lists resources (counts=300) from consumer 8, time=2794ms
lists resources (counts=300) from consumer 9, time=3070ms
lists resources (counts=300) from consumer 10, time=2851ms
avg_time=2893ms
# list all (total resources 6000)
lists resources 1200 from 2 consumers, time=6643ms
lists resources 1800 from 3 consumers, time=9452ms
lists resources 2400 from 4 consumers, time=12864ms
lists resources 3000 from 5 consumers, time=15584ms
lists resources 3600 from 6 consumers, time=17807ms
lists resources 4200 from 7 consumers, time=20371ms
lists resources 4800 from 8 consumers, time=22623ms
lists resources 5400 from 9 consumers, time=25392ms
lists resources 6000 from 10 consumers, time=28377ms
```
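Timings like the ones above can be collected with a small wrapper that measures any command's wall-clock time in milliseconds. This is a sketch: it assumes GNU `date` (for millisecond resolution via `%N`), and the commented-out curl line is a placeholder for your actual list call, not a confirmed Maestro endpoint:

```sh
#!/usr/bin/env bash
# Measure the wall-clock duration of a command in milliseconds (GNU date).
time_ms() {
  local start end
  start=$(date +%s%3N)
  "$@" > /dev/null 2>&1
  end=$(date +%s%3N)
  echo $((end - start))
}

# Replace the command below with the actual list call, e.g.:
#   time_ms curl -s "$MAESTRO_URL/..."   # placeholder, fill in your endpoint
elapsed=$(time_ms sleep 0.2)
echo "time=${elapsed}ms"
```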
134 changes: 134 additions & 0 deletions test/performance/cmd/main.go
@@ -0,0 +1,134 @@
package main

import (
	"context"
	goflag "flag"
	"fmt"
	"os"

	"github.com/spf13/cobra"
	"github.com/spf13/pflag"

	"k8s.io/apiserver/pkg/server"
	utilflag "k8s.io/component-base/cli/flag"
	"k8s.io/component-base/logs"
	"k8s.io/klog/v2"

	"github.com/openshift-online/maestro/test/performance/pkg/hub"
	"github.com/openshift-online/maestro/test/performance/pkg/spoke"
	"github.com/openshift-online/maestro/test/performance/pkg/watcher"
)

func main() {
	pflag.CommandLine.SetNormalizeFunc(utilflag.WordSepNormalizeFunc)
	pflag.CommandLine.AddGoFlagSet(goflag.CommandLine)

	logs.AddFlags(pflag.CommandLine)
	logs.InitLogs()
	defer logs.FlushLogs()

	cmd := &cobra.Command{
		Use:   "maestroperf",
		Short: "Maestro Performance Test Tool",
		Run: func(cmd *cobra.Command, args []string) {
			_ = cmd.Help()
			os.Exit(1)
		},
	}

	cmd.AddCommand(
		newAROHCPPreparationCommand(),
		newAROHCPSpokeCommand(),
		newAROHCPWatchCommand(),
	)

	if err := cmd.Execute(); err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		os.Exit(1)
	}
}

func newAROHCPPreparationCommand() *cobra.Command {
	o := hub.NewAROHCPPreparerOptions()
	cmd := &cobra.Command{
		Use:   "aro-hcp-prepare",
		Short: "Prepare clusters or works in Maestro for ARO HCP",
		Long:  "Prepare clusters or works in Maestro for ARO HCP",
		Run: func(cmd *cobra.Command, args []string) {
			// handle SIGTERM and SIGINT by cancelling the context.
			ctx, cancel := context.WithCancel(context.Background())
			shutdownHandler := server.SetupSignalHandler()
			go func() {
				defer cancel()
				<-shutdownHandler
				klog.Infof("\nShutting down aro-hcp-prepare.")
			}()

			if err := o.Run(ctx); err != nil {
				klog.Errorf("failed to run aro-hcp-prepare, %v", err)
			}
		},
	}

	flags := cmd.Flags()
	o.AddFlags(flags)
	return cmd
}

func newAROHCPSpokeCommand() *cobra.Command {
	o := spoke.NewAROHCPSpokeOptions()
	cmd := &cobra.Command{
		Use:   "aro-hcp-spoke",
		Short: "Start agents for ARO HCP",
		Long:  "Start agents for ARO HCP",
		Run: func(cmd *cobra.Command, args []string) {
			// handle SIGTERM and SIGINT by cancelling the context.
			ctx, cancel := context.WithCancel(context.Background())
			shutdownHandler := server.SetupSignalHandler()
			go func() {
				defer cancel()
				<-shutdownHandler
				klog.Infof("\nShutting down aro-hcp-spoke.")
			}()

			if err := o.Run(ctx); err != nil {
				klog.Errorf("failed to run aro-hcp-spoke, %v", err)
			}

			<-ctx.Done()
		},
	}

	flags := cmd.Flags()
	o.AddFlags(flags)
	return cmd
}

func newAROHCPWatchCommand() *cobra.Command {
	o := watcher.NewAROHCPWatcherOptions()
	cmd := &cobra.Command{
		Use:   "aro-hcp-watch",
		Short: "Start watcher for ARO HCP",
		Long:  "Start watcher for ARO HCP",
		Run: func(cmd *cobra.Command, args []string) {
			// handle SIGTERM and SIGINT by cancelling the context.
			ctx, cancel := context.WithCancel(context.Background())
			shutdownHandler := server.SetupSignalHandler()
			go func() {
				defer cancel()
				<-shutdownHandler
				klog.Infof("\nShutting down aro-hcp-watch.")
			}()

			if err := o.Run(ctx); err != nil {
				klog.Errorf("failed to run aro-hcp-watch, %v", err)
			}

			<-ctx.Done()
		},
	}

	flags := cmd.Flags()
	o.AddFlags(flags)
	return cmd
}
8 changes: 8 additions & 0 deletions test/performance/hack/aro-hcp/check-result.sh
@@ -0,0 +1,8 @@
#!/usr/bin/env bash

db_pod_name=$(kubectl -n maestro get pods -l name=maestro-db -ojsonpath='{.items[0].metadata.name}')

kubectl -n maestro exec "${db_pod_name}" -- psql -d maestro -U maestro -c 'select count(*) from resources'
kubectl -n maestro exec "${db_pod_name}" -- psql -d maestro -U maestro -c "select created_at,updated_at,extract(epoch from age(updated_at,created_at)) from resources where consumer_name='maestro-cluster-9' order by created_at"
kubectl -n maestro exec "${db_pod_name}" -- psql -d maestro -U maestro -c "select created_at,updated_at,extract(epoch from age(updated_at,created_at)) from resources where consumer_name='maestro-cluster-10' order by created_at"
kubectl -n maestro exec ${db_pod_name} -- psql -d maestro -U maestro -c "select pg_size_pretty(pg_total_relation_size('resources')) as total, pg_size_pretty(pg_relation_size('resources')) as data"
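The `extract(epoch from age(updated_at, created_at))` column in the queries above reports each resource's spec-to-status propagation latency in seconds. The same calculation for two example timestamps, done in shell (assumes GNU `date -d`; the timestamps are illustrative, not from a real run):

```sh
#!/usr/bin/env bash
# Shell equivalent of extract(epoch from age(updated_at, created_at)).
created="2024-09-05 10:00:00"
updated="2024-09-05 10:00:42"
created_s=$(date -d "$created" +%s)   # GNU date
updated_s=$(date -d "$updated" +%s)
echo "latency=$((updated_s - created_s))s"   # latency=42s
```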
10 changes: 10 additions & 0 deletions test/performance/hack/aro-hcp/cleanup.sh
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

ARO_HCP_REPO_PATH="$HOME/go/src/github.com/Azure/ARO-HCP"

ls _output/performance/aro/pids | xargs kill
kind delete clusters --all

pushd $ARO_HCP_REPO_PATH/dev-infrastructure
AKSCONFIG=svc-cluster make clean
popd
29 changes: 29 additions & 0 deletions test/performance/hack/aro-hcp/create-clusters.sh
@@ -0,0 +1,29 @@
#!/usr/bin/env bash
total=${total:-10}
begin_index=${begin_index:-1}

lastIndex=$(($begin_index + $total - 1))
echo "create clusters from maestro-cluster-$begin_index to maestro-cluster-$lastIndex"

kubectl apply -f - <<EOF
apiVersion: batch/v1
kind: Job
metadata:
name: clusters-$begin_index-$lastIndex
namespace: maestro
spec:
template:
spec:
containers:
- name: aro-hcp-clusters
image: quay.io/skeeey/maestro-perf-tool:aro-hcp
imagePullPolicy: IfNotPresent
args:
- "/maestroperf"
- "aro-hcp-prepare"
- "--cluster-begin-index=$begin_index"
- "--cluster-counts=$total"
- "--only-clusters=true"
restartPolicy: Never
backoffLimit: 4
EOF
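The Job name and cluster range above are derived from the `total` and `begin_index` environment variables; the arithmetic in isolation, with example values:

```sh
#!/usr/bin/env bash
# Range arithmetic used by create-clusters.sh, with example values.
total=5
begin_index=11
lastIndex=$((begin_index + total - 1))
echo "clusters from maestro-cluster-${begin_index} to maestro-cluster-${lastIndex}"
echo "job name: clusters-${begin_index}-${lastIndex}"
```

So `total=5 begin_index=11 test/performance/hack/aro-hcp/create-clusters.sh` would create a Job named `clusters-11-15` covering clusters 11 through 15.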