From dac8e3a0b2d563a8b8dae92bd959c4bc11130599 Mon Sep 17 00:00:00 2001
From: Florian Bacher
Date: Tue, 19 Dec 2023 14:04:05 +0100
Subject: [PATCH] docs: add analysis blog post (#2701)

Signed-off-by: Florian Bacher
Signed-off-by: check-spelling-bot
---
 .github/actions/spelling/expect.txt           |   2 +
 docs-new/blog/.authors.yml                    |   5 +
 .../analyzing-application-performance.md      | 238 ++++++++++++++++++
 .../analysis-breakdown.json                   |  77 ++++++
 .../analysis-definition.yaml                  |  26 ++
 .../analysis-status.yaml                      |  19 ++
 .../analysis.yaml                             |  12 +
 .../error-rate.yaml                           |   9 +
 .../memory-usage.yaml                         |   9 +
 .../metric-providers.yaml                     |  20 ++
 10 files changed, 417 insertions(+)
 create mode 100644 docs-new/blog/posts/analyzing-application-performance.md
 create mode 100644 docs-new/blog/posts/analyzing-application-performance/analysis-breakdown.json
 create mode 100644 docs-new/blog/posts/analyzing-application-performance/analysis-definition.yaml
 create mode 100644 docs-new/blog/posts/analyzing-application-performance/analysis-status.yaml
 create mode 100644 docs-new/blog/posts/analyzing-application-performance/analysis.yaml
 create mode 100644 docs-new/blog/posts/analyzing-application-performance/error-rate.yaml
 create mode 100644 docs-new/blog/posts/analyzing-application-performance/memory-usage.yaml
 create mode 100644 docs-new/blog/posts/analyzing-application-performance/metric-providers.yaml

diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
index df63487d54..8cb042ba5e 100644
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@@ -49,6 +49,7 @@ autoprefixer
 autoprovision
 autoscaler
 azurecr
+Bacher
 bacherfl
 badanalysis
 badmetric
@@ -569,6 +570,7 @@ spanitem
 spdx
 spf
 squidfunk
+sre
 stakeholders
 standalone
 statefulset
diff --git a/docs-new/blog/.authors.yml b/docs-new/blog/.authors.yml
index da7a68110b..c31561b7da 100644
--- a/docs-new/blog/.authors.yml
+++ b/docs-new/blog/.authors.yml
@@ -4,3 +4,8 @@ authors:
     description: Maintainer
     avatar: https://avatars.githubusercontent.com/u/6901203?v=4
     url: https://github.com/mowies
+  bacherfl:
+    name: Florian Bacher
+    description: Maintainer
+    avatar: https://avatars.githubusercontent.com/u/2143586?v=4
+    url: https://github.com/bacherfl
diff --git a/docs-new/blog/posts/analyzing-application-performance.md b/docs-new/blog/posts/analyzing-application-performance.md
new file mode 100644
index 0000000000..2f69cf9ee3
--- /dev/null
+++ b/docs-new/blog/posts/analyzing-application-performance.md
@@ -0,0 +1,238 @@
---
date: 2023-12-19
authors: [bacherfl]
description: >
  In this blog post you will learn how to use Keptn for analyzing the performance of an application.
categories:
  - SRE
  - Application Performance
  - Analysis
---

# Analyzing Application Performance with Keptn

In the dynamic world of DevOps and continuous delivery, keeping applications reliable
and high-performing is a top priority.

Site reliability engineers (SREs) rely on Service Level Objectives (SLOs) to set the standards that the
Service Level Indicators (SLIs) of an application must meet, such as response time, error rate,
or any other metric that might be relevant to the application.

The use of SLOs is not a new concept, but integrating them into an application comes with its own set of issues:

- Figuring out which SLIs and SLOs to use: do you get the SLI values from one monitoring source or from multiple sources?
This complexity makes it harder to use them effectively.
- Defining SLO priorities.
Imagine a new version of a service that fixes a concurrency problem but
slows down response time.
This may be a valid trade-off, and the new version should not be rejected
due to the increased response time, given that the error rate will decrease.
Situations like these call for a grading logic
that allows different priorities to be assigned to SLOs.
- Defining and storing SLOs.
It's crucial to clearly define and store these goals in one central place,
ideally as a declarative resource in a GitOps repository, where each change can be easily traced back.

In this article, we'll explore how Keptn tackles these challenges with its new Analysis feature.
We will deploy a demo application onto a Kubernetes cluster to show how Keptn helps SREs gather
and make sense of SLOs, making the whole process more straightforward and efficient.

The example application serves some metrics itself via its Prometheus endpoint,
while other data will come from Dynatrace.

## Defining data providers

Everything in Keptn is configured via Kubernetes Custom Resources.
We tell Keptn about our monitoring data sources by adding two `KeptnMetricsProvider`
resources to our Kubernetes cluster - one for our Prometheus instance, the other for our Dynatrace tenant.

```yaml
{% include "./analyzing-application-performance/metric-providers.yaml" %}
```
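As you can see in the `secretKeyRef` of the Dynatrace provider above, the API token
used for authentication is read from a Kubernetes secret.
A minimal sketch of such a secret could look like this
(the token value is, of course, a placeholder):

```yaml
apiVersion: v1
kind: Secret
metadata:
  # name and key match the secretKeyRef of the Dynatrace provider above
  name: dt-api-token
  namespace: simple-go
type: Opaque
stringData:
  # placeholder - use a Dynatrace API token that is allowed to read metrics
  DT_TOKEN: dt0c01.sample.token
```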
## Defining SLIs

Now that we have defined our data sources, let's tell Keptn which SLIs we want to monitor
and how to retrieve them from Prometheus and Dynatrace.
This is done by applying `AnalysisValueTemplate` resources to the cluster.
If you have worked with Keptn in the past, you will notice that the structure of these
resources is similar to the `KeptnMetrics` resources
(see [this article](https://www.linkedin.com/pulse/scaling-kubernetes-workloads-based-dynatrace-metrics-keptnproject)
if you would like to learn more about `KeptnMetrics` and how to use them to automatically scale your workloads).

The differences between `KeptnMetrics` and `AnalysisValueTemplates` are:

- `KeptnMetrics` are monitored and updated continuously, meaning that they always represent the
latest known value of the given metric.
This makes them a good candidate for being observed by a `HorizontalPodAutoscaler` to make scaling decisions.
- `AnalysisValueTemplates` provide the means to get the value of a metric during a concrete time window.
This makes them well-suited for tasks such as analyzing the results of a load test
that has been executed after the deployment of a new version.

In our case, we will create two `AnalysisValueTemplate` resources.
The first one measures the error rate of our workload, using data from Prometheus:

```yaml
{% include "./analyzing-application-performance/error-rate.yaml" %}
```

As a second metric, we measure the memory usage of our application using the following `AnalysisValueTemplate`:

```yaml
{% include "./analyzing-application-performance/memory-usage.yaml" %}
```

As can be seen in the `spec.query` field of the error-rate resource above,
`AnalysisValueTemplate` resources support the [Go templating syntax](https://pkg.go.dev/text/template).
With that, you can include placeholders in the query that are substituted at the time the
concrete values for the metrics are retrieved.
This comes in handy when the query is essentially the same for different workloads
and differs only slightly, for example in the label selectors being used.
This way, you do not need to create one `AnalysisValueTemplate` resource per workload,
but can reuse a single one for different workloads and pass in the value for the
actual workload at the time you perform an Analysis.
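For instance, the memory-usage template above hard-codes the Dynatrace entity ID of our workload.
As a sketch, a parameterized variant could look like this
(the template name and the `entityId` argument are hypothetical):

```yaml
apiVersion: metrics.keptn.sh/v1alpha3
kind: AnalysisValueTemplate
metadata:
  # hypothetical, parameterized variant of the memory-usage template
  name: memory-usage-by-workload
  namespace: simple-go
spec:
  provider:
    name: my-dynatrace-provider
  # {{.entityId}} is substituted with an argument passed in when the Analysis
  # is performed, instead of hard-coding the Dynatrace entity ID
  query: 'metricSelector=builtin:kubernetes.workload.memory_working_set:filter(eq("dt.entity.cloud_application",{{.entityId}})):splitBy("dt.entity.cloud_application"):sum'
```

An Analysis could then pass `entityId` alongside `workload` in its arguments,
as we will see in the following sections.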
## Defining SLOs

The next step is to define our expectations towards our SLOs, i.e. the
goals we would like them to meet.
This is done via an `AnalysisDefinition` resource like the following:

```yaml
{% include "./analyzing-application-performance/analysis-definition.yaml" %}
```

This `AnalysisDefinition` resource has two objectives, both of which refer
to the `AnalysisValueTemplate` resources we created previously.
If you inspect them closely, you will notice that they differ in the weights they have been assigned,
meaning that the goal for the error rate has a higher priority than memory consumption.
In combination with the target scores defined in the `totalScore` object,
this means that passing the error-rate objective is mandatory for an analysis to be successful,
or at least to achieve the warning state:
with weights of `3` and `1`, the maximum score is `4`, so failing the error-rate objective
caps the score at 1 out of 4 points (25%), well below the `warningPercentage` of `75`.
The warning state would be reached if, for example, the error-rate objective is passed,
but the memory consumption exceeds the defined limit of `30M`,
resulting in 3 out of 4 points (75%).
Also, note that even though the `AnalysisDefinition` uses values coming from different data sources,
i.e. Prometheus and Dynatrace, we do not need to consider any
implementation-specific details when referring to them.
We only need to provide the name of the `AnalysisValueTemplate`,
and the metrics-operator determines where to retrieve the data from,
based on the information in the `KeptnMetricsProvider` resources.

## Executing an Analysis

Now it is time to trigger an Analysis.
This is done by applying an `Analysis` resource which looks as follows:

```yaml
{% include "./analyzing-application-performance/analysis.yaml" %}
```

Applying this resource causes Keptn to retrieve the values of the `AnalysisValueTemplate`
resources referenced in the `AnalysisDefinition` that is used for this `Analysis` instance.
After all required values have been retrieved, the objectives of the `AnalysisDefinition` are evaluated,
and the overall result is computed.

This analysis uses the values of the last ten minutes (due to `spec.timeframe.recent` being set to `10m`),
but you can also specify a concrete timeframe,
using the `spec.timeframe.from` and `spec.timeframe.to` properties.
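For instance, a sketch of a variant of our `Analysis` that is pinned to an explicit
time window instead of a relative one (the resource name and the timestamps are illustrative):

```yaml
apiVersion: metrics.keptn.sh/v1alpha3
kind: Analysis
metadata:
  # hypothetical variant of our Analysis with a fixed time window
  name: service-analysis-fixed-window
  namespace: simple-go
spec:
  timeframe:
    # an explicit window instead of `recent`
    from: "2023-11-15T08:15:15Z"
    to: "2023-11-15T08:25:15Z"
  args:
    workload: simple-go-service
  analysisDefinition:
    name: my-analysis-definition
```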
We also provide the argument `workload` to the analysis, using the `spec.args` property.
Arguments passed to the analysis via this property are used when computing the actual query,
using the templating string of the `AnalysisValueTemplate` resource.
In our case, we use this in the error-rate `AnalysisValueTemplate`,
where we set the query to `rate(http_requests_total{status_code='500', job='{{.workload}}'}[1m]) or on() vector(0)`.

For our `Analysis` with `spec.args.workload` set to `simple-go-service`, the resulting query is:

```shell
rate(http_requests_total{status_code='500', job='simple-go-service'}[1m]) or on() vector(0)
```

## Inspecting the results

After applying an `Analysis` resource, we can do a quick check of its state using `kubectl`:

```shell
$ kubectl get analysis -n simple-go

NAME               ANALYSISDEFINITION       STATE       WARNING   PASS
service-analysis   my-analysis-definition   Completed             true
```

The output of that command tells us whether the Analysis has been completed.
As seen above, this is the case, and we can already see that it has passed.
So now it's time to dive deeper into the results and see what information we get in the status of the resource:

```shell
kubectl get analysis service-analysis -n simple-go -o yaml
```

This command gives us the complete YAML representation of the `Analysis`:

```yaml
{% include "./analyzing-application-performance/analysis-status.yaml" %}
```

As you can see, this already gives us a lot more information,
with the meatiest piece being the `status.raw` field.
This is a JSON representation of the retrieved values and the goals we have set for them.
However, this raw information is not easily digestible for human eyes, so let's format it using:

```shell
kubectl get analyses service-analysis -n simple-go -o=jsonpath='{.status.raw}' | jq .
```

This gives us the following result:

```json
{% include "./analyzing-application-performance/analysis-breakdown.json" %}
```

In the JSON object, we see:

- A list of the objectives that we defined earlier in our `AnalysisDefinition`
- The values of the related metrics
- The actual query that was used for retrieving the data from our monitoring data sources

Based on that, each objective is assigned a score:
if the objective has been met, the score is equal to the weight of that objective;
if not, the objective gets a score of `0`.

Note: You can specify `warning` criteria in addition to `failure` criteria.
If the value of a metric violates the `warning` criteria,
but not the `failure` criteria, the objective gets a score of half its weight.
This allows you to be even more granular with the grading of your analysis.
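As a sketch, this is how the memory-usage entry in `spec.objectives` of our
`AnalysisDefinition` could look with an additional warning criterion
(the thresholds here are illustrative, assuming we only want to fail above `50M` but warn above `30M`):

```yaml
- analysisValueTemplateRef:
    name: memory-usage
  keyObjective: false
  target:
    # fail the objective only above 50M ...
    failure:
      greaterThan:
        fixedValue: 50M
    # ... but already raise a warning above 30M
    warning:
      greaterThan:
        fixedValue: 30M
  weight: 1
```

With this in place, a memory reading between `30M` and `50M` would earn
half the weight (0.5 points) instead of failing the objective outright.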
In our case, both objectives have been met, so we get the full score
and therefore pass the evaluation with flying colors.

## Summary

To summarize, we have seen how we can define multiple monitoring data sources
and let Keptn fetch the data we are interested in, giving us a unified way of accessing this data.
In the next step, we created a clear set of criteria that this data must meet for us to
consider the related application healthy.
Finally, we have seen how we can easily perform an analysis and interpret its result.
We did all this using Kubernetes manifests, placing them in a
GitOps repository next to our application's manifests.

If you would like to try out Keptn and its analysis capabilities yourself,
feel free to head over to the [Keptn docs](https://lifecycle.keptn.sh/docs/)
and follow the guides to [install Keptn](https://lifecycle.keptn.sh/docs/install/),
if you haven't done so already,
and try out the [Analysis example](https://github.com/keptn/lifecycle-toolkit/tree/main/examples/support/analysis).
diff --git a/docs-new/blog/posts/analyzing-application-performance/analysis-breakdown.json b/docs-new/blog/posts/analyzing-application-performance/analysis-breakdown.json
new file mode 100644
index 0000000000..d6699696f4
--- /dev/null
+++ b/docs-new/blog/posts/analyzing-application-performance/analysis-breakdown.json
@@ -0,0 +1,77 @@
{
  "objectiveResults": [
    {
      "result": {
        "failResult": {
          "operator": {
            "greaterThan": {
              "fixedValue": "30M"
            }
          },
          "fulfilled": false
        },
        "warnResult": {
          "operator": {},
          "fulfilled": false
        },
        "warning": false,
        "pass": true
      },
      "objective": {
        "analysisValueTemplateRef": {
          "name": "memory-usage"
        },
        "target": {
          "failure": {
            "greaterThan": {
              "fixedValue": "30M"
            }
          }
        },
        "weight": 1
      },
      "value": 25978197.333333,
      "query": "builtin:kubernetes.workload.memory_working_set:filter(eq(\"dt.entity.cloud_application\",CLOUD_APPLICATION-3B2BD00402B933C2)):splitBy(\"dt.entity.cloud_application\"):sum",
      "score": 1
    },
    {
      "result": {
        "failResult": {
          "operator": {
            "greaterThan": {
              "fixedValue": "0"
            }
          },
          "fulfilled": false
        },
        "warnResult": {
          "operator": {},
          "fulfilled": false
        },
        "warning": false,
        "pass": true
      },
      "objective": {
        "analysisValueTemplateRef": {
          "name": "error-rate"
        },
        "target": {
          "failure": {
            "greaterThan": {
              "fixedValue": "0"
            }
          }
        },
        "weight": 3,
        "keyObjective": true
      },
      "value": 0,
      "query": "rate(http_requests_total{status_code='500', job='simple-go-service'}[1m]) or on() vector(0)",
      "score": 3
    }
  ],
  "totalScore": 4,
  "maximumScore": 4,
  "pass": true,
  "warning": false
}
diff --git a/docs-new/blog/posts/analyzing-application-performance/analysis-definition.yaml b/docs-new/blog/posts/analyzing-application-performance/analysis-definition.yaml
new file mode 100644
index 0000000000..c981d86d5a
--- /dev/null
+++ b/docs-new/blog/posts/analyzing-application-performance/analysis-definition.yaml
@@ -0,0 +1,26 @@
apiVersion: metrics.keptn.sh/v1alpha3
kind: AnalysisDefinition
metadata:
  name: my-analysis-definition
  namespace: simple-go
spec:
  objectives:
    - analysisValueTemplateRef:
        name: memory-usage
      keyObjective: false
      target:
        failure:
          greaterThan:
            fixedValue: 30M
      weight: 1
    - analysisValueTemplateRef:
        name: error-rate
      keyObjective: true
      target:
        failure:
          greaterThan:
            fixedValue: 0
      weight: 3
  totalScore:
    passPercentage: 100
    warningPercentage: 75
diff --git a/docs-new/blog/posts/analyzing-application-performance/analysis-status.yaml b/docs-new/blog/posts/analyzing-application-performance/analysis-status.yaml
new file mode 100644
index 0000000000..247217e38d
--- /dev/null
+++ b/docs-new/blog/posts/analyzing-application-performance/analysis-status.yaml
@@ -0,0 +1,19 @@
apiVersion: metrics.keptn.sh/v1alpha3
kind: Analysis
metadata:
  name: service-analysis
  namespace: simple-go
spec:
  analysisDefinition:
    name: my-analysis-definition
  args:
    workload: simple-go-service
  timeframe:
    recent: 10m
status:
  pass: true
  raw: '…'
  state: Completed
  timeframe:
    from: "2023-11-15T08:15:15Z"
    to: "2023-11-15T08:25:15Z"
diff --git a/docs-new/blog/posts/analyzing-application-performance/analysis.yaml b/docs-new/blog/posts/analyzing-application-performance/analysis.yaml
new file mode 100644
index 0000000000..1afbd3da08
--- /dev/null
+++ b/docs-new/blog/posts/analyzing-application-performance/analysis.yaml
@@ -0,0 +1,12 @@
apiVersion: metrics.keptn.sh/v1alpha3
kind: Analysis
metadata:
  name: service-analysis
  namespace: simple-go
spec:
  timeframe:
    recent: 10m
  args:
    "workload": "simple-go-service"
  analysisDefinition:
    name: my-analysis-definition
diff --git a/docs-new/blog/posts/analyzing-application-performance/error-rate.yaml b/docs-new/blog/posts/analyzing-application-performance/error-rate.yaml
new file mode 100644
index 0000000000..53aa15d8fa
--- /dev/null
+++ b/docs-new/blog/posts/analyzing-application-performance/error-rate.yaml
@@ -0,0 +1,9 @@
apiVersion: metrics.keptn.sh/v1alpha3
kind: AnalysisValueTemplate
metadata:
  name: error-rate
  namespace: simple-go
spec:
  provider:
    name: my-prometheus-provider
  query: "rate(http_requests_total{status_code='500', job='{{.workload}}'}[1m]) or on() vector(0)"
diff --git a/docs-new/blog/posts/analyzing-application-performance/memory-usage.yaml b/docs-new/blog/posts/analyzing-application-performance/memory-usage.yaml
new file mode 100644
index 0000000000..ba1d3b0c8d
--- /dev/null
+++ b/docs-new/blog/posts/analyzing-application-performance/memory-usage.yaml
@@ -0,0 +1,9 @@
apiVersion: metrics.keptn.sh/v1alpha3
kind: AnalysisValueTemplate
metadata:
  name: memory-usage
  namespace: simple-go
spec:
  provider:
    name: my-dynatrace-provider
  query: 'metricSelector=builtin:kubernetes.workload.memory_working_set:filter(eq("dt.entity.cloud_application",CLOUD_APPLICATION-3B2BD00402B933C2)):splitBy("dt.entity.cloud_application"):sum' # yamllint disable-line rule:line-length
diff --git a/docs-new/blog/posts/analyzing-application-performance/metric-providers.yaml b/docs-new/blog/posts/analyzing-application-performance/metric-providers.yaml
new file mode 100644
index 0000000000..fadc64d4d9
--- /dev/null
+++ b/docs-new/blog/posts/analyzing-application-performance/metric-providers.yaml
@@ -0,0 +1,20 @@
apiVersion: metrics.keptn.sh/v1alpha3
kind: KeptnMetricsProvider
metadata:
  name: my-prometheus-provider
  namespace: simple-go
spec:
  targetServer:
  type: prometheus
---
apiVersion: metrics.keptn.sh/v1alpha3
kind: KeptnMetricsProvider
metadata:
  name: my-dynatrace-provider
  namespace: simple-go
spec:
  targetServer: "https://.live.dynatrace.com"
  type: dynatrace
  secretKeyRef:
    name: dt-api-token
    key: DT_TOKEN