docs: Manual performance test for tracing (#726)
Co-authored-by: Nina Hingerl <[email protected]>
hisarbalik and NHingerl authored Jan 18, 2024
1 parent 90a3945 commit d0d655b
Showing 7 changed files with 707 additions and 0 deletions.
89 changes: 89 additions & 0 deletions docs/contributor/performance-tests/traces/README.md
@@ -0,0 +1,89 @@
# Traces KPIs and Limit Test

This document describes a reproducible test setup to determine the limits and KPIs of the Kyma TracePipeline.

## Prerequisites

- Kyma as the target deployment environment, with 2 nodes of 4 CPUs and 16 GB memory each (n1-standard-4 on GCP)
- Telemetry Module installed
- Istio Module installed
- kubectl 1.22.x or higher
- Helm 3.x
- curl 8.4.x
- jq 1.6
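
You can verify the installed tool versions, for example, with:

```shell
kubectl version --client
helm version --short
curl --version | head -n 1
jq --version
```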

## Test Cases

### Assumptions

The tests are executed for 20 minutes per test case to obtain stabilized output and reliable KPIs. Generated traces contain at least 2 spans, and each span has 40 attributes to simulate an average trace span size.
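
For illustration, a comparable load could be produced with the OpenTelemetry `telemetrygen` CLI. This is a hypothetical invocation, not part of the test setup (the actual load generator is deployed by the test scripts), and flag names may differ between versions:

```shell
# Sketch: emit traces of 2 spans (1 parent + 1 child span) at a fixed rate
# for 20 minutes; the endpoint and attribute values are assumptions
telemetrygen traces \
  --otlp-endpoint telemetry-otlp-traces.kyma-system:4317 \
  --otlp-insecure \
  --child-spans 1 \
  --otlp-attributes 'attr.key="attr-value"' \
  --rate 1000 \
  --duration 20m
```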

The following test cases are identified:

1. Test average throughput end-to-end.
2. Test queuing and retry capabilities of TracePipeline with simulated backend outages.
3. Test average throughput with 3 TracePipelines simultaneously end-to-end.
4. Test queuing and retry capabilities of 3 TracePipelines with simulated backend outages.


## Setup

The following diagram shows the test setup used for all test cases.

![Trace performance test setup](./assets/trace_perf_test_setup.jpeg)

In all test scenarios, a preconfigured trace load generator is deployed on the test cluster. To ensure that all trace gateway instances are loaded with test data, the trace load generator feeds the test TracePipeline through a pipeline service instance.
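
For illustration, a TracePipeline shipping traces to the test backend could look like the following sketch. The resource and Service names are assumptions for this example; the actual resources are created by the test scripts:

```yaml
apiVersion: telemetry.kyma-project.io/v1alpha1
kind: TracePipeline
metadata:
  name: load-test-1
spec:
  output:
    otlp:
      # OTLP gRPC endpoint of the test backend in the test namespace
      endpoint:
        value: http://trace-receiver.trace-load-test:4317
```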

A Prometheus instance is deployed on the test cluster to collect relevant metrics from the trace gateway instances; the metrics are fetched at the end of the test as the test scenario result.

All test scenarios also have a test backend deployed to simulate end-to-end behaviour.

Each test scenario has its own test scripts responsible for preparing the test scenario and deploying it on the test cluster, running the scenario, and fetching the relevant metrics and KPIs at the end of the test run. After the test, the results are printed out.

A typical test result output looks like the following example:

```shell
Receiver accepted spans,Average,12867.386144069678
Exporter exported spans,Average,38585.09404079456
Exporter queue size,Average,0
Pod memory,telemetry-trace-collector-9fd48899-7l6f7,147464192
Pod memory,telemetry-trace-collector-9fd48899-wdx2g,160010240
Pod CPU,telemetry-trace-collector-9fd48899-72knt,1.4228919657370949
Pod CPU,telemetry-trace-collector-9fd48899-7l6f7,1.414138202062809
```

## Test Script

All test scenarios use a single test script [run-load-test.sh](assets/run-load-test.sh), which accepts two parameters: `-m` for multi-TracePipeline scenarios and `-b` for backpressure scenarios.

1. To test the average throughput end-to-end, run:

   ```shell
   ./run-load-test.sh
   ```

2. To test the queuing and retry capabilities of TracePipeline with simulated backend outages, run:

   ```shell
   ./run-load-test.sh -b true
   ```

3. To test the average throughput with 3 TracePipelines simultaneously end-to-end, run:

   ```shell
   ./run-load-test.sh -m true
   ```

4. To test the queuing and retry capabilities of 3 TracePipelines with simulated backend outages, run:

   ```shell
   ./run-load-test.sh -m true -b true
   ```

## Test Results

| Test Name | Receiver Accepted Spans / sec | Exporter Exported Spans / sec | Exporter Queue Size | Pod Memory Usage (Bytes) | Pod CPU Usage |
|:--------------------------------------:|-----------------------------------------------:|------------------------------:|--------------------:|--------------------------:|--------------:|
| OTEL Image Version 0.91.0 Throughput | 19815.05 | 19815.05 | 0 | 137007232, 139920064 | 0.979, 0.921 |
| OTEL Image Version 0.91.0 Backpressure | 9574.4 | 1280 | 509 | 1929478144, 1726021632 | 0.723, 0.702 |


104 changes: 104 additions & 0 deletions docs/contributor/performance-tests/traces/assets/run-load-test.sh
@@ -0,0 +1,104 @@
#!/usr/bin/env bash

JQ_COLORS="0;90:0;39:0;39:0;39:0;32:1;39:1;39:1;34"
PROMETHEUS_NAMESPACE="prometheus"
HELM_PROM_RELEASE="prometheus"
NAMESPACE="trace-load-test"
MAX_PIPELINE="false"
BACKPRESSURE_TEST="false"

# -m and -b each expect a value (for example, -m true); any value enables the corresponding mode
while getopts m:b: flag; do
case "$flag" in
m) MAX_PIPELINE="true" ;;
b) BACKPRESSURE_TEST="true" ;;
esac
done

function setup() {

# Deploy prometheus
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
helm upgrade --install -n ${PROMETHEUS_NAMESPACE} ${HELM_PROM_RELEASE} prometheus-community/kube-prometheus-stack -f values.yaml --set grafana.adminPassword=myPwd

if "$MAX_PIPELINE"; then
kubectl apply -f trace-max-pipeline.yaml
fi
# Deploy test setup
kubectl apply -f trace-load-test-setup.yaml

if "$BACKPRESSURE_TEST"; then
kubectl apply -f trace-backpressure-config.yaml
sleep 3
kubectl rollout restart deployment trace-receiver -n trace-load-test
fi
}

function wait_for_resources() {
    while [ -z "$TRACEPIPELINE_READY" ]; do
        TRACEPIPELINE_READY=$(kubectl get tracepipelines.telemetry.kyma-project.io load-test-1 -o jsonpath='{.status.conditions[?(@.type=="Running")].type}')
        echo "Waiting for TracePipeline 1"
        [ -z "$TRACEPIPELINE_READY" ] && sleep 10
    done

    if "$MAX_PIPELINE"; then
        # Reset the readiness flag; otherwise, the checks for pipelines 2 and 3 are skipped
        TRACEPIPELINE_READY=""
        while [ -z "$TRACEPIPELINE_READY" ]; do
            TRACEPIPELINE_READY=$(kubectl get tracepipelines.telemetry.kyma-project.io load-test-2 -o jsonpath='{.status.conditions[?(@.type=="Running")].type}')
            echo "Waiting for TracePipeline 2"
            [ -z "$TRACEPIPELINE_READY" ] && sleep 10
        done

        TRACEPIPELINE_READY=""
        while [ -z "$TRACEPIPELINE_READY" ]; do
            TRACEPIPELINE_READY=$(kubectl get tracepipelines.telemetry.kyma-project.io load-test-3 -o jsonpath='{.status.conditions[?(@.type=="Running")].type}')
            echo "Waiting for TracePipeline 3"
            [ -z "$TRACEPIPELINE_READY" ] && sleep 10
        done
    fi
kubectl -n ${PROMETHEUS_NAMESPACE} rollout status statefulset prometheus-prometheus-kube-prometheus-prometheus
kubectl -n ${NAMESPACE} rollout status deployment trace-load-generator
kubectl -n ${NAMESPACE} rollout status deployment trace-receiver

echo "Running Tests"
}

function cleanup() {
kubectl -n ${PROMETHEUS_NAMESPACE} port-forward $(kubectl -n ${PROMETHEUS_NAMESPACE} get service -l app=kube-prometheus-stack-prometheus -oname) 9090 &
sleep 3

echo "Test results collecting"

curl -fs --data-urlencode 'query=avg(sum(rate(otelcol_receiver_accepted_spans{service="telemetry-trace-collector-metrics"}[1m])))' localhost:9090/api/v1/query | jq -r '.data.result[] | [ "Receiver accepted spans", "Average", .value[1] ] | @csv' | xargs printf "\033[0;31m %s \033[0m \n"

curl -fs --data-urlencode 'query=avg(sum(rate(otelcol_exporter_sent_spans{exporter=~"otlp/load-test.*"}[1m])))' localhost:9090/api/v1/query | jq -r '.data.result[] | [ "Exporter exported spans", "Average", .value[1] ] | @csv' | xargs printf "\033[0;31m %s \033[0m \n"

curl -fs --data-urlencode 'query=avg(sum(otelcol_exporter_queue_size{service="telemetry-trace-collector-metrics"}))' localhost:9090/api/v1/query | jq -r '.data.result[] | [ "Exporter queue size", "Average", .value[1] ] | @csv' | xargs printf "\033[0;31m %s \033[0m \n"

curl -fs --data-urlencode 'query=sum(container_memory_working_set_bytes{namespace="kyma-system"} * on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-trace-collector"}) by (pod)' localhost:9090/api/v1/query | jq -r '.data.result[] | [ "Pod memory", .metric.pod, .value[1] ] | @csv' | xargs printf "\033[0;31m %s \033[0m \n"

curl -fs --data-urlencode 'query=sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="kyma-system"} * on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-trace-collector"}) by (pod)' localhost:9090/api/v1/query | jq -r '.data.result[] | [ "Pod CPU", .metric.pod, .value[1] ] | @csv' | xargs printf "\033[0;31m %s \033[0m \n"
kill %1

if "$MAX_PIPELINE"; then
kubectl delete -f trace-max-pipeline.yaml
fi
kubectl delete -f trace-load-test-setup.yaml

helm delete -n ${PROMETHEUS_NAMESPACE} ${HELM_PROM_RELEASE}
}

echo "TracePipeline Load Test"
echo "--------------------------------------------"
trap cleanup EXIT
setup
wait_for_resources
# Wait 20 minutes for the test to finish; the EXIT trap then runs cleanup
sleep 1200

@@ -0,0 +1,25 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: trace-receiver
namespace: trace-load-test
labels:
app: trace-receiver
data:
config.yaml: |
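    # Test backend: accepts OTLP traces over gRPC and HTTP and discards them
    # through the debug exporter; detailed verbosity logs every received span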
receivers:
otlp:
protocols:
grpc: {}
http: {}
exporters:
debug:
verbosity: detailed
service:
pipelines:
traces:
receivers:
- otlp
exporters:
- debug
---
