From cfb14228e3ea7c653c96eefa1105e33a95b9fc55 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Wed, 4 Dec 2024 15:38:48 +0100 Subject: [PATCH 01/66] chore: Add image.repository to otel values charts --- docs/contributor/pocs/assets/otel-log-agent-values.yaml | 1 + docs/contributor/pocs/assets/otel-log-gateway-values.yaml | 1 + docs/contributor/pocs/assets/otel-logs-values.yaml | 3 +++ 3 files changed, 5 insertions(+) diff --git a/docs/contributor/pocs/assets/otel-log-agent-values.yaml b/docs/contributor/pocs/assets/otel-log-agent-values.yaml index 6c3fefe8e..8c3af4264 100644 --- a/docs/contributor/pocs/assets/otel-log-agent-values.yaml +++ b/docs/contributor/pocs/assets/otel-log-agent-values.yaml @@ -148,3 +148,4 @@ serviceMonitor: image: pullPolicy: Always + repository: "otel/opentelemetry-collector-k8s" diff --git a/docs/contributor/pocs/assets/otel-log-gateway-values.yaml b/docs/contributor/pocs/assets/otel-log-gateway-values.yaml index 9a5f3f905..c276e2654 100644 --- a/docs/contributor/pocs/assets/otel-log-gateway-values.yaml +++ b/docs/contributor/pocs/assets/otel-log-gateway-values.yaml @@ -73,6 +73,7 @@ securityContext: image: pullPolicy: Always + repository: "otel/opentelemetry-collector-k8s" rollout: rollingUpdate: {} diff --git a/docs/contributor/pocs/assets/otel-logs-values.yaml b/docs/contributor/pocs/assets/otel-logs-values.yaml index 470608bb9..0024b60be 100644 --- a/docs/contributor/pocs/assets/otel-logs-values.yaml +++ b/docs/contributor/pocs/assets/otel-logs-values.yaml @@ -129,3 +129,6 @@ config: extraEnvsFrom: - secretRef: name: sap-cloud-logging + +image: + repository: "otel/opentelemetry-collector-k8s" \ No newline at end of file From 33c369671046fbefdb6577e9f7103158a147af6c Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Fri, 6 Dec 2024 16:09:38 +0100 Subject: [PATCH 02/66] chore: functional log agent otel values file --- docs/contributor/pocs/assets/otel-log-agent-values.yaml | 6 +++--- 1 file changed, 3 insertions(+), 
3 deletions(-) diff --git a/docs/contributor/pocs/assets/otel-log-agent-values.yaml b/docs/contributor/pocs/assets/otel-log-agent-values.yaml index 8c3af4264..300091bb8 100644 --- a/docs/contributor/pocs/assets/otel-log-agent-values.yaml +++ b/docs/contributor/pocs/assets/otel-log-agent-values.yaml @@ -116,7 +116,7 @@ config: exporters: otlp: - endpoint: log-gateway-opentelemetry-collector:4317 + endpoint: telemetry-otlp-logs.kyma-system:4317 tls: insecure: true service: @@ -125,7 +125,7 @@ config: address: ${MY_POD_IP}:8888 pipelines: logs: - processors: {} + processors: [] exporters: - otlp @@ -142,7 +142,7 @@ ports: enabled: true serviceMonitor: - enabled: true + enabled: false metricsEndpoints: - port: metrics From 641dfcd815487558c839df1e71ca7fc0766f1587 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Wed, 18 Dec 2024 11:05:00 +0100 Subject: [PATCH 03/66] chore: LogAgent load test setup and config files --- hack/load-tests/log-agent-test-setup.yaml | 151 +++++++++++++++++++ hack/load-tests/log-backpressure-config.yaml | 21 +++ hack/load-tests/run-load-test.sh | 4 +- 3 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 hack/load-tests/log-agent-test-setup.yaml create mode 100644 hack/load-tests/log-backpressure-config.yaml diff --git a/hack/load-tests/log-agent-test-setup.yaml b/hack/load-tests/log-agent-test-setup.yaml new file mode 100644 index 000000000..7ecce4627 --- /dev/null +++ b/hack/load-tests/log-agent-test-setup.yaml @@ -0,0 +1,151 @@ +apiVersion: telemetry.kyma-project.io/v1alpha1 +kind: LogPipeline +metadata: + name: load-test-1 +spec: + output: + otlp: + endpoint: + value: http://log-receiver.log-load-test:4317 + tls: + insecure: true + insecureSkipVerify: true + protocol: grpc +--- +apiVersion: v1 +kind: Namespace +metadata: + name: log-load-test +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: log-load-generator + namespace: log-load-test +spec: + replicas: 11 + selector: + matchLabels: + 
app.kubernetes.io/name: logs-load-generator + template: + metadata: + labels: + app.kubernetes.io/name: logs-load-generator + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - logs-load-generator + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - -b=10485760 # 10MB + - -f=json + - -l + image: mingrammer/flog + imagePullPolicy: Always + name: flog + resources: + limits: + memory: 200Mi + requests: + cpu: 10m + memory: 50Mi +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: log-receiver + namespace: log-load-test + labels: + app.kubernetes.io/name: log-receiver +data: + config.yaml: | + receivers: + otlp: + protocols: + grpc: + endpoint: ${MY_POD_IP}:4317 + http: + endpoint: ${MY_POD_IP}:4318 + exporters: + debug: + + service: + pipelines: + logs: + receivers: + - otlp + exporters: + - debug +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/name: log-receiver + name: log-receiver + namespace: log-load-test +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: log-receiver + template: + metadata: + labels: + app.kubernetes.io/name: log-receiver + sidecar.istio.io/inject: "true" + spec: + volumes: + - name: collector-config + configMap: + name: log-receiver + securityContext: + fsGroup: 101 + containers: + - image: europe-docker.pkg.dev/kyma-project/prod/kyma-otel-collector:0.114.0-main + name: otel-collector + resources: + limits: + memory: 1024Mi + requests: + memory: 1024Mi + volumeMounts: + - name: collector-config + mountPath: /etc/collector + args: + - --config=/etc/collector/config.yaml + env: + - name: MY_POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/name: log-receiver + name: log-receiver + namespace: 
log-load-test +spec: + ports: + - name: grpc-otlp + port: 4317 + protocol: TCP + targetPort: 4317 + - name: http-otlp + port: 4318 + protocol: TCP + targetPort: 4318 + selector: + app.kubernetes.io/name: log-receiver diff --git a/hack/load-tests/log-backpressure-config.yaml b/hack/load-tests/log-backpressure-config.yaml new file mode 100644 index 000000000..82ba506a1 --- /dev/null +++ b/hack/load-tests/log-backpressure-config.yaml @@ -0,0 +1,21 @@ +apiVersion: networking.istio.io/v1 +kind: VirtualService +metadata: + name: log-receiver-fault + namespace: log-load-test +spec: + hosts: + - log-receiver + http: + - fault: + abort: + httpStatus: 503 + percentage: + value: 70 + delay: + percentage: + value: 70 + fixedDelay: 1s + route: + - destination: + host: log-receiver \ No newline at end of file diff --git a/hack/load-tests/run-load-test.sh b/hack/load-tests/run-load-test.sh index e02bfd547..a3a931ae3 100755 --- a/hack/load-tests/run-load-test.sh +++ b/hack/load-tests/run-load-test.sh @@ -383,9 +383,9 @@ function get_result_and_cleanup_metricagent() { function get_result_and_cleanup_log_otel() { RESULT_TYPE="log" - QUERY_RECEIVED='query=round(sum(rate(otelcol_receiver_accepted_log_records{service="log-gateway-metrics"}[20m])))' + QUERY_RECEIVED='query=round(sum(rate(otelcol_receiver_accepted_log_records{service=~"log-gateway-metrics"}[20m])))' QUERY_EXPORTED='query=round(sum(rate(otelcol_exporter_sent_log_records{service=~"log-gateway-metrics"}[20m])))' - QUERY_QUEUE='query=avg(sum(otelcol_exporter_queue_size{service="log-gateway-metrics"}))' + QUERY_QUEUE='query=avg(sum(otelcol_exporter_queue_size{service=~"log-gateway-metrics"}))' QUERY_MEMORY='query=round(sum(avg_over_time(container_memory_working_set_bytes{namespace="log-load-test", container="collector"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="log-load-test", workload="log-gateway"}[20m])) by (pod) / 1024 / 1024)' 
QUERY_CPU='query=round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="log-load-test"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="log-load-test", workload="log-gateway"}[20m])) by (pod), 0.1)' From babe0c7615d5f6ab5ed442f58640c7ef3170baa1 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Fri, 20 Dec 2024 14:24:44 +0100 Subject: [PATCH 04/66] chore: Update load test files --- docs/contributor/benchmarks/load-test-logs.md | 6 +++--- hack/load-tests/log-agent-test-setup.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/contributor/benchmarks/load-test-logs.md b/docs/contributor/benchmarks/load-test-logs.md index 2b51fd2ba..bd0e3bfb7 100644 --- a/docs/contributor/benchmarks/load-test-logs.md +++ b/docs/contributor/benchmarks/load-test-logs.md @@ -34,9 +34,9 @@ The tests are executed for 20 minutes, so that each test case has a stabilized o
| config | logs received l/s | logs exported l/s | logs queued | cpu | memory MB | no. restarts of gateway | no. restarts of generator | -| --- | --- | --- | --- | --- | --- | ---| -| single | 7193 | 7195 | 16824 | 2.5 | 826 | 0 | 1 | -| batch | 16428 | 16427 | 0 | 3 | 265 | 0 | 1 | +| ------ | ----------------- | ----------------- | ----------- | --- | --------- | ----------------------- | ------------------------- | +| single | 7193 | 7195 | 16824 | 2.5 | 826 | 0 | 1 | +| batch | 16428 | 16427 | 0 | 3 | 265 | 0 | 1 |
## Interpretation diff --git a/hack/load-tests/log-agent-test-setup.yaml b/hack/load-tests/log-agent-test-setup.yaml index 7ecce4627..9b9326926 100644 --- a/hack/load-tests/log-agent-test-setup.yaml +++ b/hack/load-tests/log-agent-test-setup.yaml @@ -23,7 +23,7 @@ metadata: name: log-load-generator namespace: log-load-test spec: - replicas: 11 + replicas: 10 selector: matchLabels: app.kubernetes.io/name: logs-load-generator @@ -115,9 +115,9 @@ spec: name: otel-collector resources: limits: - memory: 1024Mi + memory: 2048Mi requests: - memory: 1024Mi + memory: 2048Mi volumeMounts: - name: collector-config mountPath: /etc/collector From 18eb9ed846f876ef3104dd2bda70ef238de1201d Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Fri, 20 Dec 2024 14:49:31 +0100 Subject: [PATCH 05/66] docs: Add the log agent load test investigations results and final configuration --- .../benchmarks/otlp-logs-validation.md | 296 ++++++++++++++++++ .../benchmarks/otlp-logs-validation.yaml | 244 +++++++++++++++ 2 files changed, 540 insertions(+) create mode 100644 docs/contributor/benchmarks/otlp-logs-validation.md create mode 100644 docs/contributor/benchmarks/otlp-logs-validation.yaml diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md new file mode 100644 index 000000000..e8e5da83a --- /dev/null +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -0,0 +1,296 @@ +# OTel LogPipeline set-up validation + +This file documents the process of validating the whole LogPipeline with OTLP output flow. It starts by defining the setup, that consists of the manually deployed log agent, the already-implemented log gateway, and log generators using flog. + +The scope is to performance test the agent, observing the resulting values, in terms of throughput, resource consumption, reaction to backpressure, etc. + + + +## 1. 
Set-up configuration steps + +### With Helm + +``` bash +k apply -f telemetry-manager/config/samples/operator_v1alpha1_telemetry.yaml + +// Execute knowledge-hub/scripts/create_cls_log_pipeline.sh with the corresponding environment variables + +helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts + +helm install -n kyma-system logging open-telemetry/opentelemetry-collector -f telemetry-manager/docs/contributor/pocs/assets/otel-log-agent-values.yaml +``` + +### Manual + +``` bash +k apply -f telemetry-manager/config/samples/operator_v1alpha1_telemetry.yaml + +// Execute knowledge-hub/scripts/create_cls_log_pipeline.sh with the corresponding environment variables + +k apply -f ./otlp-logs-validation.yaml +``` + + + +## 2. Resulting Resources + +### Agent ConfigMap (OTel Config) + +See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) + +#### Things to take into consideration (at implementation) +- Dynamically inclusion/exclusion of namespaces, based on LogPipeline spec attributes +- Exclude FluentBit container in OTel configuration and OTel container in FluentBit configuration +- `receivers/filelog/operators`: The copy body to `attributes.original` must be avoided if `dropLogRawBody` flag is enabled + +### Agent DaemonSet + +See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) + +### How does checkpointing work + +- By enabling the storeCheckpoint preset (Helm) the `file_storage` extension is activated in the receiver +- The `file_storage` has the path `/var/lib/otelcol` +- This is later mounted as a `hostPath` volume in the DaemonSet spec +- Also set in the `storage` property of the filelog receiver + +> `storage` = The ID of a storage extension to be used to store file offsets. File offsets allow the receiver to pick up where it left off in the case of a collector restart. If no storage extension is used, the receiver will manage offsets in memory only. + + + +## 3. 
Benchmarking and Performance Tests Results + +Setup Configuration: +``` bash +k create ns prometheus +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +helm upgrade --install -n "prometheus" "prometheus" prometheus-community/kube-prometheus-stack -f hack/load-tests/values.yaml --set grafana.adminPassword=myPwd + +k apply -f telemetry-manager/hack/load-tests/log-agent-test-setup.yaml +``` + +For the 🏋️‍♀️ Backpressure Scenario additionally apply: +``` bash +k apply -f telemetry-manager/hack/load-tests/log-backpressure-config.yaml +``` + +PromQL Queries: +``` sql +-- RECEIVED +round(sum(rate(otelcol_receiver_accepted_log_records{service="telemetry-log-agent-metrics"}[20m]))) + +-- EXPORTED +round(sum(rate(otelcol_exporter_sent_log_records{service="telemetry-log-agent-metrics"}[20m]))) + +-- QUEUE +avg(sum(otelcol_exporter_queue_size{service="telemetry-log-agent-metrics"})) + +-- MEMORY +round(sum(avg_over_time(container_memory_working_set_bytes{namespace="kyma-system", container="collector"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-log-agent"}[20m])) by (pod) / 1024 / 1024) + +-- CPU +round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="kyma-system"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-log-agent"}[20m])) by (pod), 0.1) +``` + +### ⭐️ Best Results (Scenario: Single Pipeline) +| Batching | RECEIVED | EXPORTED | QUEUE | MEMORY | CPU | +| :------: | :-------: | :-------: | :---: | :----: | :---: | +| ❌ | max. 8.9K | max. 
8.9K | 0 | ~63 | ~0.5 | +| ✅ | 8.6K | 8.6k | 0 | ~73 | ~0.6 | + +### ⭐️🏋️‍♀️ Best Results (Scenario: Single Pipeline with Backpressure) +| Batching | RECEIVED | EXPORTED | QUEUE | MEMORY | CPU | +| :------: | :------: | :------: | :---: | :----: | :---: | +| ❌ | 6.8K | 6.8K | ~328 | ~66 | ~0.5 | +| ✅ | - | - | - | - | - | + +### 📊 Benchmarking Sessions + +| Icon | Meaning | +| ---- | ---------------------------------------------------- | +| ⏳ | Full-test, involving the whole setup, usually 20 min | +| 🪲 | Debugging session, usually shorter, not so reliable | +| 🏋️‍♀️ | Backpressure Scenario | +| ⭐️ | Best results observed (in a given scenario) | + +#### ⏳ 18 Dec 2024, 13:45 - 14:05 (20 min) +- **Generator:** 10 replicas x 10 MB +- **Agent:** no CPU limit, no queue +- **Results:** + - Agent RECEIVED/EXPORTED: 6.06K + - Agent Memory: + - Pod1: 70 + - Pod2: 70 + - Agent CPU: + - Pod1: 0.5 + - Pod2: 0.4 + - Gateway RECEIVED/EXPORTED: 6.09K + - Gateway QUEUE: 0 + +#### ⏳ 18 Dec 2024, 14:08 - 14:28 (20 min) +- **Generator:** 20 replicas x 10 MB +- **Agent:** no CPU limit, no queue +- **Results:** + - Agent RECEIVED/EXPORTED: 4.93K + - Agent Memory: + - Pod1: 71 + - Pod2: 72 + - Agent CPU: + - Pod1: 0.5 + - Pod2: 0.4 + - Gateway RECEIVED/EXPORTED: 4.93K + - Gateway QUEUE: 0 (max. 
6 at some point) + +#### ⏳ 18 Dec 2024, 14:50 - 15:10 (20 min) +- **Generator:** 10 replicas x 20 MB +- **Agent:** no CPU limit, no queue +- **Results:** + - Agent RECEIVED/EXPORTED: 5.94K + - Agent Memory: + - Pod1: 76 + - Pod2: 81 + - Agent CPU: + - Pod1: 0.5 + - Pod2: 0.5 + - Gateway RECEIVED/EXPORTED: 5.94K + - Gateway QUEUE: 0 + +#### ⏳⭐️ 18 Dec 2024, 15:24 - 15:34 (10 min) +- **Generator:** 10 replicas x 10 MB +- **Agent:** with CPU limit (1), no queue +- **Results:** + - Agent RECEIVED/EXPORTED: 8.9K + - Agent Memory: 64/62 + - Agent CPU: 0.5/0.5 + - Gateway RECEIVED/EXPORTED: 8.9K + - Gateway QUEUE: 0 + +#### 🏋️‍♀️⭐️ 18 Dec 2024, 15:36 - 15:56 (20 min) (backpressure scenario) +- **Generator:** 10 replicas x 10 MB +- **Agent:** with CPU limit (1), no queue +- **Results:** + - Agent RECEIVED/EXPORTED: 6.8K + - Agent Memory: + - Pod1: 66 + - Pod2: 67 + - Agent CPU: + - Pod1: 0.6 + - Pod2: 0.5 + - Gateway RECEIVED: 6.8K + - Gateway EXPORTED: 256 + - Gateway QUEUE: 328 +- **Remarks:** + - Agent does not stop when gateway refuses logs (because backpressure does not backpropagate) + - It slows down/stops in other scenarios (see bellow) => SUCCESS + +#### 🪲 19 Dec 2024, Agent exports logs to a debug endpoint (5 min) +- no networking involved +- 12/14 log generators x 10 MB + - 19.5K => ~20K + - MEM: 43/47 + - CPU: 0.7/0.8 + +#### 🪲 19 Dec 2024, Agent exports logs directly to mock backend (5 min) +- networking, but avoiding gateway +- 10 log generators x 10 MB + - 5.3K + - MEM: 58/59 + - CPU: 0.4/0.5 +- 12 log generators x 10 MB + - not increasing + +#### 🪲 19 Dec 2024, Agent exports logs directly to mock backend with batching processor (5 min) +- networking, but with batching mechanism in-place +- 10 log generators x 10 MB, batch size: 1024 + - 8.3K + - MEM: 68/73 + - CPU: 0.5/0.6 +- 12 log generators x 10 MB, batch size: 1024 + - starts decreasing (~7.5K) +- 10 log generators x 10 MB, batch size: 2048 + - ~9K + - MEM: 74/79 + - CPU: 0.6/0.7 + +#### ⏳ 19 Dec 2024, 
13:46 - 14:06 (20 min) +- **Generator:** 10 replicas x 10 MB +- **Agent:** with CPU limit (1), no queue, with batch processing (1024) +- **Results:** + - Agent RECEIVED/EXPORTED: 8.46K + - Gateway RECEIVED/EXPORTED: 8.46K + - Agent Memory: 69/76 + - Agent CPU: 0.5/0.7 + - Gateway QUEUE: 0 (max 191) + +#### ⏳ 19 Dec 2024, ??:?? - ??:?? (20 min) +- **Generator:** 10 replicas x 10 MB +- **Agent:** with CPU limit (1), no queue, with batch processing (2048) +- **Results:** + - lower throughput as for the 1024 scenario + +#### ⏳⭐️ 19 Dec 2024, 15:55 - 16:15 (20 min) +- **Agent:** with CPU limit (1), no queue, with batch processing (1024) +- **Mock Backend:** memory limit x2 (2048Mi) +- **Generator:** 10 replicas x 10 MB + - **Results:** + - Agent RECEIVED/EXPORTED: 8.18K + - Gateway RECEIVED/EXPORTED: 8.18K + - Agent Memory: 70/71 + - Agent CPU: 0.6/0.6 + - Gateway QUEUE: 0 +- **Generator:** 12 replicas x 10 MB (16:18 - 16:35) + - **Results:** + - Agent RECEIVED/EXPORTED: 8.6k + - Gateway RECEIVED/EXPORTED: 8.6k + - Agent Memory: 73/74 + - Agent CPU: 0.7/0.6 + - Gateway QUEUE: 0 +- - **Generator:** 14 replicas x 10 MB (16:35 - 16:40) + - **Results:** + - Agent RECEIVED/EXPORTED: 7.54K + - Gateway RECEIVED/EXPORTED: 7.54K + - lower + +#### ⏳ 19 Dec 2024, 16:50 - 17:10 (20 min) +- **Generator:** 12 replicas x 10 MB +- **Agent:** with CPU limit (1), no queue, with batch processing (2048) +- **Mock Backend:** memory limit x2 (2048Mi) +- **Results:** + - Agent RECEIVED/EXPORTED: 8.1K + - Gateway RECEIVED/EXPORTED: 8.11K + - Agent Memory: 74/81 + - Agent CPU: 0.6/0.5 + - Gateway QUEUE: 0 (max 2) + +#### 🪲 20 Dec 2024, Multiple agents loading the gateway (5 min) +- **Setup:** 10 nodes, 10 agents, 1 generator / node (DaemonSet) +- **Results (WITH BATCHING):** + - Agent RECEIVED/EXPORTED: 61.5K => 6.1K / agent instance + - Gateway RECEIVED/EXPORTED: 61.5K/29.5K => 30K/14.7K / gateway instance + - Agent Memory: 61-68/agent + - Agent CPU: 0.4-0.8/agent + - Gateway QUEUE: 510 (max 
512, full) + - ~10% exporter failed enqueue logs + - 0% receiver refused logs + - 0% exporter send failed logs +- **Results (WITHOUT BATCHING):** + - Agent RECEIVED/EXPORTED: 31.4K => 3.1K / agent instance + - Gateway RECEIVED/EXPORTED: 31.4K => 11.4K / gateway instance + - Agent Memory: 61-68/agent + - Agent CPU: 0.4-0.5/agent + - Gateway QUEUE: 0 (max 6) + - 0% exporter failed enqueue logs + - 0% receiver refused logs + - 0% exporter send failed logs + + +## 4. Conclusions + +- A lower performance can be expected, compared to the FluentBit counterpart setup. +- Backpressure is currently not backpropagated from the gateway to the agent, resulting in logs being queued/lost on the gateway end, since the agent has no way of knowing when to stop, thus exports data continuously. (This is a known issue, that should get solved by the OTel community in the next half year) +- Agent slows down if the load is increased (i.e. more generators / more logs / more data). +- The network communication between the agent and the gateway or/and the gateway represent a bottleneck in this setup, since when using just a debug endpoint as an exporter, higher throughput was observed. +- CPU and Memory consumption are surprisingly low, and this was not improved by removing the limits (quite the opposite was observed, with the CPU throttling more often and the throughput decreasing). +- When enabling the batch processor, throughput was increasing, but this comes at the cost of losing logs in some scenarios. +- More/other methods of improving the throughput might still be worth investigating. 
\ No newline at end of file diff --git a/docs/contributor/benchmarks/otlp-logs-validation.yaml b/docs/contributor/benchmarks/otlp-logs-validation.yaml new file mode 100644 index 000000000..38b0a7764 --- /dev/null +++ b/docs/contributor/benchmarks/otlp-logs-validation.yaml @@ -0,0 +1,244 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/name: telemetry-log-agent + name: telemetry-log-agent + namespace: kyma-system +--- +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/port: "8888" + prometheus.io/scheme: http + prometheus.io/scrape: "true" + labels: + app.kubernetes.io/name: telemetry-log-agent + telemetry.kyma-project.io/self-monitor: enabled + name: telemetry-log-agent-metrics + namespace: kyma-system +spec: + internalTrafficPolicy: Cluster + ipFamilies: + - IPv4 + ipFamilyPolicy: SingleStack + ports: + - name: http-metrics + port: 8888 + protocol: TCP + targetPort: 8888 + selector: + app.kubernetes.io/name: telemetry-log-agent + sessionAffinity: None + type: ClusterIP +status: + loadBalancer: {} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/name: telemetry-log-agent + name: telemetry-log-agent + namespace: kyma-system +data: + relay: | + exporters: + otlp: + endpoint: telemetry-otlp-logs.kyma-system:4317 + tls: + insecure: true + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + sending_queue: + enabled: false + + extensions: + file_storage: + directory: /var/lib/otelcol + health_check: + endpoint: ${env:MY_POD_IP}:13133 + pprof: + endpoint: 127.0.0.1:1777 + + processors: + memory_limiter: + check_interval: 5s + limit_percentage: 80 + spike_limit_percentage: 25 + transform/set-instrumentation-scope-runtime: + error_mode: ignore + metric_statements: + - context: scope + statements: + - set(version, "main") + - set(name, "io.kyma-project.telemetry/runtime") + + receivers: + filelog: + exclude: + - 
/var/log/pods/kyma-system_telemetry-log-agent*/*/*.log # exclude self + - /var/log/pods/kyma-system_telemetry-fluent-bit*/*/*.log # exclude FluentBit + include: + - /var/log/pods/*/*/*.log + include_file_name: false + include_file_path: true + operators: + - type: container + id: container-parser + add_metadata_from_filepath: true + format: containerd + - from: attributes.stream + if: attributes.stream != nil + to: attributes["log.iostream"] + type: move + - if: body matches "^{.*}$" + parse_from: body + parse_to: attributes + type: json_parser + - from: body + to: attributes.original + type: copy + - from: attributes.message + if: attributes.message != nil + to: body + type: move + - from: attributes.msg + if: attributes.msg != nil + to: body + type: move + - if: attributes.level != nil + parse_from: attributes.level + type: severity_parser + retry_on_failure: + enabled: true + start_at: beginning + storage: file_storage + + service: + extensions: + - health_check + - pprof + - file_storage + pipelines: + logs: + exporters: + - otlp + processors: + - memory_limiter + - transform/set-instrumentation-scope-runtime + receivers: + - filelog + telemetry: + metrics: + readers: + - pull: + exporter: + prometheus: + host: ${MY_POD_IP} + port: 8888 +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app.kubernetes.io/name: telemetry-log-agent + name: telemetry-log-agent + namespace: kyma-system +spec: + selector: + matchLabels: + app.kubernetes.io/name: telemetry-log-agent + template: + metadata: + labels: + app.kubernetes.io/name: telemetry-log-agent + sidecar.istio.io/inject: "true" + annotations: + traffic.sidecar.istio.io/excludeInboundPorts: 8888,15020 + traffic.sidecar.istio.io/includeInboundPorts: "*" + traffic.sidecar.istio.io/includeOutboundIPRanges: "*" + spec: + containers: + - args: + - --config=/conf/relay.yaml + env: + - name: MY_POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + image: 
europe-docker.pkg.dev/kyma-project/prod/kyma-otel-collector:0.114.0-main + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 13133 + scheme: HTTP + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + name: collector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 13133 + scheme: HTTP + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + resources: + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 100m + memory: 50Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + runAsNonRoot: false + runAsUser: 0 + seccompProfile: + type: RuntimeDefault + volumeMounts: + - mountPath: /conf + name: config + - mountPath: /var/log/pods + name: varlogpods + readOnly: true + - mountPath: /var/lib/otelcol + name: varlibotelcol + priorityClassName: telemetry-priority-class-high + securityContext: + runAsNonRoot: false + seccompProfile: + type: RuntimeDefault + serviceAccountName: telemetry-log-agent + terminationGracePeriodSeconds: 30 + volumes: + - configMap: + defaultMode: 420 + items: + - key: relay + path: relay.yaml + name: telemetry-log-agent + name: config + - hostPath: + path: /var/log/pods + type: "" + name: varlogpods + - hostPath: + path: /var/lib/otelcol + type: DirectoryOrCreate + name: varlibotelcol From 64ae962e8c0ef7670dce1dd0fa5056ba31701f85 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Fri, 20 Dec 2024 15:34:58 +0100 Subject: [PATCH 06/66] chore: Add additional load test instruction --- docs/contributor/benchmarks/otlp-logs-validation.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index e8e5da83a..2c9c55369 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -70,6 
+70,16 @@ helm upgrade --install -n "prometheus" "prometheus" prometheus-community/kube-pr k apply -f telemetry-manager/hack/load-tests/log-agent-test-setup.yaml ``` +For executing the load tests, the generated logs have to be isolated, hence the following line should be replaced in the ConfigMap of the log agent: + +``` yaml +receivers: + filelog: + # ... + include: + - /var/log/pods/*/*/*.log # replace with "/var/log/pods/log-load-test*/*flog*/*.log" +``` + For the 🏋️‍♀️ Backpressure Scenario additionally apply: ``` bash k apply -f telemetry-manager/hack/load-tests/log-backpressure-config.yaml @@ -246,7 +256,7 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - Agent Memory: 73/74 - Agent CPU: 0.7/0.6 - Gateway QUEUE: 0 -- - **Generator:** 14 replicas x 10 MB (16:35 - 16:40) +- **Generator:** 14 replicas x 10 MB (16:35 - 16:40) - **Results:** - Agent RECEIVED/EXPORTED: 7.54K - Gateway RECEIVED/EXPORTED: 7.54K From e52ed389dbdff57dff74104d54c2c0e7cbedebe8 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Mon, 13 Jan 2025 11:42:39 +0100 Subject: [PATCH 07/66] chore: WIP --- docs/contributor/benchmarks/otlp-logs-validation.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 2c9c55369..c870c0fcb 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -2,7 +2,7 @@ This file documents the process of validating the whole LogPipeline with OTLP output flow. It starts by defining the setup, that consists of the manually deployed log agent, the already-implemented log gateway, and log generators using flog. -The scope is to performance test the agent, observing the resulting values, in terms of throughput, resource consumption, reaction to backpressure, etc. 
+The scope is to performance test the agent, observing the resulting values, in terms of throughput, resource consumption, reaction to backpressure, etc. And compare it to the previous FluentBit-based setup. @@ -295,7 +295,13 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - 0% exporter send failed logs -## 4. Conclusions +## 4. Comparison with FluentBit setup +In the FluentBit setup, for the very same scenario, the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: +- Exported Log Records per second: 3.913 +- Received Log Records per second: 3.868 + + +## 5. Conclusions - A lower performance can be expected, compared to the FluentBit counterpart setup. - Backpressure is currently not backpropagated from the gateway to the agent, resulting in logs being queued/lost on the gateway end, since the agent has no way of knowing when to stop, thus exports data continuously. (This is a known issue, that should get solved by the OTel community in the next half year) From 88f438531c35ae39f437a7e58a1c1d9160168aae Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Wed, 15 Jan 2025 17:11:43 +0100 Subject: [PATCH 08/66] chore: New findings --- .../benchmarks/otlp-logs-validation.md | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index c870c0fcb..f0d190890 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -239,7 +239,7 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - **Results:** - lower throughput as for the 1024 scenario -#### ⏳⭐️ 19 Dec 2024, 15:55 - 16:15 (20 min) +#### ⏳ 19 Dec 2024, 15:55 - 16:15 (20 min) - **Agent:** with CPU limit (1), no queue, with batch processing (1024) - **Mock Backend:** 
memory limit x2 (2048Mi) - **Generator:** 10 replicas x 10 MB @@ -294,11 +294,38 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - 0% receiver refused logs - 0% exporter send failed logs +#### ⏳ 15 Jan 2025, 12:31 - 12:51 (20 min) +- **Generator:** 10 replicas x 10 MB +- **Results:** + - Agent RECEIVED/EXPORTED: 14.4K + - Gateway RECEIVED/EXPORTED: 14.4K + - Agent Memory: 74/69 + - Agent CPU: 0.9/0.8 + - Gateway QUEUE: 0 + +#### ⏳⭐️ 15 Jan 2025, 14:31 - 14:08 (20 min) +- Gateway on 2 separate nodes +- **Generator:** 10 replicas x 10 MB +- **Results:** + - Agent RECEIVED/EXPORTED: 15.7K + - Gateway RECEIVED/EXPORTED: 15.7K + - Agent Memory: 82/71 + - Agent CPU: 1/0.9 + - Gateway CPU: 0.6/0.6 + - Gateway Memory: 62/68 + - Gateway QUEUE: 0 + +#### 🪲 15 Jan 2025, Agent exports logs to a debug endpoint (5 min) +- no networking involved +- ~15K / agent + + +#### Removing compression for the OTLP exporter boosts throughput + ## 4. Comparison with FluentBit setup In the FluentBit setup, for the very same scenario, the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: -- Exported Log Records per second: 3.913 -- Received Log Records per second: 3.868 +- Exported Log Records/second: 27.8K ## 5. 
Conclusions From cb8054415e8765e7378a3dbc385399d4b985ba0a Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Thu, 16 Jan 2025 11:02:06 +0100 Subject: [PATCH 09/66] chore: configuration WIP --- .../benchmarks/otlp-logs-validation.md | 14 +++- .../log-agent-setup-telemetrygen.yml | 74 +++++++++++++++++++ .../log-agent-test-setup-generator.yml | 41 ++++++++++ hack/load-tests/log-agent-test-setup.yaml | 49 +----------- 4 files changed, 128 insertions(+), 50 deletions(-) create mode 100644 hack/load-tests/log-agent-setup-telemetrygen.yml create mode 100644 hack/load-tests/log-agent-test-setup-generator.yml diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index f0d190890..d047ba303 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -166,7 +166,7 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - Gateway RECEIVED/EXPORTED: 5.94K - Gateway QUEUE: 0 -#### ⏳⭐️ 18 Dec 2024, 15:24 - 15:34 (10 min) +#### ⏳ 18 Dec 2024, 15:24 - 15:34 (10 min) - **Generator:** 10 replicas x 10 MB - **Agent:** with CPU limit (1), no queue - **Results:** @@ -304,7 +304,7 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - Gateway QUEUE: 0 #### ⏳⭐️ 15 Jan 2025, 14:31 - 14:08 (20 min) -- Gateway on 2 separate nodes +- Gateways on separate nodes - **Generator:** 10 replicas x 10 MB - **Results:** - Agent RECEIVED/EXPORTED: 15.7K @@ -319,9 +319,16 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - no networking involved - ~15K / agent - #### Removing compression for the OTLP exporter boosts throughput +#### ⏳ 15 Jan 2025, ? - ? 
(20 min) +- Gateways on separate nodes +- Compression disabled for OTLP exporters (on both agent and gateway) (default: gzip) +- **Generator:** 20 replicas (new set-up) +- **Results:** + - Agent RECEIVED/EXPORTED: 15.3K + - Gateway RECEIVED/EXPORTED: 15.3K + ## 4. Comparison with FluentBit setup In the FluentBit setup, for the very same scenario, the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: @@ -329,7 +336,6 @@ In the FluentBit setup, for the very same scenario, the [load test](https://gith ## 5. Conclusions - - A lower performance can be expected, compared to the FluentBit counterpart setup. - Backpressure is currently not backpropagated from the gateway to the agent, resulting in logs being queued/lost on the gateway end, since the agent has no way of knowing when to stop, thus exports data continuously. (This is a known issue, that should get solved by the OTel community in the next half year) - Agent slows down if the load is increased (i.e. more generators / more logs / more data). 
diff --git a/hack/load-tests/log-agent-setup-telemetrygen.yml b/hack/load-tests/log-agent-setup-telemetrygen.yml new file mode 100644 index 000000000..4ebf5ffea --- /dev/null +++ b/hack/load-tests/log-agent-setup-telemetrygen.yml @@ -0,0 +1,74 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/name: log-load-generator + name: log-load-generator + namespace: log-load-test +spec: + replicas: 20 + selector: + matchLabels: + app.kubernetes.io/name: log-load-generator + template: + metadata: + labels: + app.kubernetes.io/name: log-load-generator + sidecar.istio.io/inject: "true" + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - log-load-generator + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - image: TELEMETRY_GEN_IMAGE + args: + - logs + - --otlp-insecure + - --otlp-endpoint + - "telemetry-otlp-logs.kyma-system:4317" + - --otlp-attributes + - "service.name=\"log-load-generator\"" + - --workers + - "100" + - --duration + - "20m" + - --rate + - "10000000" + - --interval + - "30s" + - --telemetry-attributes + - "key1=\"SimSimulates a client generating logs. (Stability level: Development)\"" + - --telemetry-attributes + - "key2=\"SimSimulates a client generating logs. (Stability level: Development)\"" + - --telemetry-attributes + - "key3=\"SimSimulates a client generating logs. (Stability level: Development)\"" + - --telemetry-attributes + - "key4=\"SimSimulates a client generating logs. (Stability level: Development)\"" + - --telemetry-attributes + - "key5=\"SimSimulates a client generating logs. (Stability level: Development)\"" + - --telemetry-attributes + - "key6=\"SimSimulates a client generating logs. (Stability level: Development)\"" + - --telemetry-attributes + - "key7=\"SimSimulates a client generating logs. 
(Stability level: Development)\"" + - --telemetry-attributes + - "key8=\"SimSimulates a client generating logs. (Stability level: Development)\"" + - --telemetry-attributes + - "key9=\"SimSimulates a client generating logs. (Stability level: Development)\"" + - --telemetry-attributes + - "key10=\"SimSimulates a client generating logs. (Stability level: Development)\"" + imagePullPolicy: IfNotPresent + name: telemetrygen + resources: + limits: + memory: 256Mi + requests: + memory: 256Mi \ No newline at end of file diff --git a/hack/load-tests/log-agent-test-setup-generator.yml b/hack/load-tests/log-agent-test-setup-generator.yml new file mode 100644 index 000000000..14e29e6d8 --- /dev/null +++ b/hack/load-tests/log-agent-test-setup-generator.yml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: log-load-generator + namespace: log-load-test +spec: + replicas: 20 + selector: + matchLabels: + app.kubernetes.io/name: logs-load-generator + template: + metadata: + labels: + app.kubernetes.io/name: logs-load-generator + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - logs-load-generator + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - -f=json + - -l + image: mingrammer/flog + imagePullPolicy: Always + name: flog + resources: + limits: + cpu: 50m + memory: 200Mi + requests: + cpu: 10m + memory: 50Mi diff --git a/hack/load-tests/log-agent-test-setup.yaml b/hack/load-tests/log-agent-test-setup.yaml index 9b9326926..00a9ea9a7 100644 --- a/hack/load-tests/log-agent-test-setup.yaml +++ b/hack/load-tests/log-agent-test-setup.yaml @@ -17,48 +17,6 @@ kind: Namespace metadata: name: log-load-test --- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: log-load-generator - namespace: log-load-test -spec: - replicas: 10 - selector: - matchLabels: - 
app.kubernetes.io/name: logs-load-generator - template: - metadata: - labels: - app.kubernetes.io/name: logs-load-generator - spec: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app.kubernetes.io/name - operator: In - values: - - logs-load-generator - topologyKey: kubernetes.io/hostname - weight: 100 - containers: - - args: - - -b=10485760 # 10MB - - -f=json - - -l - image: mingrammer/flog - imagePullPolicy: Always - name: flog - resources: - limits: - memory: 200Mi - requests: - cpu: 10m - memory: 50Mi ---- apiVersion: v1 kind: ConfigMap metadata: @@ -76,15 +34,14 @@ data: http: endpoint: ${MY_POD_IP}:4318 exporters: - debug: - + nop: service: pipelines: logs: receivers: - otlp exporters: - - debug + - nop --- apiVersion: apps/v1 kind: Deployment @@ -111,7 +68,7 @@ spec: securityContext: fsGroup: 101 containers: - - image: europe-docker.pkg.dev/kyma-project/prod/kyma-otel-collector:0.114.0-main + - image: otel/opentelemetry-collector-contrib:0.114.0 name: otel-collector resources: limits: From dc4081b5cea95beaf592d4af122454fe4e50a45e Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Thu, 16 Jan 2025 13:38:54 +0100 Subject: [PATCH 10/66] WIP --- docs/contributor/benchmarks/otlp-logs-validation.md | 12 ++++++++++++ hack/load-tests/metric-agent-test-setup.yaml | 1 + 2 files changed, 13 insertions(+) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index d047ba303..7045ea4e8 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -329,6 +329,18 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - Agent RECEIVED/EXPORTED: 15.3K - Gateway RECEIVED/EXPORTED: 15.3K +#### ⏳⭐️ 16 Jan 2025, ~13:17 +- Gateways on separate nodes +- **Generator:** 10 replicas +- **Results:** + - Agent 
RECEIVED/EXPORTED: 18.8K + - Gateway RECEIVED/EXPORTED: 18.8K + - Agent Memory: 76/73 + - Agent CPU: 0.8/0.9 + - Gateway Memory: 69/27 + - Gateway CPU: 0.6/0.6 + - Gateway QUEUE: 1/0 + ## 4. Comparison with FluentBit setup In the FluentBit setup, for the very same scenario, the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: diff --git a/hack/load-tests/metric-agent-test-setup.yaml b/hack/load-tests/metric-agent-test-setup.yaml index 639a6a27f..5ed8109c2 100644 --- a/hack/load-tests/metric-agent-test-setup.yaml +++ b/hack/load-tests/metric-agent-test-setup.yaml @@ -66,6 +66,7 @@ spec: - --port=8080 resources: limits: + cpu: 200m memory: "256Mi" ports: - containerPort: 8080 From 64465ad33d89cef628bbf2bc76f3852b6454c909 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Fri, 17 Jan 2025 10:06:59 +0100 Subject: [PATCH 11/66] chore: documentation update --- .../benchmarks/otlp-logs-validation.md | 30 +++++++++++++------ .../log-agent-setup-telemetrygen.yml | 1 + 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 7045ea4e8..2a7171765 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -104,15 +104,15 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds ``` ### ⭐️ Best Results (Scenario: Single Pipeline) -| Batching | RECEIVED | EXPORTED | QUEUE | MEMORY | CPU | -| :------: | :-------: | :-------: | :---: | :----: | :---: | -| ❌ | max. 8.9K | max. 8.9K | 0 | ~63 | ~0.5 | -| ✅ | 8.6K | 8.6k | 0 | ~73 | ~0.6 | +| Batching | RECEIVED | EXPORTED | QUEUE | MEMORY | CPU | +| :------: | :------: | :------: | :---: | :----: | :---: | +| ❌ | ? | ? | ? | ? | ? | +| ✅ | ? | ? | ? | ? | ? 
| ### ⭐️🏋️‍♀️ Best Results (Scenario: Single Pipeline with Backpressure) | Batching | RECEIVED | EXPORTED | QUEUE | MEMORY | CPU | | :------: | :------: | :------: | :---: | :----: | :---: | -| ❌ | 6.8K | 6.8K | ~328 | ~66 | ~0.5 | +| ❌ | ? | ? | ? | ? | ? | | ✅ | - | - | - | - | - | ### 📊 Benchmarking Sessions @@ -317,9 +317,11 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds #### 🪲 15 Jan 2025, Agent exports logs to a debug endpoint (5 min) - no networking involved -- ~15K / agent +- ~15K / agent => ~30K -#### Removing compression for the OTLP exporter boosts throughput +#### Removing compression for the OTLP exporters (on both agent and gateway) +- boosts throughput in the 4 nodes scenario +- the change seemed to have no impact in the 2 nodes scenario #### ⏳ 15 Jan 2025, ? - ? (20 min) - Gateways on separate nodes @@ -329,8 +331,9 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - Agent RECEIVED/EXPORTED: 15.3K - Gateway RECEIVED/EXPORTED: 15.3K -#### ⏳⭐️ 16 Jan 2025, ~13:17 +#### ⏳⭐️ 16 Jan 2025, ~13:17 (20 min) - Gateways on separate nodes +- No Istio - **Generator:** 10 replicas - **Results:** - Agent RECEIVED/EXPORTED: 18.8K @@ -341,9 +344,18 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - Gateway CPU: 0.6/0.6 - Gateway QUEUE: 1/0 +#### ⏳⭐️ 16 Jan 2025, ~13:56 (20 min) +- No gateway involved, agent sending directly to mock backend +- With Istio +- **Generator:** 10 replicas +- **Results:** + - Agent RECEIVED/EXPORTED: 19K + - Agent Memory: 82/74 + - Agent CPU: 1.3/0.8 + ## 4. 
Comparison with FluentBit setup -In the FluentBit setup, for the very same scenario, the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: +In the FluentBit setup, for the very same (initial) scenario, the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: - Exported Log Records/second: 27.8K diff --git a/hack/load-tests/log-agent-setup-telemetrygen.yml b/hack/load-tests/log-agent-setup-telemetrygen.yml index 4ebf5ffea..5fba07b18 100644 --- a/hack/load-tests/log-agent-setup-telemetrygen.yml +++ b/hack/load-tests/log-agent-setup-telemetrygen.yml @@ -1,3 +1,4 @@ +# TODO apiVersion: apps/v1 kind: Deployment metadata: From 9f313af21028f86e2628ae7472c3054a34a618c8 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Fri, 17 Jan 2025 13:32:17 +0100 Subject: [PATCH 12/66] chore: Documentation insights --- .../benchmarks/otlp-logs-validation.md | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 2a7171765..87d447d61 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -294,6 +294,10 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - 0% receiver refused logs - 0% exporter send failed logs + + + + #### ⏳ 15 Jan 2025, 12:31 - 12:51 (20 min) - **Generator:** 10 replicas x 10 MB - **Results:** @@ -353,6 +357,50 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - Agent Memory: 82/74 - Agent CPU: 1.3/0.8 +#### 🪲 17 Jan 2025, ~10:36 +- 1 node +- No gateway involved, agent sending directly to mock backend +- With Istio +- Agent has everything removed (no processors) +- **Generator:** 5 replicas +- **Results (without batching):** + - Agent 
RECEIVED/EXPORTED: 11.8K / instance +- **Results (with batching):** + - Agent RECEIVED/EXPORTED: 14K / instance + +#### 🪲 17 Jan 2025, ~11:48 +- 1 node +- No gateway involved, agent sending directly to mock backend +- With Istio +- Agent has everything removed (no processors), then we incrementally add them +- **Generator:** 30 replicas (10m CPU limit) +- 📥 Debug Exporter: + - **Results (without batching):** + - Agent RECEIVED/EXPORTED: 16K / instance + - **Results (with batching):** + - Agent RECEIVED/EXPORTED: 22.4K / instance + - **Results (batching + filestorage):** + - Agent RECEIVED/EXPORTED: 20K / instance +- 📥 OTEL Exporter: + - **Results (batching + filestorage):** + - Agent RECEIVED/EXPORTED: 15K / instance + - **Results (batching + filestorage + sending queue):** + - Agent RECEIVED/EXPORTED: 15K / instance + +#### 🪲 17 Jan 2025, ~13:16 +- No gateway involved, agent sending directly to mock backend +- With Istio +- **2 nodes:** + - **Generator:** 60 replicas (10m CPU limit) + - Agent RECEIVED/EXPORTED: 28.6K + - Agent Memory: 78/71 + - Agent CPU: 1.3/1.3 +- **3 nodes:** + - **Generator:** 90 replicas (10m CPU limit) + - Agent RECEIVED/EXPORTED: 44.6K + - Agent Memory: ~76-90 + - Agent CPU: ~1.3 + ## 4. 
Comparison with FluentBit setup In the FluentBit setup, for the very same (initial) scenario, the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: From a95075aa255081a9343b91e52bbe8511737985ee Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Mon, 20 Jan 2025 10:27:15 +0100 Subject: [PATCH 13/66] chore: Fully document benchmarking session #2 --- .../benchmarks/otlp-logs-validation.md | 43 ++++++++----------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 87d447d61..1a61097c8 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -103,19 +103,7 @@ round(sum(avg_over_time(container_memory_working_set_bytes{namespace="kyma-syste round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="kyma-system"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-log-agent"}[20m])) by (pod), 0.1) ``` -### ⭐️ Best Results (Scenario: Single Pipeline) -| Batching | RECEIVED | EXPORTED | QUEUE | MEMORY | CPU | -| :------: | :------: | :------: | :---: | :----: | :---: | -| ❌ | ? | ? | ? | ? | ? | -| ✅ | ? | ? | ? | ? | ? | - -### ⭐️🏋️‍♀️ Best Results (Scenario: Single Pipeline with Backpressure) -| Batching | RECEIVED | EXPORTED | QUEUE | MEMORY | CPU | -| :------: | :------: | :------: | :---: | :----: | :---: | -| ❌ | ? | ? | ? | ? | ? 
| -| ✅ | - | - | - | - | - | - -### 📊 Benchmarking Sessions +### 📊 Benchmarking Session #1 | Icon | Meaning | | ---- | ---------------------------------------------------- | @@ -294,9 +282,7 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - 0% receiver refused logs - 0% exporter send failed logs - - - +### 📊 Benchmarking Session #2 #### ⏳ 15 Jan 2025, 12:31 - 12:51 (20 min) - **Generator:** 10 replicas x 10 MB @@ -397,21 +383,28 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - Agent CPU: 1.3/1.3 - **3 nodes:** - **Generator:** 90 replicas (10m CPU limit) - - Agent RECEIVED/EXPORTED: 44.6K + - Agent RECEIVED/EXPORTED: 44.6K - Agent Memory: ~76-90 - Agent CPU: ~1.3 ## 4. Comparison with FluentBit setup -In the FluentBit setup, for the very same (initial) scenario, the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: +In the FluentBit setup, for the very same (initial) scenario (i.e. 10 generator replicas [old set-up] / 2 agents), the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: - Exported Log Records/second: 27.8K ## 5. Conclusions -- A lower performance can be expected, compared to the FluentBit counterpart setup. -- Backpressure is currently not backpropagated from the gateway to the agent, resulting in logs being queued/lost on the gateway end, since the agent has no way of knowing when to stop, thus exports data continuously. (This is a known issue, that should get solved by the OTel community in the next half year) -- Agent slows down if the load is increased (i.e. more generators / more logs / more data). -- The network communication between the agent and the gateway or/and the gateway represent a bottleneck in this setup, since when using just a debug endpoint as an exporter, higher throughput was observed. 
-- CPU and Memory consumption are surprisingly low, and this was not improved by removing the limits (quite the opposite was observed, with the CPU throttling more often and the throughput decreasing). -- When enabling the batch processor, throughput was increasing, but this comes at the cost of losing logs in some scenarios. -- More/other methods of improving the throughput might still be worth investigating. \ No newline at end of file +- Before 15 Jan. (first session): + - A lower performance can be expected, compared to the FluentBit counterpart setup. + - Backpressure is currently not backpropagated from the gateway to the agent, resulting in logs being queued/lost on the gateway end, since the agent has no way of knowing when to stop, thus exports data continuously. (This is a known issue, that should get solved by the OTel community in the next half year) + - Agent slows down if the load is increased (i.e. more generators / more logs / more data). + - The network communication between the agent and the gateway or/and the gateway represent a bottleneck in this setup, since when using just a debug endpoint as an exporter, higher throughput was observed. + - CPU and Memory consumption are surprisingly low, and this was not improved by removing the limits (quite the opposite was observed, with the CPU throttling more often and the throughput decreasing). + - When enabling the batch processor, throughput was increasing, but this comes at the cost of losing logs in some scenarios. + - More/other methods of improving the throughput might still be worth investigating. +- After 15 Jan.
(second session): + - Removing the gateway improves throughput + - We now better understand the performance impact of each OTEL processor and of enabling/disabling compression + - Generator configuration greatly influences the setup => more generators exporting less data and taking less CPU leads to higher throughput than fewer generators taking more CPU and exporting more data + - There is a hard limit (see debug endpoint scenario) that we still do not fully understand, since strictly based on the benchmarking numbers of OTEL, we should be getting higher throughput (i.e. something related to the infrastructure could be influencing this). + - We now have a more performant setup configuration, being more comparable with the numbers from the FluentBit setup \ No newline at end of file From 77358a173c8da0d571dc0d78fb31f4ff8f8b317a Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Mon, 20 Jan 2025 14:58:26 +0100 Subject: [PATCH 14/66] chore: .md changes --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 1a61097c8..e2988f8ed 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -1,6 +1,6 @@ # OTel LogPipeline set-up validation -This file documents the process of validating the whole LogPipeline with OTLP output flow. It starts by defining the setup, that consists of the manually deployed log agent, the already-implemented log gateway, and log generators using flog. +This file documents the process of validating the whole LogPipeline with OTLP output flow. It defines the setup, that consists of the manually deployed log agent, the already-implemented log gateway, and log generators using flog.
The scope is to performance test the agent, observing the resulting values, in terms of throughput, resource consumption, reaction to backpressure, etc. And compare it to the previous FluentBit-based setup. From 9f5de425d631864158b471942965c318cbfaa64a Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 14:58:47 +0100 Subject: [PATCH 15/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index e2988f8ed..76ab8175d 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -2,7 +2,7 @@ This file documents the process of validating the whole LogPipeline with OTLP output flow. It defines the setup, that consists of the manually deployed log agent, the already-implemented log gateway, and log generators using flog. -The scope is to performance test the agent, observing the resulting values, in terms of throughput, resource consumption, reaction to backpressure, etc. And compare it to the previous FluentBit-based setup. +The scope is to performance test the agent, observing the resulting values (such as throughput, resource consumption, reaction to backpressure), and to compare the agent to the previous FluentBit-based setup. 
From 86031b0dec3fa8609b99e24786a046c02f02355b Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:00:39 +0100 Subject: [PATCH 16/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 76ab8175d..e707835ee 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -20,7 +20,7 @@ helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm helm install -n kyma-system logging open-telemetry/opentelemetry-collector -f telemetry-manager/docs/contributor/pocs/assets/otel-log-agent-values.yaml ``` -### Manual +- To set up the log agent manually, run: ``` bash k apply -f telemetry-manager/config/samples/operator_v1alpha1_telemetry.yaml From 5ff70b0cb1541fb80752a5fe5fd984103b88bb7a Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:00:52 +0100 Subject: [PATCH 17/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index e707835ee..dc8e6ae8f 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -61,7 +61,7 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) ## 3. 
Benchmarking and Performance Tests Results Setup Configuration: -``` bash + ``` bash k create ns prometheus helm repo add prometheus-community https://prometheus-community.github.io/helm-charts helm repo update From acbbc7a1438583a60d967c7b87191124c4c8140f Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:01:01 +0100 Subject: [PATCH 18/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index dc8e6ae8f..c01954fc7 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -72,7 +72,7 @@ k apply -f telemetry-manager/hack/load-tests/log-agent-test-setup.yaml For executing the load tests, the generated logs have to be isolated, hence the following line should be replaced in the ConfigMap of the log agent: -``` yaml + ``` yaml receivers: filelog: # ... 
From 48ebcc07fff16e6ca7c81fe86f3ba55cea83b46b Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:01:10 +0100 Subject: [PATCH 19/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index c01954fc7..073d4dd96 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -393,7 +393,7 @@ In the FluentBit setup, for the very same (initial) scenario (i.e. 10 generator - Exported Log Records/second: 27.8K -## 5. Conclusions +## Conclusions - Before 15 Jan. (first session): - A lower performance can be expected, compared to the FluentBit counterpart setup. - Backpressure is currently not backpropagated from the gateway to the agent, resulting in logs being queued/lost on the gateway end, since the agent has no way of knowing when to stop, thus exports data continuously. 
(This is a known issue, that should get solved by the OTel community in the next half year) From 8e08ba7fb86840119d7d20acd0f9b201d395cc0e Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:01:27 +0100 Subject: [PATCH 20/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 073d4dd96..6b9bc85cf 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -81,7 +81,7 @@ receivers: ``` For the 🏋️‍♀️ Backpressure Scenario additionally apply: -``` bash + ``` bash k apply -f telemetry-manager/hack/load-tests/log-backpressure-config.yaml ``` From 7be853872afadf0d439f05fc249a5618b19ef639 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:02:51 +0100 Subject: [PATCH 21/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 6b9bc85cf..e6c788cba 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -39,7 +39,7 @@ k apply -f ./otlp-logs-validation.yaml See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) #### Things to take into consideration (at implementation) -- Dynamically inclusion/exclusion of namespaces, based on LogPipeline spec attributes +- Dynamically 
include/exclude of namespaces, based on LogPipeline spec attributes. - Exclude FluentBit container in OTel configuration and OTel container in FluentBit configuration - `receivers/filelog/operators`: The copy body to `attributes.original` must be avoided if `dropLogRawBody` flag is enabled From 42c5244bfa96223fa79e435aa43551509edd1809 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:27:22 +0100 Subject: [PATCH 22/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index e6c788cba..f63972f47 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -8,7 +8,7 @@ The scope is to performance test the agent, observing the resulting values (such ## 1. 
Set-up configuration steps -### With Helm +- To set up the log agent with Helm, run: ``` bash k apply -f telemetry-manager/config/samples/operator_v1alpha1_telemetry.yaml From dee9619b1f4a172a480c5580b56247b58c794546 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Mon, 20 Jan 2025 15:29:45 +0100 Subject: [PATCH 23/66] chore: Update config validation doc --- docs/contributor/benchmarks/otlp-logs-validation.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index f63972f47..2188a0cac 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -1,4 +1,4 @@ -# OTel LogPipeline set-up validation +# OTel LogPipeline Setup Validation This file documents the process of validating the whole LogPipeline with OTLP output flow. It defines the setup, that consists of the manually deployed log agent, the already-implemented log gateway, and log generators using flog. @@ -6,7 +6,7 @@ The scope is to performance test the agent, observing the resulting values (such -## 1. Set-up configuration steps +## Setup Configuration Steps - To set up the log agent with Helm, run: @@ -32,7 +32,7 @@ k apply -f ./otlp-logs-validation.yaml -## 2. Resulting Resources +## Resulting Resources ### Agent ConfigMap (OTel Config) @@ -58,7 +58,7 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) -## 3. Benchmarking and Performance Tests Results +## Benchmarking and Performance Tests Results Setup Configuration: ``` bash @@ -388,7 +388,7 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - Agent CPU: ~1.3 -## 4. Comparison with FluentBit setup +## Comparison with FluentBit Setup In the FluentBit setup, for the very same (initial) scenario (i.e. 
10 generator replicas [old set-up] / 2 agents), the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: - Exported Log Records/second: 27.8K From aae1ae74825b96619ccad17d659fcf81fa4de3c7 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:34:50 +0100 Subject: [PATCH 24/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 2188a0cac..d0f3a0183 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -34,7 +34,9 @@ k apply -f ./otlp-logs-validation.yaml ## Resulting Resources -### Agent ConfigMap (OTel Config) +- Log Agent ConfigMap (OTel Config) +- Log Agent DaemonSet + See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) From 0d3aa2e0441d2eca9666574e459f47e4d4072ca3 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:36:53 +0100 Subject: [PATCH 25/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index d0f3a0183..a065340a5 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -52,7 +52,7 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) ### How does 
checkpointing work - By enabling the storeCheckpoint preset (Helm) the `file_storage` extension is activated in the receiver -- The `file_storage` has the path `/var/lib/otelcol` +> - The `file_storage` has the path `/var/lib/otelcol`. - This is later mounted as a `hostPath` volume in the DaemonSet spec - Also set in the `storage` property of the filelog receiver From a1c9c7305c882d1e5e17bb6036e7ef01ba4fa5f4 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:52:07 +0100 Subject: [PATCH 26/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index a065340a5..b05736597 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -42,7 +42,7 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) #### Things to take into consideration (at implementation) - Dynamically include/exclude of namespaces, based on LogPipeline spec attributes. -- Exclude FluentBit container in OTel configuration and OTel container in FluentBit configuration +- Exclude FluentBit container in OTel configuration, and OTel container in FluentBit configuration. 
- `receivers/filelog/operators`: The copy body to `attributes.original` must be avoided if `dropLogRawBody` flag is enabled ### Agent DaemonSet From 4e4e14e45a9c58381cec77542c7d4a4bd45fa1fe Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:52:22 +0100 Subject: [PATCH 27/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index b05736597..18c11facc 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -43,7 +43,7 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) #### Things to take into consideration (at implementation) - Dynamically include/exclude of namespaces, based on LogPipeline spec attributes. - Exclude FluentBit container in OTel configuration, and OTel container in FluentBit configuration. -- `receivers/filelog/operators`: The copy body to `attributes.original` must be avoided if `dropLogRawBody` flag is enabled +- `receivers/filelog/operators`: The copy body to `attributes.original` must be avoided if `dropLogRawBody` flag is enabled. 
### Agent DaemonSet From e8fb72187016c1b6f341804233437f084b8728a8 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:52:53 +0100 Subject: [PATCH 28/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 18c11facc..d4fe01bd8 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -56,7 +56,7 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) - This is later mounted as a `hostPath` volume in the DaemonSet spec - Also set in the `storage` property of the filelog receiver -> `storage` = The ID of a storage extension to be used to store file offsets. File offsets allow the receiver to pick up where it left off in the case of a collector restart. If no storage extension is used, the receiver will manage offsets in memory only. +> `storage` = The ID of a storage extension to be used to store file offsets. File offsets enable the receiver to pick up where it left off in the case of a collector restart. If no storage extension is used, the receiver manages offsets only in memory. 
From fd77bfdfc6db331aa76475e3deea12cf909e8476 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:53:16 +0100 Subject: [PATCH 29/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index d4fe01bd8..d0a108bde 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -72,7 +72,7 @@ helm upgrade --install -n "prometheus" "prometheus" prometheus-community/kube-pr k apply -f telemetry-manager/hack/load-tests/log-agent-test-setup.yaml ``` -For executing the load tests, the generated logs have to be isolated, hence the following line should be replaced in the ConfigMap of the log agent: +2. To execute the load tests, the generated logs must be isolated. 
Replace the following line in the ConfigMap of the log agent: ``` yaml receivers: From 1c7082cc1947f53d17915c096507a4d27d9b10e5 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:53:36 +0100 Subject: [PATCH 30/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index d0a108bde..cb1d99235 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -82,7 +82,7 @@ receivers: - /var/log/pods/*/*/*.log # replace with "/var/log/pods/log-load-test*/*flog*/*.log" ``` -For the 🏋️‍♀️ Backpressure Scenario additionally apply: +3. If you want to run the 🏋️‍♀️ backpressure scenario, additionally apply: ``` bash k apply -f telemetry-manager/hack/load-tests/log-backpressure-config.yaml ``` From ddd2df59ce35fdbc1bf921ee0f767ba0e24a456b Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:54:12 +0100 Subject: [PATCH 31/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index cb1d99235..9b2d83ccd 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -182,7 +182,7 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - Gateway QUEUE: 328 - **Remarks:** - 
Agent does not stop when gateway refuses logs (because backpressure does not backpropagate) - - It slows down/stops in other scenarios (see bellow) => SUCCESS + - It slows down/stops in other scenarios (see below) => SUCCESS #### 🪲 19 Dec 2024, Agent exports logs to a debug endpoint (5 min) - no networking involved From 8a9ba373ea3f0ea3923f65f67df087b01e8efa3b Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:54:35 +0100 Subject: [PATCH 32/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 9b2d83ccd..b6315704d 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -201,7 +201,7 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - not increasing #### 🪲 19 Dec 2024, Agent exports logs directly to mock backend with batching processor (5 min) -- networking, but with batching mechanism in-place +- networking, but with batching mechanism in place - 10 log generators x 10 MB, batch size: 1024 - 8.3K - MEM: 68/73 From 03bef54349349debf30d7546938e6b96bb4bf5e0 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:56:34 +0100 Subject: [PATCH 33/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 
b6315704d..8f5fd9034 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -104,7 +104,7 @@ round(sum(avg_over_time(container_memory_working_set_bytes{namespace="kyma-syste -- CPU round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="kyma-system"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-log-agent"}[20m])) by (pod), 0.1) ``` - +## Performance Tests Results ### 📊 Benchmarking Session #1 | Icon | Meaning | From cbc6ae716e495e3e68cdbf0afda3c473a3f7c688 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:57:07 +0100 Subject: [PATCH 34/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 8f5fd9034..17426b12c 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -227,7 +227,7 @@ round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds - **Generator:** 10 replicas x 10 MB - **Agent:** with CPU limit (1), no queue, with batch processing (2048) - **Results:** - - lower throughput as for the 1024 scenario + - lower throughput than for the 1024 scenario #### ⏳ 19 Dec 2024, 15:55 - 16:15 (20 min) - **Agent:** with CPU limit (1), no queue, with batch processing (1024) From ab06ae7ed1a4fc20ed88b852e328bf2c4cfe7688 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 16:00:50 +0100 Subject: [PATCH 
35/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 17426b12c..d512f38a5 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -22,7 +22,7 @@ helm install -n kyma-system logging open-telemetry/opentelemetry-collector -f te - To set up the log agent manually, run: -``` bash + ``` bash k apply -f telemetry-manager/config/samples/operator_v1alpha1_telemetry.yaml // Execute knowledge-hub/scripts/create_cls_log_pipeline.sh with the corresponding environment variables From d1cd4d2cf3c5292edae7e6d1fe47789815b6014d Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 20 Jan 2025 16:01:20 +0100 Subject: [PATCH 36/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index d512f38a5..5ad18d4a5 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -10,7 +10,7 @@ The scope is to performance test the agent, observing the resulting values (such - To set up the log agent with Helm, run: -``` bash + ``` bash k apply -f telemetry-manager/config/samples/operator_v1alpha1_telemetry.yaml // Execute knowledge-hub/scripts/create_cls_log_pipeline.sh with the corresponding environment variables From 1e86e8f22560eed132a6e4d94d57215dcc89a570 Mon Sep 17 00:00:00 2001 
From: Teodor-Adrian Mihaescu Date: Mon, 20 Jan 2025 16:02:51 +0100 Subject: [PATCH 37/66] chore: Update config validation doc --- docs/contributor/benchmarks/otlp-logs-validation.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 17426b12c..066266ebc 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -397,13 +397,13 @@ In the FluentBit setup, for the very same (initial) scenario (i.e. 10 generator ## Conclusions - Before 15 Jan. (first session): - - A lower performance can be expected, compared to the FluentBit counterpart setup. - - Backpressure is currently not backpropagated from the gateway to the agent, resulting in logs being queued/lost on the gateway end, since the agent has no way of knowing when to stop, thus exports data continuously. (This is a known issue, that should get solved by the OTel community in the next half year) - - Agent slows down if the load is increased (i.e. more generators / more logs / more data). - - The network communication between the agent and the gateway or/and the gateway represent a bottleneck in this setup, since when using just a debug endpoint as an exporter, higher throughput was observed. + - Compared to the FluentBit counterpart setup, a lower performance can be expected. + - Backpressure is currently not backpropagated from the gateway to the agent, resulting in logs being queued/lost on the gateway end. That's because the agent has no way of knowing when to stop, thus exports data continuously (this is a known issue, which is expected be solved by the OTel community in the next half year). + - If the load is increased (that is, more generators, more logs, or more data), the log agent slows down. 
+ - The network communication between the agent and the gateway or/and the gateway represent a bottleneck in this setup. That's concluded because higher throughput was observed when using just a debug endpoint as an exporter. - CPU and Memory consumption are surprisingly low, and this was not improved by removing the limits (quite the opposite was observed, with the CPU throttling more often and the throughput decreasing). - - When enabling the batch processor, throughput was increasing, but this comes at the cost of losing logs in some scenarios. - - More/other methods of improving the throughput might still be worth investigating. + - If the batch processor is enabled, throughput increased. But this comes at the cost of losing logs in some scenarios. + - Further methods of improving the throughput might still be worth investigating. - After 15 jan. (second session): - Removing the gateway improves throughput - We now better understand the performance impact of each OTEL processor and of enabling/disabling compression From ac76542dd3a8caf4bd1c28fc02f4bfb34c90b4cc Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Mon, 20 Jan 2025 16:13:01 +0100 Subject: [PATCH 38/66] chore: Update config validation doc --- .../benchmarks/otlp-logs-validation.md | 84 +++++++++---------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 7780b5891..893fe7a30 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -10,25 +10,25 @@ The scope is to performance test the agent, observing the resulting values (such - To set up the log agent with Helm, run: - ``` bash -k apply -f telemetry-manager/config/samples/operator_v1alpha1_telemetry.yaml + ``` bash + k apply -f telemetry-manager/config/samples/operator_v1alpha1_telemetry.yaml -// Execute 
knowledge-hub/scripts/create_cls_log_pipeline.sh with the corresponding environment variables + // Execute knowledge-hub/scripts/create_cls_log_pipeline.sh with the corresponding environment variables -helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts + helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts -helm install -n kyma-system logging open-telemetry/opentelemetry-collector -f telemetry-manager/docs/contributor/pocs/assets/otel-log-agent-values.yaml -``` + helm install -n kyma-system logging open-telemetry/opentelemetry-collector -f telemetry-manager/docs/contributor/pocs/assets/otel-log-agent-values.yaml + ``` - To set up the log agent manually, run: - ``` bash -k apply -f telemetry-manager/config/samples/operator_v1alpha1_telemetry.yaml + ``` bash + k apply -f telemetry-manager/config/samples/operator_v1alpha1_telemetry.yaml -// Execute knowledge-hub/scripts/create_cls_log_pipeline.sh with the corresponding environment variables + // Execute knowledge-hub/scripts/create_cls_log_pipeline.sh with the corresponding environment variables -k apply -f ./otlp-logs-validation.yaml -``` + k apply -f ./otlp-logs-validation.yaml + ``` @@ -62,48 +62,48 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) ## Benchmarking and Performance Tests Results -Setup Configuration: - ``` bash -k create ns prometheus -helm repo add prometheus-community https://prometheus-community.github.io/helm-charts -helm repo update -helm upgrade --install -n "prometheus" "prometheus" prometheus-community/kube-prometheus-stack -f hack/load-tests/values.yaml --set grafana.adminPassword=myPwd +1. 
Apply the configuration (with Prometheus): + ``` bash + k create ns prometheus + helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + helm repo update + helm upgrade --install -n "prometheus" "prometheus" prometheus-community/kube-prometheus-stack -f hack/load-tests/values.yaml --set grafana.adminPassword=myPwd -k apply -f telemetry-manager/hack/load-tests/log-agent-test-setup.yaml -``` + k apply -f telemetry-manager/hack/load-tests/log-agent-test-setup.yaml + ``` 2. To execute the load tests, the generated logs must be isolated. Replace the following line in the ConfigMap of the log agent: - ``` yaml -receivers: - filelog: - # ... - include: - - /var/log/pods/*/*/*.log # replace with "/var/log/pods/log-load-test*/*flog*/*.log" -``` + ``` yaml + receivers: + filelog: + # ... + include: + - /var/log/pods/*/*/*.log # replace with "/var/log/pods/log-load-test*/*flog*/*.log" + ``` 3. If you want to run the 🏋️‍♀️ backpressure scenario, additionally apply: - ``` bash -k apply -f telemetry-manager/hack/load-tests/log-backpressure-config.yaml -``` + ``` bash + k apply -f telemetry-manager/hack/load-tests/log-backpressure-config.yaml + ``` -PromQL Queries: -``` sql --- RECEIVED -round(sum(rate(otelcol_receiver_accepted_log_records{service="telemetry-log-agent-metrics"}[20m]))) +4. 
PromQL Queries used for measuring the results: + ``` sql + -- RECEIVED + round(sum(rate(otelcol_receiver_accepted_log_records{service="telemetry-log-agent-metrics"}[20m]))) --- EXPORTED -round(sum(rate(otelcol_exporter_sent_log_records{service="telemetry-log-agent-metrics"}[20m]))) + -- EXPORTED + round(sum(rate(otelcol_exporter_sent_log_records{service="telemetry-log-agent-metrics"}[20m]))) --- QUEUE -avg(sum(otelcol_exporter_queue_size{service="telemetry-log-agent-metrics"})) + -- QUEUE + avg(sum(otelcol_exporter_queue_size{service="telemetry-log-agent-metrics"})) --- MEMORY -round(sum(avg_over_time(container_memory_working_set_bytes{namespace="kyma-system", container="collector"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-log-agent"}[20m])) by (pod) / 1024 / 1024) + -- MEMORY + round(sum(avg_over_time(container_memory_working_set_bytes{namespace="kyma-system", container="collector"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-log-agent"}[20m])) by (pod) / 1024 / 1024) --- CPU -round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="kyma-system"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-log-agent"}[20m])) by (pod), 0.1) -``` + -- CPU + round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="kyma-system"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-log-agent"}[20m])) by (pod), 0.1) + ``` ## Performance Tests Results ### 📊 Benchmarking Session #1 From 8a6d3d96b33ec60b4305fe00abbaed0541f07cad Mon Sep 17 00:00:00 2001 From: Teodor-Adrian 
Mihaescu Date: Mon, 20 Jan 2025 16:29:25 +0100 Subject: [PATCH 39/66] chore: Update config validation doc --- docs/contributor/benchmarks/otlp-logs-validation.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 893fe7a30..013cba014 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -5,7 +5,6 @@ This file documents the process of validating the whole LogPipeline with OTLP ou The scope is to performance test the agent, observing the resulting values (such as throughput, resource consumption, reaction to backpressure), and to compare the agent to the previous FluentBit-based setup. - ## Setup Configuration Steps - To set up the log agent with Helm, run: @@ -31,13 +30,11 @@ The scope is to performance test the agent, observing the resulting values (such ``` - -## Resulting Resources +## Relevant/Configurable Resources - Log Agent ConfigMap (OTel Config) - Log Agent DaemonSet - See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) #### Things to take into consideration (at implementation) @@ -59,8 +56,7 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) > `storage` = The ID of a storage extension to be used to store file offsets. File offsets enable the receiver to pick up where it left off in the case of a collector restart. If no storage extension is used, the receiver manages offsets only in memory. - -## Benchmarking and Performance Tests Results +## Benchmarking Setup 1. 
Apply the configuration (with Prometheus): ``` bash @@ -104,7 +100,10 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) -- CPU round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="kyma-system"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-log-agent"}[20m])) by (pod), 0.1) ``` + + ## Performance Tests Results + ### 📊 Benchmarking Session #1 | Icon | Meaning | From 55ab833d4704dc2616136a0e394dca559edd96e1 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Mon, 20 Jan 2025 16:43:40 +0100 Subject: [PATCH 40/66] chore: Update config validation doc --- docs/contributor/benchmarks/otlp-logs-validation.md | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 013cba014..f88fc16f4 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -42,10 +42,6 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) - Exclude FluentBit container in OTel configuration, and OTel container in FluentBit configuration. - `receivers/filelog/operators`: The copy body to `attributes.original` must be avoided if `dropLogRawBody` flag is enabled. -### Agent DaemonSet - -See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) - ### How does checkpointing work - By enabling the storeCheckpoint preset (Helm) the `file_storage` extension is activated in the receiver @@ -78,7 +74,7 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) - /var/log/pods/*/*/*.log # replace with "/var/log/pods/log-load-test*/*flog*/*.log" ``` -3. If you want to run the 🏋️‍♀️ backpressure scenario, additionally apply: +3. 
If you want to run the backpressure scenario, additionally apply: ``` bash k apply -f telemetry-manager/hack/load-tests/log-backpressure-config.yaml ``` @@ -110,7 +106,7 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) | ---- | ---------------------------------------------------- | | ⏳ | Full-test, involving the whole setup, usually 20 min | | 🪲 | Debugging session, usually shorter, not so reliable | -| 🏋️‍♀️ | Backpressure Scenario | +| 🏋️‍♀️ | Backpressure scenario | | ⭐️ | Best results observed (in a given scenario) | #### ⏳ 18 Dec 2024, 13:45 - 14:05 (20 min) @@ -400,7 +396,7 @@ In the FluentBit setup, for the very same (initial) scenario (i.e. 10 generator - Backpressure is currently not backpropagated from the gateway to the agent, resulting in logs being queued/lost on the gateway end. That's because the agent has no way of knowing when to stop, thus exports data continuously (this is a known issue, which is expected be solved by the OTel community in the next half year). - If the load is increased (that is, more generators, more logs, or more data), the log agent slows down. - The network communication between the agent and the gateway or/and the gateway represent a bottleneck in this setup. That's concluded because higher throughput was observed when using just a debug endpoint as an exporter. - - CPU and Memory consumption are surprisingly low, and this was not improved by removing the limits (quite the opposite was observed, with the CPU throttling more often and the throughput decreasing). + - CPU and memory consumption are surprisingly low, and this was not improved by removing the limits (quite the opposite was observed, with the CPU throttling more often and the throughput decreasing). - If the batch processor is enabled, throughput increased. But this comes at the cost of losing logs in some scenarios. - Further methods of improving the throughput might still be worth investigating. - After 15 jan. 
(second session): From c75de3ffd0ff6e139c4c2e525f430866655147ad Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Mon, 20 Jan 2025 17:29:45 +0100 Subject: [PATCH 41/66] chore: Update config validation doc --- docs/contributor/benchmarks/otlp-logs-validation.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index f88fc16f4..cfcfa6b30 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -44,10 +44,10 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) ### How does checkpointing work -- By enabling the storeCheckpoint preset (Helm) the `file_storage` extension is activated in the receiver -> - The `file_storage` has the path `/var/lib/otelcol`. -- This is later mounted as a `hostPath` volume in the DaemonSet spec -- Also set in the `storage` property of the filelog receiver +> By enabling the storeCheckpoint preset (Helm) the `file_storage` extension is activated in the receiver +> - The `file_storage` has the path `/var/lib/otelcol` +> - This path is later mounted as a `hostPath` volume in the DaemonSet spec +> - The extension is also set in the `storage` property of the filelog receiver > `storage` = The ID of a storage extension to be used to store file offsets. File offsets enable the receiver to pick up where it left off in the case of a collector restart. If no storage extension is used, the receiver manages offsets only in memory. 
From d81dfc83b353b133247906c2c98f11c357a5780e Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Tue, 21 Jan 2025 10:03:55 +0100 Subject: [PATCH 42/66] chore: Update config validation doc --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index cfcfa6b30..ae1905da9 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -37,7 +37,7 @@ The scope is to performance test the agent, observing the resulting values (such See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) -#### Things to take into consideration (at implementation) +**Things to take into consideration, when implementing the Log Agent into Telemetry Manager:** - Dynamically include/exclude of namespaces, based on LogPipeline spec attributes. - Exclude FluentBit container in OTel configuration, and OTel container in FluentBit configuration. - `receivers/filelog/operators`: The copy body to `attributes.original` must be avoided if `dropLogRawBody` flag is enabled. 
From 6b3bc2a25f2f630ed450a9613247787541b9f766 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Tue, 21 Jan 2025 10:14:24 +0100 Subject: [PATCH 43/66] chore: Update config validation doc --- docs/contributor/benchmarks/otlp-logs-validation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index ae1905da9..c3ea870a7 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -30,7 +30,7 @@ The scope is to performance test the agent, observing the resulting values (such ``` -## Relevant/Configurable Resources +## Resources Under Investigation - Log Agent ConfigMap (OTel Config) - Log Agent DaemonSet @@ -44,7 +44,7 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) ### How does checkpointing work -> By enabling the storeCheckpoint preset (Helm) the `file_storage` extension is activated in the receiver +> By enabling the storeCheckpoint preset (Helm), the `file_storage` extension is activated in the receiver. 
> - The `file_storage` has the path `/var/lib/otelcol` > - This path is later mounted as a `hostPath` volume in the DaemonSet spec > - The extension is also set in the `storage` property of the filelog receiver From 71319a6c823b6ca94af32eeafcb23191725eda84 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Tue, 21 Jan 2025 10:32:24 +0100 Subject: [PATCH 44/66] chore: Update config validation doc --- .../benchmarks/otlp-logs-validation.md | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index c3ea870a7..68ec55326 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -1,8 +1,12 @@ # OTel LogPipeline Setup Validation -This file documents the process of validating the whole LogPipeline with OTLP output flow. It defines the setup, that consists of the manually deployed log agent, the already-implemented log gateway, and log generators using flog. - -The scope is to performance test the agent, observing the resulting values (such as throughput, resource consumption, reaction to backpressure), and to compare the agent to the previous FluentBit-based setup. +- [Setup Configuration Steps](#setup-configuration-steps) +- [Resources Under Investigation](#resources-under-investigation) +- [Benchmarking Setup](#benchmarking-setup) +- [Performance Tests Results](#performance-tests-results) + - [📊 Benchmarking Session #1](#-benchmarking-session-1) + - [📊 Benchmarking Session #2](#-benchmarking-session-2) +- [Conclusions](#conclusions) ## Setup Configuration Steps @@ -42,14 +46,13 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) - Exclude FluentBit container in OTel configuration, and OTel container in FluentBit configuration. 
- `receivers/filelog/operators`: The copy body to `attributes.original` must be avoided if `dropLogRawBody` flag is enabled. -### How does checkpointing work - -> By enabling the storeCheckpoint preset (Helm), the `file_storage` extension is activated in the receiver. +**How does checkpointing work?** +> By enabling the storeCheckpoint preset (Helm), the `file_storage` extension is activated in the filelog receiver. > - The `file_storage` has the path `/var/lib/otelcol` > - This path is later mounted as a `hostPath` volume in the DaemonSet spec > - The extension is also set in the `storage` property of the filelog receiver -> `storage` = The ID of a storage extension to be used to store file offsets. File offsets enable the receiver to pick up where it left off in the case of a collector restart. If no storage extension is used, the receiver manages offsets only in memory. +> `storage` = The ID of a storage extension to be used to store file offsets. File offsets enable the filelog receiver to pick up where it left off in the case of a collector restart. If no storage extension is used, the receiver manages offsets only in memory. ## Benchmarking Setup @@ -79,7 +82,7 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) k apply -f telemetry-manager/hack/load-tests/log-backpressure-config.yaml ``` -4. PromQL Queries used for measuring the results: +4. 
You can use the following PromQL Queries for measuring the results (same/similar queries were used in measuring the results of the performance tests executed below): ``` sql -- RECEIVED round(sum(rate(otelcol_receiver_accepted_log_records{service="telemetry-log-agent-metrics"}[20m]))) From be17e38492af00791057ce49856ea1d023fc2417 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:32:50 +0100 Subject: [PATCH 45/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 68ec55326..130b3feef 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -9,7 +9,9 @@ - [Conclusions](#conclusions) -## Setup Configuration Steps +## Configuring the Log Agent + +To configure the log agent, deploy the [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) either with Helm or manually: - To set up the log agent with Helm, run: From 8a57bf0a2d1f0f874c2e086641285d5f088abf38 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:33:02 +0100 Subject: [PATCH 46/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 130b3feef..ea6bd7f6a 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -37,6 
+37,7 @@ To configure the log agent, deploy the [OTLP Logs Validation YAML](./otlp-logs-v ## Resources Under Investigation +We investigate the following resources (for details, see the [OTLP Logs Validation YAML](./otlp-logs-validation.yaml)): - Log Agent ConfigMap (OTel Config) - Log Agent DaemonSet From 2f21872852cdb2efc067607d375341b9d20b2ff1 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:33:13 +0100 Subject: [PATCH 47/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index ea6bd7f6a..9764be73b 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -42,7 +42,6 @@ We investigate the following resources (for details, see the [OTLP Logs Validati - Log Agent ConfigMap (OTel Config) - Log Agent DaemonSet -See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) **Things to take into consideration, when implementing the Log Agent into Telemetry Manager:** - Dynamically include/exclude of namespaces, based on LogPipeline spec attributes. 
From 1d293b4388052147f79f23e348fd641b80519b0b Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:33:43 +0100 Subject: [PATCH 48/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 9764be73b..32f1ec7c0 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -53,7 +53,6 @@ We investigate the following resources (for details, see the [OTLP Logs Validati > - The `file_storage` has the path `/var/lib/otelcol` > - This path is later mounted as a `hostPath` volume in the DaemonSet spec > - The extension is also set in the `storage` property of the filelog receiver - > `storage` = The ID of a storage extension to be used to store file offsets. File offsets enable the filelog receiver to pick up where it left off in the case of a collector restart. If no storage extension is used, the receiver manages offsets only in memory. 
From c3b1e9e0155fa5876cf72889f71b9b69d2d8a9f4 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:34:05 +0100 Subject: [PATCH 49/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 32f1ec7c0..8c4733b95 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -51,7 +51,7 @@ We investigate the following resources (for details, see the [OTLP Logs Validati **How does checkpointing work?** > By enabling the storeCheckpoint preset (Helm), the `file_storage` extension is activated in the filelog receiver. > - The `file_storage` has the path `/var/lib/otelcol` -> - This path is later mounted as a `hostPath` volume in the DaemonSet spec +> - Later, this path is mounted as a `hostPath` volume in the DaemonSet spec. > - The extension is also set in the `storage` property of the filelog receiver > `storage` = The ID of a storage extension to be used to store file offsets. File offsets enable the filelog receiver to pick up where it left off in the case of a collector restart. If no storage extension is used, the receiver manages offsets only in memory. 
From cc0098bf2175cb45840c3117a0cbdd3f65bbf9a3 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:34:13 +0100 Subject: [PATCH 50/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 8c4733b95..3b48c4f6e 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -50,7 +50,7 @@ We investigate the following resources (for details, see the [OTLP Logs Validati **How does checkpointing work?** > By enabling the storeCheckpoint preset (Helm), the `file_storage` extension is activated in the filelog receiver. -> - The `file_storage` has the path `/var/lib/otelcol` +> - The `file_storage` has the path `/var/lib/otelcol`. > - Later, this path is mounted as a `hostPath` volume in the DaemonSet spec. > - The extension is also set in the `storage` property of the filelog receiver > `storage` = The ID of a storage extension to be used to store file offsets. File offsets enable the filelog receiver to pick up where it left off in the case of a collector restart. If no storage extension is used, the receiver manages offsets only in memory. 
From 3d352b8e86c879b88a9a935973930bb38066f594 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Tue, 21 Jan 2025 10:38:37 +0100 Subject: [PATCH 51/66] chore: Update config validation doc --- .../contributor/benchmarks/otlp-logs-validation.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 68ec55326..fc8737e93 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -47,12 +47,12 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) - `receivers/filelog/operators`: The copy body to `attributes.original` must be avoided if `dropLogRawBody` flag is enabled. **How does checkpointing work?** -> By enabling the storeCheckpoint preset (Helm), the `file_storage` extension is activated in the filelog receiver. -> - The `file_storage` has the path `/var/lib/otelcol` -> - This path is later mounted as a `hostPath` volume in the DaemonSet spec -> - The extension is also set in the `storage` property of the filelog receiver +By enabling the storeCheckpoint preset (Helm), the `file_storage` extension is activated in the filelog receiver. +- The `file_storage` has the path `/var/lib/otelcol` +- This path is later mounted as a `hostPath` volume in the DaemonSet spec +- The extension is also set in the `storage` property of the filelog receiver -> `storage` = The ID of a storage extension to be used to store file offsets. File offsets enable the filelog receiver to pick up where it left off in the case of a collector restart. If no storage extension is used, the receiver manages offsets only in memory. +> **NOTE:** `storage` = The ID of a storage extension to be used to store file offsets. File offsets enable the filelog receiver to pick up where it left off in the case of a collector restart. 
If no storage extension is used, the receiver manages offsets only in memory. ## Benchmarking Setup @@ -103,8 +103,6 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) ## Performance Tests Results -### 📊 Benchmarking Session #1 - | Icon | Meaning | | ---- | ---------------------------------------------------- | | ⏳ | Full-test, involving the whole setup, usually 20 min | @@ -112,6 +110,8 @@ See [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) | 🏋️‍♀️ | Backpressure scenario | | ⭐️ | Best results observed (in a given scenario) | +### 📊 Benchmarking Session #1 + #### ⏳ 18 Dec 2024, 13:45 - 14:05 (20 min) - **Generator:** 10 replicas x 10 MB - **Agent:** no CPU limit, no queue From ec690fc7bf76994fa9b9b5ce66a875f43fee2671 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:39:58 +0100 Subject: [PATCH 52/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index c33bdc460..35b3fcafe 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -391,7 +391,7 @@ By enabling the storeCheckpoint preset (Helm), the `file_storage` extension is a ## Comparison with FluentBit Setup -In the FluentBit setup, for the very same (initial) scenario (i.e. 
10 generator replicas [old set-up] / 2 agents), the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: +In the FluentBit setup, for the very same (initial) scenario (that is, 10 generator replicas [old set-up] or 2 agents), the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: - Exported Log Records/second: 27.8K From 2d4e5177769a52baa96749ab3a9f6a42850a22b2 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:40:15 +0100 Subject: [PATCH 53/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 35b3fcafe..86ad9a434 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -394,7 +394,6 @@ By enabling the storeCheckpoint preset (Helm), the `file_storage` extension is a In the FluentBit setup, for the very same (initial) scenario (that is, 10 generator replicas [old set-up] or 2 agents), the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: - Exported Log Records/second: 27.8K - ## Conclusions - Before 15 Jan. (first session): - Compared to the FluentBit counterpart setup, a lower performance can be expected. 
From 313001d11b309b40e5b00433aca7f7f35ecb9cc4 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:40:30 +0100 Subject: [PATCH 54/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 86ad9a434..b2ed6b403 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -395,7 +395,7 @@ In the FluentBit setup, for the very same (initial) scenario (that is, 10 genera - Exported Log Records/second: 27.8K ## Conclusions -- Before 15 Jan. (first session): +### Benchmarking Session #1 (before 15 Jan) - Compared to the FluentBit counterpart setup, a lower performance can be expected. - Backpressure is currently not backpropagated from the gateway to the agent, resulting in logs being queued/lost on the gateway end. That's because the agent has no way of knowing when to stop, thus exports data continuously (this is a known issue, which is expected be solved by the OTel community in the next half year). - If the load is increased (that is, more generators, more logs, or more data), the log agent slows down. 
From ba72832128ef4d2ba368b0b0abf5186b26094d4d Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:40:40 +0100 Subject: [PATCH 55/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index b2ed6b403..1d77213c2 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -397,7 +397,7 @@ In the FluentBit setup, for the very same (initial) scenario (that is, 10 genera ## Conclusions ### Benchmarking Session #1 (before 15 Jan) - Compared to the FluentBit counterpart setup, a lower performance can be expected. - - Backpressure is currently not backpropagated from the gateway to the agent, resulting in logs being queued/lost on the gateway end. That's because the agent has no way of knowing when to stop, thus exports data continuously (this is a known issue, which is expected be solved by the OTel community in the next half year). + - Backpressure is currently not backpropagated from the gateway to the agent, resulting in logs being queued or lost on the gateway end. That's because the agent has no way of knowing when to stop, thus exports data continuously (this is a known issue, which is expected to be solved by the OTel community in the next half year). - If the load is increased (that is, more generators, more logs, or more data), the log agent slows down. - The network communication between the agent and the gateway or/and the gateway represent a bottleneck in this setup. That's concluded because higher throughput was observed when using just a debug endpoint as an exporter.
- CPU and memory consumption are surprisingly low, and this was not improved by removing the limits (quite the opposite was observed, with the CPU throttling more often and the throughput decreasing). From e4f5c4a4d0eb3c6a43a899fecb065a80057cd6a7 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:40:47 +0100 Subject: [PATCH 56/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 1d77213c2..30b168ca1 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -403,7 +403,8 @@ In the FluentBit setup, for the very same (initial) scenario (that is, 10 genera - CPU and memory consumption are surprisingly low, and this was not improved by removing the limits (quite the opposite was observed, with the CPU throttling more often and the throughput decreasing). - If the batch processor is enabled, throughput increased. But this comes at the cost of losing logs in some scenarios. - Further methods of improving the throughput might still be worth investigating. -- After 15 jan. 
(second session): + +### Benchmarking Session #2 (after 15 Jan) - Removing the gateway improves throughput - We now better understand the performance impact of each OTEL processor and of enabling/disabling compression - Generators configuration greatly influence the setup => more generators exporting less data and taking less CPU leads to higher throughput than fewer generators taking more CPU and exporting more data From 452b97c4b43ccf660057be2e65b9e3234085f5db Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:40:57 +0100 Subject: [PATCH 57/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 30b168ca1..e0bbe74ed 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -405,7 +405,7 @@ In the FluentBit setup, for the very same (initial) scenario (that is, 10 genera - Further methods of improving the throughput might still be worth investigating. ### Benchmarking Session #2 (after 15 Jan) - - Removing the gateway improves throughput + - Removing the gateway improves throughput. - We now better understand the performance impact of each OTEL processor and of enabling/disabling compression - Generators configuration greatly influence the setup => more generators exporting less data and taking less CPU leads to higher throughput than fewer generators taking more CPU and exporting more data - There is a hard limit (see debug endpoint scenario) that we still not fully understand, since strictly based on the benchmarking numbers of OTEL, we should be getting higher throughput (i.e. 
something related to the infrastructure could be influencing this). From 6ecd74e81d59cd8adbdf0597595e52a6de4ef0e6 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:41:04 +0100 Subject: [PATCH 58/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index e0bbe74ed..2bc1ad2cb 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -406,7 +406,7 @@ In the FluentBit setup, for the very same (initial) scenario (that is, 10 genera ### Benchmarking Session #2 (after 15 Jan) - Removing the gateway improves throughput. - - We now better understand the performance impact of each OTEL processor and of enabling/disabling compression + - We now better understand the performance impact of each OTel processor and of enabling or disabling compression. - Generators configuration greatly influence the setup => more generators exporting less data and taking less CPU leads to higher throughput than fewer generators taking more CPU and exporting more data - There is a hard limit (see debug endpoint scenario) that we still not fully understand, since strictly based on the benchmarking numbers of OTEL, we should be getting higher throughput (i.e. something related to the infrastructure could be influencing this). 
- We have now a more performant setup configuration, being more comparable with the numbers from the FluentBit setup \ No newline at end of file From 69b05349faf64a293208e3d662a00b2bce72390b Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:42:07 +0100 Subject: [PATCH 59/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 2bc1ad2cb..8ad33f988 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -407,6 +407,6 @@ In the FluentBit setup, for the very same (initial) scenario (that is, 10 genera ### Benchmarking Session #2 (after 15 Jan) - Removing the gateway improves throughput. - We now better understand the performance impact of each OTel processor and of enabling or disabling compression. - - Generators configuration greatly influence the setup => more generators exporting less data and taking less CPU leads to higher throughput than fewer generators taking more CPU and exporting more data + - The generators' configuration greatly influences the setup: More generators exporting less data and taking less CPU leads to higher throughput than fewer generators taking more CPU and exporting more data. - There is a hard limit (see debug endpoint scenario) that we still not fully understand, since strictly based on the benchmarking numbers of OTEL, we should be getting higher throughput (i.e. something related to the infrastructure could be influencing this). 
- We have now a more performant setup configuration, being more comparable with the numbers from the FluentBit setup \ No newline at end of file From e35fe33a3e0c62b9c43eda734ae4283155c3c948 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:42:26 +0100 Subject: [PATCH 60/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 8ad33f988..15d341eae 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -408,5 +408,5 @@ In the FluentBit setup, for the very same (initial) scenario (that is, 10 genera - Removing the gateway improves throughput. - We now better understand the performance impact of each OTel processor and of enabling or disabling compression. - The generators' configuration greatly influences the setup: More generators exporting less data and taking less CPU leads to higher throughput than fewer generators taking more CPU and exporting more data. - - There is a hard limit (see debug endpoint scenario) that we still not fully understand, since strictly based on the benchmarking numbers of OTEL, we should be getting higher throughput (i.e. something related to the infrastructure could be influencing this). + - There is a hard limit (see debug endpoint scenario) that we still don't fully understand, because strictly based on the benchmarking numbers of OTel, we should be getting higher throughput. It's possible that something related to the infrastructure could be influencing this. 
- We have now a more performant setup configuration, being more comparable with the numbers from the FluentBit setup \ No newline at end of file From f5f483558847a3465e64a620dc1e971e065db24f Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 10:44:51 +0100 Subject: [PATCH 61/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 15d341eae..cae56441a 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -409,4 +409,4 @@ In the FluentBit setup, for the very same (initial) scenario (that is, 10 genera - We now better understand the performance impact of each OTel processor and of enabling or disabling compression. - The generators' configuration greatly influences the setup: More generators exporting less data and taking less CPU leads to higher throughput than fewer generators taking more CPU and exporting more data. - There is a hard limit (see debug endpoint scenario) that we still don't fully understand, because strictly based on the benchmarking numbers of OTel, we should be getting higher throughput. It's possible that something related to the infrastructure could be influencing this. 
- - We have now a more performant setup configuration, being more comparable with the numbers from the FluentBit setup \ No newline at end of file + - We now have a more performant setup configuration, being more comparable with the numbers from the FluentBit setup \ No newline at end of file From c8ea74a38c1924746e28c55ad68277c93cd016cf Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Tue, 21 Jan 2025 10:52:35 +0100 Subject: [PATCH 62/66] chore: remove unfinished telemetrygen load test config file --- .../log-agent-setup-telemetrygen.yml | 75 ------------------- 1 file changed, 75 deletions(-) delete mode 100644 hack/load-tests/log-agent-setup-telemetrygen.yml diff --git a/hack/load-tests/log-agent-setup-telemetrygen.yml b/hack/load-tests/log-agent-setup-telemetrygen.yml deleted file mode 100644 index 5fba07b18..000000000 --- a/hack/load-tests/log-agent-setup-telemetrygen.yml +++ /dev/null @@ -1,75 +0,0 @@ -# TODO -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - app.kubernetes.io/name: log-load-generator - name: log-load-generator - namespace: log-load-test -spec: - replicas: 20 - selector: - matchLabels: - app.kubernetes.io/name: log-load-generator - template: - metadata: - labels: - app.kubernetes.io/name: log-load-generator - sidecar.istio.io/inject: "true" - spec: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app.kubernetes.io/name - operator: In - values: - - log-load-generator - topologyKey: kubernetes.io/hostname - weight: 100 - containers: - - image: TELEMETRY_GEN_IMAGE - args: - - logs - - --otlp-insecure - - --otlp-endpoint - - "telemetry-otlp-logs.kyma-system:4317" - - --otlp-attributes - - "service.name=\"log-load-generator\"" - - --workers - - "100" - - --duration - - "20m" - - --rate - - "10000000" - - --interval - - "30s" - - --telemetry-attributes - - "key1=\"SimSimulates a client generating logs. 
(Stability level: Development)\"" - - --telemetry-attributes - - "key2=\"SimSimulates a client generating logs. (Stability level: Development)\"" - - --telemetry-attributes - - "key3=\"SimSimulates a client generating logs. (Stability level: Development)\"" - - --telemetry-attributes - - "key4=\"SimSimulates a client generating logs. (Stability level: Development)\"" - - --telemetry-attributes - - "key5=\"SimSimulates a client generating logs. (Stability level: Development)\"" - - --telemetry-attributes - - "key6=\"SimSimulates a client generating logs. (Stability level: Development)\"" - - --telemetry-attributes - - "key7=\"SimSimulates a client generating logs. (Stability level: Development)\"" - - --telemetry-attributes - - "key8=\"SimSimulates a client generating logs. (Stability level: Development)\"" - - --telemetry-attributes - - "key9=\"SimSimulates a client generating logs. (Stability level: Development)\"" - - --telemetry-attributes - - "key10=\"SimSimulates a client generating logs. 
(Stability level: Development)\"" - imagePullPolicy: IfNotPresent - name: telemetrygen - resources: - limits: - memory: 256Mi - requests: - memory: 256Mi \ No newline at end of file From 0774e38d7426ba0bd881b1d82f9a757946d98ab6 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 12:39:26 +0100 Subject: [PATCH 63/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index cae56441a..73b5aa0c6 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -391,7 +391,7 @@ By enabling the storeCheckpoint preset (Helm), the `file_storage` extension is a ## Comparison with FluentBit Setup -In the FluentBit setup, for the very same (initial) scenario (that is, 10 generator replicas [old set-up] or 2 agents), the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: +In the FluentBit setup, for the very same (initial) scenario (that is, 10 generator replicas [old setup] or 2 agents), the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: - Exported Log Records/second: 27.8K ## Conclusions From 3303b78318fc9973963de90e24f7479f9c94f4e6 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 12:40:46 +0100 Subject: [PATCH 64/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 73b5aa0c6..bda53b990 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -400,7 +400,7 @@ In the FluentBit setup, for the very same (initial) scenario (that is, 10 genera - Backpressure is currently not backpropagated from the gateway to the agent, resulting in logs being queued or lost on the gateway end. That's because the agent has no way of knowing when to stop, thus exports data continuously (this is a known issue, which is expected to be solved by the OTel community in the next half year). - If the load is increased (that is, more generators, more logs, or more data), the log agent slows down. - The network communication between the agent and the gateway and/or the gateway represent a bottleneck in this setup. That's concluded because higher throughput was observed when using just a debug endpoint as an exporter. - - CPU and memory consumption are surprisingly low, and this was not improved by removing the limits (quite the opposite was observed, with the CPU throttling more often and the throughput decreasing). + - CPU and memory consumption are surprisingly low, and the efficiency was not improved by removing the limits (quite the opposite was observed, with the CPU throttling more often and the throughput decreasing). - If the batch processor is enabled, throughput increases. But this comes at the cost of losing logs in some scenarios. - Further methods of improving the throughput might still be worth investigating.
From 22186df2df552f8d3884fdcdc495396732ef2472 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu Date: Tue, 21 Jan 2025 12:57:22 +0100 Subject: [PATCH 65/66] chore: Update config validation doc --- docs/contributor/benchmarks/otlp-logs-validation.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index bda53b990..65dd64879 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -1,14 +1,13 @@ # OTel LogPipeline Setup Validation -- [Setup Configuration Steps](#setup-configuration-steps) +- [Setup Configuration Steps](#configuring-the-log-agent) - [Resources Under Investigation](#resources-under-investigation) - [Benchmarking Setup](#benchmarking-setup) - [Performance Tests Results](#performance-tests-results) - - [📊 Benchmarking Session #1](#-benchmarking-session-1) - - [📊 Benchmarking Session #2](#-benchmarking-session-2) - [Conclusions](#conclusions) + ## Configuring the Log Agent To configure the log agent, deploy the [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) either with Helm or manually: From 5b254c73fae25faf1a31fcae7a05bb29821c119e Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Tue, 21 Jan 2025 13:30:18 +0100 Subject: [PATCH 66/66] Update docs/contributor/benchmarks/otlp-logs-validation.md Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/otlp-logs-validation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md index 65dd64879..297badd5f 100644 --- a/docs/contributor/benchmarks/otlp-logs-validation.md +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -1,6 +1,6 @@ # OTel LogPipeline Setup Validation -- 
[Setup Configuration Steps](#configuring-the-log-agent) +- [Configuring the Log Agent](#configuring-the-log-agent) - [Resources Under Investigation](#resources-under-investigation) - [Benchmarking Setup](#benchmarking-setup) - [Performance Tests Results](#performance-tests-results)