From 94bb656bc7d981f02dabe79cfc20c21908cc4603 Mon Sep 17 00:00:00 2001 From: Andreas Thaler Date: Mon, 27 Jan 2025 14:38:43 +0100 Subject: [PATCH 1/6] feat: Update to OTel-Collector 0.118.0 (#1787) --- .env | 2 +- docs/contributor/benchmarks/README.md | 6 ++++-- internal/images/images.go | 2 +- internal/otelcollector/config/metric/gateway/config.go | 1 - internal/otelcollector/config/metric/gateway/connectors.go | 1 - .../otelcollector/config/metric/gateway/connectors_test.go | 1 - .../config/metric/gateway/testdata/config.yaml | 1 - .../metric/gateway/testdata/config_otlp_disabled.yaml | 1 - sec-scanners-config.yaml | 2 +- 9 files changed, 7 insertions(+), 10 deletions(-) diff --git a/.env b/.env index cd817b514..7f47831d9 100644 --- a/.env +++ b/.env @@ -17,6 +17,6 @@ ENV_GORELEASER_VERSION=v1.23.0 ## Default Docker Images DEFAULT_FLUENTBIT_EXPORTER_IMAGE="europe-docker.pkg.dev/kyma-project/prod/directory-size-exporter:v20241212-e4adf27f" DEFAULT_FLUENTBIT_IMAGE="europe-docker.pkg.dev/kyma-project/prod/external/fluent/fluent-bit:3.2.4" -DEFAULT_OTEL_COLLECTOR_IMAGE="europe-docker.pkg.dev/kyma-project/prod/kyma-otel-collector:0.116.0-main" +DEFAULT_OTEL_COLLECTOR_IMAGE="europe-docker.pkg.dev/kyma-project/prod/kyma-otel-collector:0.118.0-main" DEFAULT_SELFMONITOR_IMAGE="europe-docker.pkg.dev/kyma-project/prod/tpi/telemetry-self-monitor:3.1.0-98bf175" DEFAULT_TEST_TELEMETRYGEN_IMAGE="ghcr.io/open-telemetry/opentelemetry-collector-contrib/telemetrygen:v0.116.0" \ No newline at end of file diff --git a/docs/contributor/benchmarks/README.md b/docs/contributor/benchmarks/README.md index 65dcbed02..3bba573d5 100644 --- a/docs/contributor/benchmarks/README.md +++ b/docs/contributor/benchmarks/README.md @@ -122,6 +122,7 @@ A typical test result output looks like the following example: | 0.114.0 | 19610 | 19453 | 0 | 127, 125 | 1, 1 | 11256 | 33308 | 0 | 175, 248 | 1.4, 1.4 | 10608 | 321 | 511 | 1737, 1735 | 0.5, 0.5 | 18442 | 956 | 510 | 1798, 1737 | 0.9, 0.9 | | 0.115.0 | 18865 | 18718 | 0 | 191, 253 | 1, 1 | 11615 | 34386 | 0 | 275, 167 | 1.4, 1.5 | 11141 | 277 | 511 | 1747, 1731 | 0.5, 0.5 | 18258 | 880 | 510 | 1741, 1760 | 0.9, 0.9 | | 0.116.0 | 19693 | 19540 | 0 | 165, 126 | 1.1, 1 | 11388 | 33717 | 0 | 196, 137 | 1.5, 1.4 | 11215 | 324 | 510 | 1658, 1738 | 0.5, 0.5 | 17974 | 886 | 509 | 1671, 1683 | 0.9, 0.9 | +| 0.118.0 | 19299 | 19148 | 0 | 88,97, | 1.1,1, | 11369 | 33659 | 0 | 137,159, | 1.4,1.5, | 10066 | 296 | 512 | 1551,1652, | 0.4,0.4, | 18852 | 945 | 510 | 1701,1688, | 0.9,0.9, | @@ -244,7 +245,8 @@ are printed out. 
| 0.110.0 | 4223 | 4222 | 0 | 130, 137 | 1.5, 1.5 | 3139 | 9417 | 1 | 197, 215 | 1.7, 1.7 | 830 | 640 | 287 | 841, 835 | 0.5, 0.5 | 2048 | 1907 | 510 | 1741, 1694 | 1.4, 1.4 | | 0.114.0 | 4384 | 4385 | 0 | 131, 141 | 1.5, 1.5 | 3209 | 9624 | 0 | 189, 198 | 1.7, 1.8 | 757 | 635 | 393 | 807, 824 | 0.5, 0.4 | 2512 | 1691 | 510 | 1788, 1789 | 1.6, 1.6 | | 0.115.0 | 4256 | 4255 | 0 | 144, 175 | 1.5, 1.5 | 3346 | 10040 | 0 | 244, 202 | 1.7, 1.8 | 726 | 627 | 361 | 821, 834 | 0.5, 0.5 | 2510 | 1926 | 505 | 1778, 1730 | 1.7, 1.6 | -| 0.116.0 | 4374 | 4374 | 0 | 100, 109 | 1.5, 1.5 | 3500 | 10500 | 0 | 171, 171 | 1.8, 2 | 710 | 641 | 383 | 857, 870 | 0.5, 0.5 | 3183 | 1780 | 509 | 1760, 1848 | 2, 2.1 | +| 0.116.0 | 4374 | 4374 | 0 | 100, 109 | 1.5, 1.5 | 3500 | 10500 | 0 | 171, 171 | 1.8, 2 | 710 | 641 | 383 | 857, 870 | 0.5, 0.5 | 3183 | 1780 | 509 | 1760, 1848 | 2, 2.1 | +| 0.118.0 | 4357 | 4357 | 0 | 120,115, | 1.5,1.5, | 3520 | 10566 | 0 | 151,179, | 2,1.8, | 813 | 522 | 443 | 880,1752, | 0.6,0.6, | 3264 | 1925 | 510 | 1837,1855, | 2,2.1, | @@ -294,7 +296,7 @@ On average, memory usage for MetricPipeline instances is ~150MB for a single Pod | 0.114.0 | 19904 | 19904 | 0 | 683, 707 | 0.2, 0.2 | 19942 | 19958 | 0 | 701, 743 | 0.2, 0.2 | | 0.115.0 | 20073 | 20073 | 0 | 697, 697 | 0.2, 0.2 | 19924 | 19954 | 0 | 700, 773 | 0.2, 0.3 | | 0.116.0 | 20058 | 20057 | 0 | 690, 682 | 0.3, 0.3 | 19998 | 19999 | 0 | 713, 692 | 0.2, 0.3 | - +| 0.118.0 | 19859 | 19859 | 0 | 694,672, | 0.2,0.2, | 20057 | 20057 | 0 | 661,664, | 0.2,0.2, | The expected throughput for the MetricPipeline agent receiver is ~20K metrics/sec. Expected memory usage is on average ~700MB, and CPU usage is ~0.2 for each instance. diff --git a/internal/images/images.go b/internal/images/images.go index e91f95c3f..6028515f9 100644 --- a/internal/images/images.go +++ b/internal/images/images.go @@ -6,6 +6,6 @@ package images const ( DefaultFluentBitExporterImage = "europe-docker.pkg.dev/kyma-project/prod/directory-size-exporter:v20241212-e4adf27f" DefaultFluentBitImage = "europe-docker.pkg.dev/kyma-project/prod/external/fluent/fluent-bit:3.2.4" - DefaultOTelCollectorImage = "europe-docker.pkg.dev/kyma-project/prod/kyma-otel-collector:0.116.0-main" + DefaultOTelCollectorImage = "europe-docker.pkg.dev/kyma-project/prod/kyma-otel-collector:0.118.0-main" DefaultSelfMonitorImage = "europe-docker.pkg.dev/kyma-project/prod/tpi/telemetry-self-monitor:3.1.0-98bf175" ) diff --git a/internal/otelcollector/config/metric/gateway/config.go b/internal/otelcollector/config/metric/gateway/config.go index d6d25c56e..40040b910 100644 --- a/internal/otelcollector/config/metric/gateway/config.go +++ b/internal/otelcollector/config/metric/gateway/config.go @@ -97,7 +97,6 @@ type Connectors map[string]any type RoutingConnector struct { DefaultPipelines []string `yaml:"default_pipelines"` ErrorMode string `yaml:"error_mode"` - MatchOnce bool `yaml:"match_once"` Table []RoutingConnectorTableEntry `yaml:"table"` } diff --git a/internal/otelcollector/config/metric/gateway/connectors.go b/internal/otelcollector/config/metric/gateway/connectors.go index 87f94b41d..d25c02361 100644 --- a/internal/otelcollector/config/metric/gateway/connectors.go +++ b/internal/otelcollector/config/metric/gateway/connectors.go @@ -13,7 +13,6 @@ func makeRoutingConnectorConfig(pipelineName string) RoutingConnector { return RoutingConnector{ DefaultPipelines: []string{attributesEnrichmentPipelineID}, ErrorMode: "ignore", - MatchOnce: true, Table: []RoutingConnectorTableEntry{ { Statement: 
fmt.Sprintf("route() where attributes[\"%s\"] == \"true\"", metric.SkipEnrichmentAttribute), diff --git a/internal/otelcollector/config/metric/gateway/connectors_test.go b/internal/otelcollector/config/metric/gateway/connectors_test.go index d1e4c8d98..6b65e813b 100644 --- a/internal/otelcollector/config/metric/gateway/connectors_test.go +++ b/internal/otelcollector/config/metric/gateway/connectors_test.go @@ -45,7 +45,6 @@ func TestConnectors(t *testing.T) { expectedRoutingConnector := RoutingConnector{ DefaultPipelines: []string{"metrics/test-attributes-enrichment"}, ErrorMode: "ignore", - MatchOnce: true, Table: []RoutingConnectorTableEntry{ { Statement: "route() where attributes[\"io.kyma-project.telemetry.skip_enrichment\"] == \"true\"", diff --git a/internal/otelcollector/config/metric/gateway/testdata/config.yaml b/internal/otelcollector/config/metric/gateway/testdata/config.yaml index 3033c643c..8deeab183 100644 --- a/internal/otelcollector/config/metric/gateway/testdata/config.yaml +++ b/internal/otelcollector/config/metric/gateway/testdata/config.yaml @@ -179,7 +179,6 @@ connectors: default_pipelines: - metrics/test-attributes-enrichment error_mode: ignore - match_once: true table: - statement: route() where attributes["io.kyma-project.telemetry.skip_enrichment"] == "true" pipelines: diff --git a/internal/otelcollector/config/metric/gateway/testdata/config_otlp_disabled.yaml b/internal/otelcollector/config/metric/gateway/testdata/config_otlp_disabled.yaml index de3d7dce5..afbbdcbdd 100644 --- a/internal/otelcollector/config/metric/gateway/testdata/config_otlp_disabled.yaml +++ b/internal/otelcollector/config/metric/gateway/testdata/config_otlp_disabled.yaml @@ -184,7 +184,6 @@ connectors: default_pipelines: - metrics/test-attributes-enrichment error_mode: ignore - match_once: true table: - statement: route() where attributes["io.kyma-project.telemetry.skip_enrichment"] == "true" pipelines: diff --git a/sec-scanners-config.yaml b/sec-scanners-config.yaml index 79c3022c2..48f7eca35 100644 --- a/sec-scanners-config.yaml +++ b/sec-scanners-config.yaml @@ -3,7 +3,7 @@ protecode: - europe-docker.pkg.dev/kyma-project/prod/telemetry-manager:main - europe-docker.pkg.dev/kyma-project/prod/directory-size-exporter:v20241212-e4adf27f - europe-docker.pkg.dev/kyma-project/prod/external/fluent/fluent-bit:3.2.4 -- europe-docker.pkg.dev/kyma-project/prod/kyma-otel-collector:0.116.0-main +- europe-docker.pkg.dev/kyma-project/prod/kyma-otel-collector:0.118.0-main - europe-docker.pkg.dev/kyma-project/prod/tpi/telemetry-self-monitor:3.1.0-98bf175 whitesource: language: golang-mod From 776ecdb4f5d1f4742980f85b7b97ecea42e33153 Mon Sep 17 00:00:00 2001 From: Teodor-Adrian Mihaescu <103431261+TeodorSAP@users.noreply.github.com> Date: Mon, 27 Jan 2025 19:11:18 +0100 Subject: [PATCH 2/6] docs: Validate the LogPipeline OTel Setup and test the performance of the log agent (#1705) Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com> --- docs/contributor/benchmarks/load-test-logs.md | 6 +- .../benchmarks/otlp-logs-validation.md | 411 ++++++++++++++++++ .../benchmarks/otlp-logs-validation.yaml | 244 +++++++++++ .../pocs/assets/otel-log-agent-values.yaml | 7 +- .../pocs/assets/otel-log-gateway-values.yaml | 1 + .../pocs/assets/otel-logs-values.yaml | 3 + .../log-agent-test-setup-generator.yml | 41 ++ hack/load-tests/log-agent-test-setup.yaml | 108 +++++ hack/load-tests/log-backpressure-config.yaml | 21 + hack/load-tests/run-load-test.sh | 4 +- 10 files changed, 838 insertions(+), 8 
deletions(-)
 create mode 100644 docs/contributor/benchmarks/otlp-logs-validation.md
 create mode 100644 docs/contributor/benchmarks/otlp-logs-validation.yaml
 create mode 100644 hack/load-tests/log-agent-test-setup-generator.yml
 create mode 100644 hack/load-tests/log-agent-test-setup.yaml
 create mode 100644 hack/load-tests/log-backpressure-config.yaml

diff --git a/docs/contributor/benchmarks/load-test-logs.md b/docs/contributor/benchmarks/load-test-logs.md
index 2b51fd2ba..bd0e3bfb7 100644
--- a/docs/contributor/benchmarks/load-test-logs.md
+++ b/docs/contributor/benchmarks/load-test-logs.md
@@ -34,9 +34,9 @@ The tests are executed for 20 minutes, so that each test case has a stabilized o
 | config | logs received l/s | logs exported l/s | logs queued | cpu | memory MB | no. restarts of gateway | no. restarts of generator |
-| --- | --- | --- | --- | --- | --- | ---|
-| single | 7193 | 7195 | 16824 | 2.5 | 826 | 0 | 1 |
-| batch | 16428 | 16427 | 0 | 3 | 265 | 0 | 1 |
+| ------ | ----------------- | ----------------- | ----------- | --- | --------- | ----------------------- | ------------------------- |
+| single | 7193 | 7195 | 16824 | 2.5 | 826 | 0 | 1 |
+| batch | 16428 | 16427 | 0 | 3 | 265 | 0 | 1 |
## Interpretation diff --git a/docs/contributor/benchmarks/otlp-logs-validation.md b/docs/contributor/benchmarks/otlp-logs-validation.md new file mode 100644 index 000000000..297badd5f --- /dev/null +++ b/docs/contributor/benchmarks/otlp-logs-validation.md @@ -0,0 +1,411 @@ +# OTel LogPipeline Setup Validation + +- [Configuring the Log Agent](#configuring-the-log-agent) +- [Resources Under Investigation](#resources-under-investigation) +- [Benchmarking Setup](#benchmarking-setup) +- [Performance Tests Results](#performance-tests-results) +- [Conclusions](#conclusions) + + + +## Configuring the Log Agent + +To configure the log agent, deploy the [OTLP Logs Validation YAML](./otlp-logs-validation.yaml) either with Helm or manually: + +- To set up the log agent with Helm, run: + + ``` bash + k apply -f telemetry-manager/config/samples/operator_v1alpha1_telemetry.yaml + + // Execute knowledge-hub/scripts/create_cls_log_pipeline.sh with the corresponding environment variables + + helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts + + helm install -n kyma-system logging open-telemetry/opentelemetry-collector -f telemetry-manager/docs/contributor/pocs/assets/otel-log-agent-values.yaml + ``` + +- To set up the log agent manually, run: + + ``` bash + k apply -f telemetry-manager/config/samples/operator_v1alpha1_telemetry.yaml + + // Execute knowledge-hub/scripts/create_cls_log_pipeline.sh with the corresponding environment variables + + k apply -f ./otlp-logs-validation.yaml + ``` + + +## Resources Under Investigation +We investigate the following resources (for details, see the [OTLP Logs Validation YAML](./otlp-logs-validation.yaml)): + +- Log Agent ConfigMap (OTel Config) +- Log Agent DaemonSet + + +**Things to take into consideration, when implementing the Log Agent into Telemetry Manager:** +- Dynamically include/exclude of namespaces, based on LogPipeline spec attributes. +- Exclude FluentBit container in OTel configuration, and OTel container in FluentBit configuration. +- `receivers/filelog/operators`: The copy body to `attributes.original` must be avoided if `dropLogRawBody` flag is enabled. + +**How does checkpointing work?** +By enabling the storeCheckpoint preset (Helm), the `file_storage` extension is activated in the filelog receiver. +- The `file_storage` has the path `/var/lib/otelcol`. +- Later, this path is mounted as a `hostPath` volume in the DaemonSet spec. +- The extension is also set in the `storage` property of the filelog receiver. + +> **NOTE:** `storage` = The ID of a storage extension to be used to store file offsets. File offsets enable the filelog receiver to pick up where it left off in the case of a collector restart. If no storage extension is used, the receiver manages offsets only in memory. + + +## Benchmarking Setup + +1. Apply the configuration (with Prometheus): + ``` bash + k create ns prometheus + helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + helm repo update + helm upgrade --install -n "prometheus" "prometheus" prometheus-community/kube-prometheus-stack -f hack/load-tests/values.yaml --set grafana.adminPassword=myPwd + + k apply -f telemetry-manager/hack/load-tests/log-agent-test-setup.yaml + ``` + +2. To execute the load tests, the generated logs must be isolated. Replace the following line in the ConfigMap of the log agent: + + ``` yaml + receivers: + filelog: + # ... + include: + - /var/log/pods/*/*/*.log # replace with "/var/log/pods/log-load-test*/*flog*/*.log" + ``` + +3. 
If you want to run the backpressure scenario, additionally apply: + ``` bash + k apply -f telemetry-manager/hack/load-tests/log-backpressure-config.yaml + ``` + +4. You can use the following PromQL Queries for measuring the results (same/similar queries were used in measuring the results of the performance tests executed below): + ``` sql + -- RECEIVED + round(sum(rate(otelcol_receiver_accepted_log_records{service="telemetry-log-agent-metrics"}[20m]))) + + -- EXPORTED + round(sum(rate(otelcol_exporter_sent_log_records{service="telemetry-log-agent-metrics"}[20m]))) + + -- QUEUE + avg(sum(otelcol_exporter_queue_size{service="telemetry-log-agent-metrics"})) + + -- MEMORY + round(sum(avg_over_time(container_memory_working_set_bytes{namespace="kyma-system", container="collector"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-log-agent"}[20m])) by (pod) / 1024 / 1024) + + -- CPU + round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="kyma-system"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-log-agent"}[20m])) by (pod), 0.1) + ``` + + +## Performance Tests Results + +| Icon | Meaning | +| ---- | ---------------------------------------------------- | +| ⏳ | Full-test, involving the whole setup, usually 20 min | +| 🪲 | Debugging session, usually shorter, not so reliable | +| 🏋️‍♀️ | Backpressure scenario | +| ⭐️ | Best results observed (in a given scenario) | + +### 📊 Benchmarking Session #1 + +#### ⏳ 18 Dec 2024, 13:45 - 14:05 (20 min) +- **Generator:** 10 replicas x 10 MB +- **Agent:** no CPU limit, no queue +- **Results:** + - Agent RECEIVED/EXPORTED: 6.06K + - Agent Memory: + - Pod1: 70 + - Pod2: 70 + - Agent CPU: + - Pod1: 0.5 + - Pod2: 0.4 + - Gateway RECEIVED/EXPORTED: 6.09K + - Gateway QUEUE: 0 + +#### ⏳ 18 Dec 2024, 14:08 - 14:28 (20 min) +- **Generator:** 20 replicas x 10 MB +- **Agent:** no CPU limit, no queue +- **Results:** + - Agent RECEIVED/EXPORTED: 4.93K + - Agent Memory: + - Pod1: 71 + - Pod2: 72 + - Agent CPU: + - Pod1: 0.5 + - Pod2: 0.4 + - Gateway RECEIVED/EXPORTED: 4.93K + - Gateway QUEUE: 0 (max. 
6 at some point) + +#### ⏳ 18 Dec 2024, 14:50 - 15:10 (20 min) +- **Generator:** 10 replicas x 20 MB +- **Agent:** no CPU limit, no queue +- **Results:** + - Agent RECEIVED/EXPORTED: 5.94K + - Agent Memory: + - Pod1: 76 + - Pod2: 81 + - Agent CPU: + - Pod1: 0.5 + - Pod2: 0.5 + - Gateway RECEIVED/EXPORTED: 5.94K + - Gateway QUEUE: 0 + +#### ⏳ 18 Dec 2024, 15:24 - 15:34 (10 min) +- **Generator:** 10 replicas x 10 MB +- **Agent:** with CPU limit (1), no queue +- **Results:** + - Agent RECEIVED/EXPORTED: 8.9K + - Agent Memory: 64/62 + - Agent CPU: 0.5/0.5 + - Gateway RECEIVED/EXPORTED: 8.9K + - Gateway QUEUE: 0 + +#### 🏋️‍♀️⭐️ 18 Dec 2024, 15:36 - 15:56 (20 min) (backpressure scenario) +- **Generator:** 10 replicas x 10 MB +- **Agent:** with CPU limit (1), no queue +- **Results:** + - Agent RECEIVED/EXPORTED: 6.8K + - Agent Memory: + - Pod1: 66 + - Pod2: 67 + - Agent CPU: + - Pod1: 0.6 + - Pod2: 0.5 + - Gateway RECEIVED: 6.8K + - Gateway EXPORTED: 256 + - Gateway QUEUE: 328 +- **Remarks:** + - Agent does not stop when gateway refuses logs (because backpressure does not backpropagate) + - It slows down/stops in other scenarios (see below) => SUCCESS + +#### 🪲 19 Dec 2024, Agent exports logs to a debug endpoint (5 min) +- no networking involved +- 12/14 log generators x 10 MB + - 19.5K => ~20K + - MEM: 43/47 + - CPU: 0.7/0.8 + +#### 🪲 19 Dec 2024, Agent exports logs directly to mock backend (5 min) +- networking, but avoiding gateway +- 10 log generators x 10 MB + - 5.3K + - MEM: 58/59 + - CPU: 0.4/0.5 +- 12 log generators x 10 MB + - not increasing + +#### 🪲 19 Dec 2024, Agent exports logs directly to mock backend with batching processor (5 min) +- networking, but with batching mechanism in place +- 10 log generators x 10 MB, batch size: 1024 + - 8.3K + - MEM: 68/73 + - CPU: 0.5/0.6 +- 12 log generators x 10 MB, batch size: 1024 + - starts decreasing (~7.5K) +- 10 log generators x 10 MB, batch size: 2048 + - ~9K + - MEM: 74/79 + - CPU: 0.6/0.7 + +#### ⏳ 19 Dec 2024, 13:46 - 14:06 (20 min) +- **Generator:** 10 replicas x 10 MB +- **Agent:** with CPU limit (1), no queue, with batch processing (1024) +- **Results:** + - Agent RECEIVED/EXPORTED: 8.46K + - Gateway RECEIVED/EXPORTED: 8.46K + - Agent Memory: 69/76 + - Agent CPU: 0.5/0.7 + - Gateway QUEUE: 0 (max 191) + +#### ⏳ 19 Dec 2024, ??:?? - ??:?? 
(20 min) +- **Generator:** 10 replicas x 10 MB +- **Agent:** with CPU limit (1), no queue, with batch processing (2048) +- **Results:** + - lower throughput than for the 1024 scenario + +#### ⏳ 19 Dec 2024, 15:55 - 16:15 (20 min) +- **Agent:** with CPU limit (1), no queue, with batch processing (1024) +- **Mock Backend:** memory limit x2 (2048Mi) +- **Generator:** 10 replicas x 10 MB + - **Results:** + - Agent RECEIVED/EXPORTED: 8.18K + - Gateway RECEIVED/EXPORTED: 8.18K + - Agent Memory: 70/71 + - Agent CPU: 0.6/0.6 + - Gateway QUEUE: 0 +- **Generator:** 12 replicas x 10 MB (16:18 - 16:35) + - **Results:** + - Agent RECEIVED/EXPORTED: 8.6k + - Gateway RECEIVED/EXPORTED: 8.6k + - Agent Memory: 73/74 + - Agent CPU: 0.7/0.6 + - Gateway QUEUE: 0 +- **Generator:** 14 replicas x 10 MB (16:35 - 16:40) + - **Results:** + - Agent RECEIVED/EXPORTED: 7.54K + - Gateway RECEIVED/EXPORTED: 7.54K + - lower + +#### ⏳ 19 Dec 2024, 16:50 - 17:10 (20 min) +- **Generator:** 12 replicas x 10 MB +- **Agent:** with CPU limit (1), no queue, with batch processing (2048) +- **Mock Backend:** memory limit x2 (2048Mi) +- **Results:** + - Agent RECEIVED/EXPORTED: 8.1K + - Gateway RECEIVED/EXPORTED: 8.11K + - Agent Memory: 74/81 + - Agent CPU: 0.6/0.5 + - Gateway QUEUE: 0 (max 2) + +#### 🪲 20 Dec 2024, Multiple agents loading the gateway (5 min) +- **Setup:** 10 nodes, 10 agents, 1 generator / node (DaemonSet) +- **Results (WITH BATCHING):** + - Agent RECEIVED/EXPORTED: 61.5K => 6.1K / agent instance + - Gateway RECEIVED/EXPORTED: 61.5K/29.5K => 30K/14.7K / gateway instance + - Agent Memory: 61-68/agent + - Agent CPU: 0.4-0.8/agent + - Gateway QUEUE: 510 (max 512, full) + - ~10% exporter failed enqueue logs + - 0% receiver refused logs + - 0% exporter send failed logs +- **Results (WITHOUT BATCHING):** + - Agent RECEIVED/EXPORTED: 31.4K => 3.1K / agent instance + - Gateway RECEIVED/EXPORTED: 31.4K => 11.4K / gateway instance + - Agent Memory: 61-68/agent + - Agent CPU: 0.4-0.5/agent + - Gateway QUEUE: 0 (max 6) + - 0% exporter failed enqueue logs + - 0% receiver refused logs + - 0% exporter send failed logs + +### 📊 Benchmarking Session #2 + +#### ⏳ 15 Jan 2025, 12:31 - 12:51 (20 min) +- **Generator:** 10 replicas x 10 MB +- **Results:** + - Agent RECEIVED/EXPORTED: 14.4K + - Gateway RECEIVED/EXPORTED: 14.4K + - Agent Memory: 74/69 + - Agent CPU: 0.9/0.8 + - Gateway QUEUE: 0 + +#### ⏳⭐️ 15 Jan 2025, 14:31 - 14:08 (20 min) +- Gateways on separate nodes +- **Generator:** 10 replicas x 10 MB +- **Results:** + - Agent RECEIVED/EXPORTED: 15.7K + - Gateway RECEIVED/EXPORTED: 15.7K + - Agent Memory: 82/71 + - Agent CPU: 1/0.9 + - Gateway CPU: 0.6/0.6 + - Gateway Memory: 62/68 + - Gateway QUEUE: 0 + +#### 🪲 15 Jan 2025, Agent exports logs to a debug endpoint (5 min) +- no networking involved +- ~15K / agent => ~30K + +#### Removing compression for the OTLP exporters (on both agent and gateway) +- boosts throughput in the 4 nodes scenario +- the change seemed to have no impact in the 2 nodes scenario + +#### ⏳ 15 Jan 2025, ? - ? 
(20 min) +- Gateways on separate nodes +- Compression disabled for OTLP exporters (on both agent and gateway) (default: gzip) +- **Generator:** 20 replicas (new set-up) +- **Results:** + - Agent RECEIVED/EXPORTED: 15.3K + - Gateway RECEIVED/EXPORTED: 15.3K + +#### ⏳⭐️ 16 Jan 2025, ~13:17 (20 min) +- Gateways on separate nodes +- No Istio +- **Generator:** 10 replicas +- **Results:** + - Agent RECEIVED/EXPORTED: 18.8K + - Gateway RECEIVED/EXPORTED: 18.8K + - Agent Memory: 76/73 + - Agent CPU: 0.8/0.9 + - Gateway Memory: 69/27 + - Gateway CPU: 0.6/0.6 + - Gateway QUEUE: 1/0 + +#### ⏳⭐️ 16 Jan 2025, ~13:56 (20 min) +- No gateway involved, agent sending directly to mock backend +- With Istio +- **Generator:** 10 replicas +- **Results:** + - Agent RECEIVED/EXPORTED: 19K + - Agent Memory: 82/74 + - Agent CPU: 1.3/0.8 + +#### 🪲 17 Jan 2025, ~10:36 +- 1 node +- No gateway involved, agent sending directly to mock backend +- With Istio +- Agent has everything removed (no processors) +- **Generator:** 5 replicas +- **Results (without batching):** + - Agent RECEIVED/EXPORTED: 11.8K / instance +- **Results (with batching):** + - Agent RECEIVED/EXPORTED: 14K / instance + +#### 🪲 17 Jan 2025, ~11:48 +- 1 node +- No gateway involved, agent sending directly to mock backend +- With Istio +- Agent has everything removed (no processors), then we incrementally add them +- **Generator:** 30 replicas (10m CPU limit) +- 📥 Debug Exporter: + - **Results (without batching):** + - Agent RECEIVED/EXPORTED: 16K / instance + - **Results (with batching):** + - Agent RECEIVED/EXPORTED: 22.4K / instance + - **Results (batching + filestorage):** + - Agent RECEIVED/EXPORTED: 20K / instance +- 📥 OTEL Exporter: + - **Results (batching + filestorage):** + - Agent RECEIVED/EXPORTED: 15K / instance + - **Results (batching + filestorage + sending queue):** + - Agent RECEIVED/EXPORTED: 15K / instance + +#### 🪲 17 Jan 2025, ~13:16 +- No gateway involved, agent sending directly to mock backend +- With Istio +- **2 nodes:** + - **Generator:** 60 replicas (10m CPU limit) + - Agent RECEIVED/EXPORTED: 28.6K + - Agent Memory: 78/71 + - Agent CPU: 1.3/1.3 +- **3 nodes:** + - **Generator:** 90 replicas (10m CPU limit) + - Agent RECEIVED/EXPORTED: 44.6K + - Agent Memory: ~76-90 + - Agent CPU: ~1.3 + + +## Comparison with FluentBit Setup +In the FluentBit setup, for the very same (initial) scenario (that is, 10 generator replicas [old setup] or 2 agents), the [load test](https://github.com/kyma-project/telemetry-manager/actions/runs/12691802471) outputs the following values for the agent: +- Exported Log Records/second: 27.8K + +## Conclusions +### Benchmarking Session #1 (before 15 Jan) + - Compared to the FluentBit counterpart setup, a lower performance can be expected. + - Backpressure is currently not backpropagated from the gateway to the agent, resulting in logs being queued or lost on the gateway end. That's because the agent has no way of knowing when to stop, thus exports data continuously (this is a known issue, which is expected be solved by the OTel community in the next half year). + - If the load is increased (that is, more generators, more logs, or more data), the log agent slows down. + - The network communication between the agent and the gateway or/and the gateway represent a bottleneck in this setup. That's concluded because higher throughput was observed when using just a debug endpoint as an exporter. 
+ - CPU and memory consumption are surprisingly low, and the efficiency was not improved by removing the limits (quite the opposite was observed, with the CPU throttling more often and the throughput decreasing). + - If the batch processor is enabled, throughput increased. But this comes at the cost of losing logs in some scenarios. + - Further methods of improving the throughput might still be worth investigating. + +### Benchmarking Session #2 (after 15 Jan) + - Removing the gateway improves throughput. + - We now better understand the performance impact of each OTel processor and of enabling or disabling compression. + - The generators' configuration greatly influences the setup: More generators exporting less data and taking less CPU leads to higher throughput than fewer generators taking more CPU and exporting more data. + - There is a hard limit (see debug endpoint scenario) that we still don't fully understand, because strictly based on the benchmarking numbers of OTel, we should be getting higher throughput. It's possible that something related to the infrastructure could be influencing this. + - We now have a more performant setup configuration, being more comparable with the numbers from the FluentBit setup \ No newline at end of file diff --git a/docs/contributor/benchmarks/otlp-logs-validation.yaml b/docs/contributor/benchmarks/otlp-logs-validation.yaml new file mode 100644 index 000000000..38b0a7764 --- /dev/null +++ b/docs/contributor/benchmarks/otlp-logs-validation.yaml @@ -0,0 +1,244 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/name: telemetry-log-agent + name: telemetry-log-agent + namespace: kyma-system +--- +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/port: "8888" + prometheus.io/scheme: http + prometheus.io/scrape: "true" + labels: + app.kubernetes.io/name: telemetry-log-agent + telemetry.kyma-project.io/self-monitor: enabled + name: telemetry-log-agent-metrics + namespace: kyma-system +spec: + internalTrafficPolicy: Cluster + ipFamilies: + - IPv4 + ipFamilyPolicy: SingleStack + ports: + - name: http-metrics + port: 8888 + protocol: TCP + targetPort: 8888 + selector: + app.kubernetes.io/name: telemetry-log-agent + sessionAffinity: None + type: ClusterIP +status: + loadBalancer: {} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/name: telemetry-log-agent + name: telemetry-log-agent + namespace: kyma-system +data: + relay: | + exporters: + otlp: + endpoint: telemetry-otlp-logs.kyma-system:4317 + tls: + insecure: true + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + sending_queue: + enabled: false + + extensions: + file_storage: + directory: /var/lib/otelcol + health_check: + endpoint: ${env:MY_POD_IP}:13133 + pprof: + endpoint: 127.0.0.1:1777 + + processors: + memory_limiter: + check_interval: 5s + limit_percentage: 80 + spike_limit_percentage: 25 + transform/set-instrumentation-scope-runtime: + error_mode: ignore + metric_statements: + - context: scope + statements: + - set(version, "main") + - set(name, "io.kyma-project.telemetry/runtime") + + receivers: + filelog: + exclude: + - /var/log/pods/kyma-system_telemetry-log-agent*/*/*.log # exclude self + - /var/log/pods/kyma-system_telemetry-fluent-bit*/*/*.log # exclude FluentBit + include: + - /var/log/pods/*/*/*.log + include_file_name: false + include_file_path: true + operators: + - type: container + id: container-parser + add_metadata_from_filepath: true + format: 
containerd + - from: attributes.stream + if: attributes.stream != nil + to: attributes["log.iostream"] + type: move + - if: body matches "^{.*}$" + parse_from: body + parse_to: attributes + type: json_parser + - from: body + to: attributes.original + type: copy + - from: attributes.message + if: attributes.message != nil + to: body + type: move + - from: attributes.msg + if: attributes.msg != nil + to: body + type: move + - if: attributes.level != nil + parse_from: attributes.level + type: severity_parser + retry_on_failure: + enabled: true + start_at: beginning + storage: file_storage + + service: + extensions: + - health_check + - pprof + - file_storage + pipelines: + logs: + exporters: + - otlp + processors: + - memory_limiter + - transform/set-instrumentation-scope-runtime + receivers: + - filelog + telemetry: + metrics: + readers: + - pull: + exporter: + prometheus: + host: ${MY_POD_IP} + port: 8888 +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app.kubernetes.io/name: telemetry-log-agent + name: telemetry-log-agent + namespace: kyma-system +spec: + selector: + matchLabels: + app.kubernetes.io/name: telemetry-log-agent + template: + metadata: + labels: + app.kubernetes.io/name: telemetry-log-agent + sidecar.istio.io/inject: "true" + annotations: + traffic.sidecar.istio.io/excludeInboundPorts: 8888,15020 + traffic.sidecar.istio.io/includeInboundPorts: "*" + traffic.sidecar.istio.io/includeOutboundIPRanges: "*" + spec: + containers: + - args: + - --config=/conf/relay.yaml + env: + - name: MY_POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + image: europe-docker.pkg.dev/kyma-project/prod/kyma-otel-collector:0.114.0-main + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 13133 + scheme: HTTP + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + name: collector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 13133 + scheme: HTTP + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + resources: + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 100m + memory: 50Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + runAsNonRoot: false + runAsUser: 0 + seccompProfile: + type: RuntimeDefault + volumeMounts: + - mountPath: /conf + name: config + - mountPath: /var/log/pods + name: varlogpods + readOnly: true + - mountPath: /var/lib/otelcol + name: varlibotelcol + priorityClassName: telemetry-priority-class-high + securityContext: + runAsNonRoot: false + seccompProfile: + type: RuntimeDefault + serviceAccountName: telemetry-log-agent + terminationGracePeriodSeconds: 30 + volumes: + - configMap: + defaultMode: 420 + items: + - key: relay + path: relay.yaml + name: telemetry-log-agent + name: config + - hostPath: + path: /var/log/pods + type: "" + name: varlogpods + - hostPath: + path: /var/lib/otelcol + type: DirectoryOrCreate + name: varlibotelcol diff --git a/docs/contributor/pocs/assets/otel-log-agent-values.yaml b/docs/contributor/pocs/assets/otel-log-agent-values.yaml index 6c3fefe8e..300091bb8 100644 --- a/docs/contributor/pocs/assets/otel-log-agent-values.yaml +++ b/docs/contributor/pocs/assets/otel-log-agent-values.yaml @@ -116,7 +116,7 @@ config: exporters: otlp: - endpoint: log-gateway-opentelemetry-collector:4317 + endpoint: telemetry-otlp-logs.kyma-system:4317 tls: insecure: true service: @@ -125,7 +125,7 @@ config: address: ${MY_POD_IP}:8888 pipelines: logs: - 
processors: {} + processors: [] exporters: - otlp @@ -142,9 +142,10 @@ ports: enabled: true serviceMonitor: - enabled: true + enabled: false metricsEndpoints: - port: metrics image: pullPolicy: Always + repository: "otel/opentelemetry-collector-k8s" diff --git a/docs/contributor/pocs/assets/otel-log-gateway-values.yaml b/docs/contributor/pocs/assets/otel-log-gateway-values.yaml index 9a5f3f905..c276e2654 100644 --- a/docs/contributor/pocs/assets/otel-log-gateway-values.yaml +++ b/docs/contributor/pocs/assets/otel-log-gateway-values.yaml @@ -73,6 +73,7 @@ securityContext: image: pullPolicy: Always + repository: "otel/opentelemetry-collector-k8s" rollout: rollingUpdate: {} diff --git a/docs/contributor/pocs/assets/otel-logs-values.yaml b/docs/contributor/pocs/assets/otel-logs-values.yaml index 470608bb9..0024b60be 100644 --- a/docs/contributor/pocs/assets/otel-logs-values.yaml +++ b/docs/contributor/pocs/assets/otel-logs-values.yaml @@ -129,3 +129,6 @@ config: extraEnvsFrom: - secretRef: name: sap-cloud-logging + +image: + repository: "otel/opentelemetry-collector-k8s" \ No newline at end of file diff --git a/hack/load-tests/log-agent-test-setup-generator.yml b/hack/load-tests/log-agent-test-setup-generator.yml new file mode 100644 index 000000000..14e29e6d8 --- /dev/null +++ b/hack/load-tests/log-agent-test-setup-generator.yml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: log-load-generator + namespace: log-load-test +spec: + replicas: 20 + selector: + matchLabels: + app.kubernetes.io/name: logs-load-generator + template: + metadata: + labels: + app.kubernetes.io/name: logs-load-generator + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - logs-load-generator + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - -f=json + - -l + image: mingrammer/flog + imagePullPolicy: Always + name: flog + resources: + limits: + cpu: 50m + memory: 200Mi + requests: + cpu: 10m + memory: 50Mi diff --git a/hack/load-tests/log-agent-test-setup.yaml b/hack/load-tests/log-agent-test-setup.yaml new file mode 100644 index 000000000..00a9ea9a7 --- /dev/null +++ b/hack/load-tests/log-agent-test-setup.yaml @@ -0,0 +1,108 @@ +apiVersion: telemetry.kyma-project.io/v1alpha1 +kind: LogPipeline +metadata: + name: load-test-1 +spec: + output: + otlp: + endpoint: + value: http://log-receiver.log-load-test:4317 + tls: + insecure: true + insecureSkipVerify: true + protocol: grpc +--- +apiVersion: v1 +kind: Namespace +metadata: + name: log-load-test +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: log-receiver + namespace: log-load-test + labels: + app.kubernetes.io/name: log-receiver +data: + config.yaml: | + receivers: + otlp: + protocols: + grpc: + endpoint: ${MY_POD_IP}:4317 + http: + endpoint: ${MY_POD_IP}:4318 + exporters: + nop: + service: + pipelines: + logs: + receivers: + - otlp + exporters: + - nop +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/name: log-receiver + name: log-receiver + namespace: log-load-test +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: log-receiver + template: + metadata: + labels: + app.kubernetes.io/name: log-receiver + sidecar.istio.io/inject: "true" + spec: + volumes: + - name: collector-config + configMap: + name: log-receiver + securityContext: + fsGroup: 101 + containers: + - image: 
otel/opentelemetry-collector-contrib:0.114.0 + name: otel-collector + resources: + limits: + memory: 2048Mi + requests: + memory: 2048Mi + volumeMounts: + - name: collector-config + mountPath: /etc/collector + args: + - --config=/etc/collector/config.yaml + env: + - name: MY_POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/name: log-receiver + name: log-receiver + namespace: log-load-test +spec: + ports: + - name: grpc-otlp + port: 4317 + protocol: TCP + targetPort: 4317 + - name: http-otlp + port: 4318 + protocol: TCP + targetPort: 4318 + selector: + app.kubernetes.io/name: log-receiver diff --git a/hack/load-tests/log-backpressure-config.yaml b/hack/load-tests/log-backpressure-config.yaml new file mode 100644 index 000000000..82ba506a1 --- /dev/null +++ b/hack/load-tests/log-backpressure-config.yaml @@ -0,0 +1,21 @@ +apiVersion: networking.istio.io/v1 +kind: VirtualService +metadata: + name: log-receiver-fault + namespace: log-load-test +spec: + hosts: + - log-receiver + http: + - fault: + abort: + httpStatus: 503 + percentage: + value: 70 + delay: + percentage: + value: 70 + fixedDelay: 1s + route: + - destination: + host: log-receiver \ No newline at end of file diff --git a/hack/load-tests/run-load-test.sh b/hack/load-tests/run-load-test.sh index 5b1557a4b..38da9db8b 100755 --- a/hack/load-tests/run-load-test.sh +++ b/hack/load-tests/run-load-test.sh @@ -385,9 +385,9 @@ function get_result_and_cleanup_metricagent() { function get_result_and_cleanup_log_otel() { RESULT_TYPE="log" - QUERY_RECEIVED='query=round(sum(rate(otelcol_receiver_accepted_log_records{service="log-gateway-metrics"}[20m])))' + QUERY_RECEIVED='query=round(sum(rate(otelcol_receiver_accepted_log_records{service=~"log-gateway-metrics"}[20m])))' QUERY_EXPORTED='query=round(sum(rate(otelcol_exporter_sent_log_records{service=~"log-gateway-metrics"}[20m])))' - QUERY_QUEUE='query=avg(sum(otelcol_exporter_queue_size{service="log-gateway-metrics"}))' + QUERY_QUEUE='query=avg(sum(otelcol_exporter_queue_size{service=~"log-gateway-metrics"}))' QUERY_MEMORY='query=round(sum(avg_over_time(container_memory_working_set_bytes{namespace="log-load-test", container="collector"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="log-load-test", workload="log-gateway"}[20m])) by (pod) / 1024 / 1024)' QUERY_CPU='query=round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="log-load-test"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="log-load-test", workload="log-gateway"}[20m])) by (pod), 0.1)' From c87a9d45f02775dc0521348c2bd4a70c02afa563 Mon Sep 17 00:00:00 2001 From: Korbinian Stoemmer Date: Tue, 28 Jan 2025 13:12:03 +0100 Subject: [PATCH 3/6] chore: Remove unnecessary kube-rbac-proxy reference (#1790) --- config/default/kustomization.yaml | 7 --- config/default/manager_auth_proxy_patch.yaml | 56 -------------------- config/rbac/kustomization.yaml | 7 --- 3 files changed, 70 deletions(-) delete mode 100644 config/default/manager_auth_proxy_patch.yaml diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml index a0dc0faf0..ef4a1305c 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -34,13 +34,6 @@ resources: - ../networking patchesStrategicMerge: -# Protect the /metrics endpoint by 
putting it behind auth. -# If you want your controller-manager to expose the /metrics -# endpoint w/o any authn/z, please comment the following line. -# - manager_auth_proxy_patch.yaml - - - # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in # crd/kustomization.yaml #- manager_webhook_patch.yaml diff --git a/config/default/manager_auth_proxy_patch.yaml b/config/default/manager_auth_proxy_patch.yaml deleted file mode 100644 index 08696a2a4..000000000 --- a/config/default/manager_auth_proxy_patch.yaml +++ /dev/null @@ -1,56 +0,0 @@ ---- -# This patch inject a sidecar container which is a HTTP proxy for the -# controller manager, it performs RBAC authorization against the Kubernetes API using SubjectAccessReviews. -apiVersion: apps/v1 -kind: Deployment -metadata: - name: manager - namespace: system -spec: - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: kubernetes.io/arch - operator: In - values: - - amd64 - - arm64 - - ppc64le - - s390x - - key: kubernetes.io/os - operator: In - values: - - linux - containers: - - name: kube-rbac-proxy - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - "ALL" - image: gcr.io/kubebuilder/kube-rbac-proxy:v0.13.1 - args: - - "--secure-listen-address=0.0.0.0:8443" - - "--upstream=http://127.0.0.1:8080/" - - "--logtostderr=true" - - "--v=0" - ports: - - containerPort: 8443 - protocol: TCP - name: https - resources: - limits: - cpu: 500m - memory: 128Mi - requests: - cpu: 5m - memory: 64Mi - - name: manager - args: - - "--health-probe-bind-address=:8081" - - "--metrics-bind-address=127.0.0.1:8080" - - "--leader-elect" diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index 5188a34db..b935fe131 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -10,10 +10,3 @@ resources: - role_binding.yaml - leader_election_role.yaml - leader_election_role_binding.yaml -# Comment the following 4 lines if you want to disable -# the auth proxy (https://github.com/brancz/kube-rbac-proxy) -# which protects your /metrics endpoint. 
-# - auth_proxy_service.yaml -# - auth_proxy_role.yaml -# - auth_proxy_role_binding.yaml -# - auth_proxy_client_clusterrole.yaml From b7d18b226a345c66e4df64e6262d18823bc7f956 Mon Sep 17 00:00:00 2001 From: Andreas Thaler Date: Tue, 28 Jan 2025 17:32:12 +0100 Subject: [PATCH 4/6] chore: Migrate to checkmarx (#1791) --- internal/tools/populateimages/main.go | 28 ++++++++++++++++++--------- sec-scanners-config.yaml | 9 ++++++++- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/internal/tools/populateimages/main.go b/internal/tools/populateimages/main.go index 525ebba04..a76947984 100644 --- a/internal/tools/populateimages/main.go +++ b/internal/tools/populateimages/main.go @@ -14,15 +14,21 @@ import ( ) type secScanConfig struct { - ModuleName string `yaml:"module-name"` - Protecode []string `yaml:"protecode"` - WhiteSource whiteSource `yaml:"whitesource"` + ModuleName string `yaml:"module-name"` + Kind string `yaml:"kind"` + Protecode []string `yaml:"protecode"` + WhiteSource whiteSource `yaml:"whitesource"` + CheckmarxOne checkmarxOne `yaml:"checkmarx-one"` } type whiteSource struct { - Language string `yaml:"language"` - Subprojects bool `yaml:"subprojects"` - Exclude []string `yaml:"exclude"` + Language string `yaml:"language"` + Exclude []string `yaml:"exclude"` +} + +type checkmarxOne struct { + Preset string `yaml:"preset"` + Exclude []string `yaml:"exclude"` } func main() { @@ -176,11 +182,15 @@ func generateSecScanConfig(data map[string]string) error { imgs := []string{data["ENV_IMG"], data["DEFAULT_FLUENTBIT_EXPORTER_IMAGE"], data["DEFAULT_FLUENTBIT_IMAGE"], data["DEFAULT_OTEL_COLLECTOR_IMAGE"], data["DEFAULT_SELFMONITOR_IMAGE"]} secScanCfg := secScanConfig{ ModuleName: "telemetry", + Kind: "kyma", Protecode: imgs, WhiteSource: whiteSource{ - Language: "golang-mod", - Subprojects: false, - Exclude: []string{"**/mocks/**", "**/stubs/**", "**/test/**", "**/*_test.go"}, + Language: "golang-mod", + Exclude: []string{"**/mocks/**", "**/stubs/**", "**/test/**", "**/*_test.go"}, + }, + CheckmarxOne: checkmarxOne{ + Preset: "go-default", + Exclude: []string{"**/mocks/**", "**/stubs/**", "**/test/**", "**/*_test.go"}, }, } diff --git a/sec-scanners-config.yaml b/sec-scanners-config.yaml index 48f7eca35..0a1612ba2 100644 --- a/sec-scanners-config.yaml +++ b/sec-scanners-config.yaml @@ -1,4 +1,5 @@ module-name: telemetry +kind: kyma protecode: - europe-docker.pkg.dev/kyma-project/prod/telemetry-manager:main - europe-docker.pkg.dev/kyma-project/prod/directory-size-exporter:v20241212-e4adf27f @@ -7,7 +8,13 @@ protecode: - europe-docker.pkg.dev/kyma-project/prod/tpi/telemetry-self-monitor:3.1.0-98bf175 whitesource: language: golang-mod - subprojects: false + exclude: + - '**/mocks/**' + - '**/stubs/**' + - '**/test/**' + - '**/*_test.go' +checkmarx-one: + preset: go-default exclude: - '**/mocks/**' - '**/stubs/**' From 1302c47c13e71d0b1721d033aaf27ee1ce4d130b Mon Sep 17 00:00:00 2001 From: Andreas Thaler Date: Wed, 29 Jan 2025 20:08:24 +0100 Subject: [PATCH 5/6] docs: Fix dynatrace links (#1794) --- docs/user/integration/dynatrace/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/user/integration/dynatrace/README.md b/docs/user/integration/dynatrace/README.md index 77ab51719..dd72e5bf4 100644 --- a/docs/user/integration/dynatrace/README.md +++ b/docs/user/integration/dynatrace/README.md @@ -50,11 +50,11 @@ With the Kyma Telemetry module, you gain even more visibility by adding custom s ## Dynatrace Setup -There are different ways to 
deploy Dynatrace on Kubernetes. All [deployment options](https://www.dynatrace.com/support/help/setup-and-configuration/setup-on-container-platforms/kubernetes/get-started-with-kubernetes-monitoring/deployment-options-k8s) are based on the [Dynatrace Operator](https://github.com/Dynatrace/dynatrace-operator). +There are different ways to deploy Dynatrace on Kubernetes. All [deployment options](https://docs.dynatrace.com/docs/ingest-from/setup-on-k8s/deployment) are based on the [Dynatrace Operator](https://github.com/Dynatrace/dynatrace-operator). 1. Install Dynatrace with the namespace you prepared earlier. > [!NOTE] - > By default, Dynatrace uses the classic full-stack injection. However, for better stability, we recommend using the [cloud-native fullstack injection](https://docs.dynatrace.com/docs/setup-and-configuration/setup-on-k8s/installation/cloud-native-fullstack). + > By default, Dynatrace used the classic full-stack injection. However, for better stability, we recommend using the [cloud-native fullstack injection](https://docs.dynatrace.com/docs/ingest-from/setup-on-k8s/guides/operation/migration/classic-to-cloud-native). 2. In the DynaKube resource, configure the correct `apiurl` of your environment. @@ -76,7 +76,7 @@ There are different ways to deploy Dynatrace on Kubernetes. All [deployment opti 5. In the Dynatrace Hub, enable the **Istio Service Mesh** extension and annotate your services as outlined in the description. -6. If you have a workload exposing metrics in the Prometheus format, you can collect custom metrics in Prometheus format by [annotating the workload](https://docs.dynatrace.com/docs/platform-modules/infrastructure-monitoring/container-platform-monitoring/kubernetes-monitoring/monitor-prometheus-metrics). If the workload has an Istio sidecar, you must either weaken the mTLS setting for the metrics port by defining an [Istio PeerAuthentication](https://istio.io/latest/docs/reference/config/security/peer_authentication/#PeerAuthentication) or exclude the port from interception by the Istio proxy by placing an `traffic.sidecar.istio.io/excludeInboundPorts` annotaion on your Pod that lists the metrics port. +6. If you have a workload exposing metrics in the Prometheus format, you can collect custom metrics in Prometheus format by [annotating the workload](https://docs.dynatrace.com/docs/observe/infrastructure-monitoring/container-platform-monitoring/kubernetes-monitoring/monitor-prometheus-metrics#annotate-kubernetes-services). If the workload has an Istio sidecar, you must either weaken the mTLS setting for the metrics port by defining an [Istio PeerAuthentication](https://istio.io/latest/docs/reference/config/security/peer_authentication/#PeerAuthentication) or exclude the port from interception by the Istio proxy by placing an `traffic.sidecar.istio.io/excludeInboundPorts` annotaion on your Pod that lists the metrics port. As a result, you see data arriving in your environment, advanced Kubernetes monitoring is possible, and Istio metrics are available. @@ -86,14 +86,14 @@ Next, you set up the ingestion of custom span and Istio span data, and, optional ### Create Secret -1. To push custom metrics and spans to Dynatrace, set up a [dataIngestToken](https://docs.dynatrace.com/docs/manage/access-control/access-tokens). +1. To push custom metrics and spans to Dynatrace, set up a [dataIngestToken](https://docs.dynatrace.com/docs/manage/identity-access-management/access-tokens-and-oauth-clients/access-tokens/personal-access-token). 
- Follow the instructions in [Dynatrace: Generate an access token](https://docs.dynatrace.com/docs/manage/access-control/access-tokens#create-api-token) and select the following scopes: + Follow the instructions in [Dynatrace: Generate an access token](https://docs.dynatrace.com/docs/manage/identity-access-management/access-tokens-and-oauth-clients/access-tokens/personal-access-token#generate-personal-access-tokens) and select the following scopes: - **Ingest metrics** - **Ingest OpenTelemetry traces** -2. Create an [apiToken](https://docs.dynatrace.com/docs/manage/access-control/access-tokens) by selecting the template `Kubernetes: Dynatrace Operator`. +2. Create an [apiToken](https://docs.dynatrace.com/docs/manage/identity-access-management/access-tokens-and-oauth-clients/access-tokens/personal-access-token) by selecting the template `Kubernetes: Dynatrace Operator`. 3. To create a new Secret containing your access tokens, replace the `` and `` placeholder with the `apiToken` and `dataIngestToken` you created, replace the `` placeholder with the Dynatrace endpoint, and run the following command: @@ -169,7 +169,7 @@ There are several approaches to ingest custom metrics to Dynatrace, each with di - Use a MetricPipeline to push metrics directly. > [!NOTE] - > The Dynatrace OTLP API does [not support](https://docs.dynatrace.com/docs/extend-dynatrace/opentelemetry/getting-started/metrics/ingest/migration-guide-otlp-exporter#migrate-collector-configuration) the full OTLP specification and needs custom transformation. A MetricPipeline does not support these transformation features, so that only metrics can be ingested that don't hit the limitations. At the moment, metrics of type "Histogram" and "Summary" are not supported. Furthermore, "Sum"s must use "delta" aggregation temporality. + > The Dynatrace OTLP API does [not support](https://docs.dynatrace.com/docs/shortlink/opentelemetry-metrics-limitations#limitations) the full OTLP specification and needs custom transformation. A MetricPipeline does not support these transformation features, so that only metrics can be ingested that don't hit the limitations. At the moment, metrics of type "Histogram" and "Summary" are not supported. Furthermore, "Sum"s must use "delta" aggregation temporality. Use this setup when your application pushes metrics to the telemetry metric service natively with OTLP, and if you have explicitly enabled "delta" aggregation temporality. You cannot enable additional inputs for the MetricPipeline. @@ -202,7 +202,7 @@ There are several approaches to ingest custom metrics to Dynatrace, each with di EOF ``` - 1. Start pushing metrics to the metric gateway using [delta aggregation temporality.](https://docs.dynatrace.com/docs/extend-dynatrace/opentelemetry/getting-started/metrics/limitations#aggregation-temporality) + 1. Start pushing metrics to the metric gateway using [delta aggregation temporality.](https://docs.dynatrace.com/docs/ingest-from/opentelemetry/getting-started/metrics/limitations#aggregation-temporality) 1. To find metrics from your Kyma cluster in the Dynatrace UI, go to **Observe & Explore** > **Metrics**. @@ -240,7 +240,7 @@ There are several approaches to ingest custom metrics to Dynatrace, each with di - Use the Dynatrace metric ingestion with Prometheus exporters. 
- Use the [Dynatrace annotation approach](https://docs.dynatrace.com/docs/platform-modules/infrastructure-monitoring/container-platform-monitoring/kubernetes-monitoring/monitor-prometheus-metrics), where the Dynatrace ActiveGate component running in your cluster scrapes workloads that are annotated with Dynatrace-specific annotations.
+ Use the [Dynatrace annotation approach](https://docs.dynatrace.com/docs/observe/infrastructure-monitoring/container-platform-monitoring/kubernetes-monitoring/monitor-prometheus-metrics), where the Dynatrace ActiveGate component running in your cluster scrapes workloads that are annotated with Dynatrace-specific annotations. This approach works well with workloads that expose metrics in the typical Prometheus format when not running with Istio. If you use Istio, you must disable Istio interception for the relevant metric port with the [traffic.istio.io/excludeInboundPorts](https://istio.io/latest/docs/reference/config/annotations/#TrafficExcludeInboundPorts) annotation. To collect Istio metrics from the envoys themselves, you need additional Dynatrace annotations for every workload.

From 7bfdf6f98710fc1f7b07aa23f46717699c3eb581 Mon Sep 17 00:00:00 2001
From: Korbinian Stoemmer
Date: Thu, 30 Jan 2025 10:16:42 +0100
Subject: [PATCH 6/6] docs: Update release process documentation (#1776)

Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com>
---
 docs/contributor/releasing.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/contributor/releasing.md b/docs/contributor/releasing.md
index e8fa7230f..98525c5b0 100644
--- a/docs/contributor/releasing.md
+++ b/docs/contributor/releasing.md
@@ -2,7 +2,9 @@ ## Release Process
-This release process covers the steps to release new major and minor versions for the Kyma Telemetry module.
+This release process covers the steps to release new major and minor versions for the Kyma Telemetry module.
+
+Together with the module release, prepare a new release of the [opentelemetry-collector-components](https://github.com/kyma-project/opentelemetry-collector-components). You will need this later in the release process of the Telemetry Manager. The version of `opentelemetry-collector-components` will include the Telemetry Manager version as part of its version (`{CURRENT_OCC_VERSION}-{TELEMETRY_MANAGER_VERSION}`).
 1. Verify that all issues in the [GitHub milestone](https://github.com/kyma-project/telemetry-manager/milestones) related to the version are closed.