Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate Kubelet ServiceMonitor to ScrapeConfigs #2235

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 9 additions & 35 deletions jsonnet/kube-prometheus/addons/insecure-kubelet.libsonnet
Original file line number Diff line number Diff line change
@@ -1,40 +1,14 @@
{
prometheus+: {
serviceMonitorKubelet+:
{
spec+: {
endpoints: [
{
port: 'http-metrics',
scheme: 'http',
interval: '30s',
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
relabelings: [
{ sourceLabels: ['__metrics_path__'], targetLabel: 'metrics_path' },
],
},
{
port: 'http-metrics',
scheme: 'http',
path: '/metrics/cadvisor',
interval: '30s',
honorLabels: true,
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
relabelings: [
{ sourceLabels: ['__metrics_path__'], targetLabel: 'metrics_path' },
],
metricRelabelings: [
// Drop a bunch of metrics which are disabled but still sent, see
// https://github.com/google/cadvisor/issues/1925.
{
sourceLabels: ['__name__'],
regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)',
action: 'drop',
},
],
},
],
},
// Scrape the kubelet over plain HTTP instead of HTTPS.
// The scheme is written in upper case so it uses the same spelling as the
// 'HTTPS' value set on the base kubelet ScrapeConfig.
// NOTE(review): the ScrapeConfig CRD is understood to validate `scheme`
// against the upper-case values HTTP/HTTPS — confirm against the operator
// version in use.
scrapeConfigKubelet+: {
  spec+: {
    scheme: 'HTTP',
  },
},
// Same plain-HTTP override for the cAdvisor scrape configuration.
scrapeConfigKubeletCadvisor+: {
  spec+: {
    scheme: 'HTTP',
  },
},
},
}
175 changes: 94 additions & 81 deletions jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ local defaults = {
},
},
kubeProxy:: false,
prometheusServiceAccountTokenSecretName: 'prometheus-k8s-token',
};

function(params) {
Expand Down Expand Up @@ -87,102 +88,114 @@ function(params) {
},
},

serviceMonitorKubelet: {
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
scrapeConfigKubelet: {
apiVersion: 'monitoring.coreos.com/v1alpha1',
kind: 'ScrapeConfig',
metadata: k8s._metadata {
name: 'kubelet',
labels+: { 'app.kubernetes.io/name': 'kubelet' },
},
spec: {
jobLabel: 'app.kubernetes.io/name',
endpoints: [
authorization: {
credentials: {
key: 'token',
name: k8s._config.prometheusServiceAccountTokenSecretName,
},
type: 'Bearer',
},
honorLabels: true,
kubernetesSDConfigs: [{ role: 'Node' }],
metricRelabelings: relabelings,
metricsPath: '/metrics',
// Most of these relabelings exist to preserve as much backwards compatibility as possible
// with the old ServiceMonitor scrape configuration.
relabelings: [
{
port: 'https-metrics',
scheme: 'https',
interval: '30s',
honorLabels: true,
tlsConfig: { insecureSkipVerify: true },
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
metricRelabelings: relabelings,
relabelings: [{
action: 'replace',
sourceLabels: ['__metrics_path__'],
targetLabel: 'metrics_path',
}],
action: 'replace',
sourceLabels: ['__metrics_path__'],
targetLabel: 'metrics_path',
},
{
port: 'https-metrics',
scheme: 'https',
path: '/metrics/cadvisor',
interval: '30s',
honorLabels: true,
honorTimestamps: false,
tlsConfig: {
insecureSkipVerify: true,
},
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
relabelings: [{
action: 'replace',
sourceLabels: ['__metrics_path__'],
targetLabel: 'metrics_path',
}],
metricRelabelings: [
// Drop a bunch of metrics which are disabled but still sent, see
// https://github.com/google/cadvisor/issues/1925.
{
sourceLabels: ['__name__'],
regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)',
action: 'drop',
},
// Drop cAdvisor metrics with no (pod, namespace) labels while preserving ability to monitor system services resource usage (cardinality estimation)
{
sourceLabels: ['__name__', 'pod', 'namespace'],
action: 'drop',
regex: '(' + std.join('|',
[
'container_spec_.*', // everything related to cgroup specification and thus static data (nodes*services*5)
'container_file_descriptors', // file descriptors limits and global numbers are exposed via (nodes*services)
'container_sockets', // used sockets in cgroup. Usually not important for system services (nodes*services)
'container_threads_max', // max number of threads in cgroup. Usually for system services it is not limited (nodes*services)
'container_threads', // used threads in cgroup. Usually not important for system services (nodes*services)
'container_start_time_seconds', // container start. Possibly not needed for system services (nodes*services)
'container_last_seen', // not needed as system services are always running (nodes*services)
]) + ');;',
},
{
sourceLabels: ['__name__', 'container'],
action: 'drop',
regex: '(' + std.join('|',
[
'container_blkio_device_usage_total',
]) + ');.+',
},
],
action: 'replace',
replacement: 'kube-system',
targetLabel: 'namespace',
},
{
port: 'https-metrics',
scheme: 'https',
path: '/metrics/probes',
interval: '30s',
honorLabels: true,
tlsConfig: { insecureSkipVerify: true },
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
relabelings: [{
action: 'replace',
sourceLabels: ['__metrics_path__'],
targetLabel: 'metrics_path',
}],
action: 'replace',
sourceLabels: ['__meta_kubernetes_node_name'],
targetLabel: 'node',
},
{
targetLabel: 'job',
replacement: 'kubelet',
},
],
selector: {
matchLabels: { 'app.kubernetes.io/name': 'kubelet' },
},
namespaceSelector: {
matchNames: ['kube-system'],
scheme: 'HTTPS',
scrapeInterval: '30s',
tlsConfig: {
insecureSkipVerify: true,
},
},
},
// ScrapeConfig for the kubelet's cAdvisor endpoint. It extends
// scrapeConfigKubelet and overrides the name, the metrics path, timestamp
// handling and the metric relabelings.
scrapeConfigKubeletCadvisor: k8s.scrapeConfigKubelet {
  metadata+: {
    name: 'kubelet-cadvisor',
  },
  spec+: {
    honorTimestamps: false,
    metricsPath: '/metrics/cadvisor',
    // These rules drop container_* series, so they belong on the cAdvisor
    // scrape (as they did on the former ServiceMonitor cadvisor endpoint).
    // The list replaces the metricRelabelings inherited from
    // scrapeConfigKubelet rather than appending to it.
    metricRelabelings: [
      // Drop a bunch of metrics which are disabled but still sent, see
      // https://github.com/google/cadvisor/issues/1925.
      {
        sourceLabels: ['__name__'],
        regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)',
        action: 'drop',
      },
      // Drop cAdvisor metrics with no (pod, namespace) labels while preserving ability to monitor system services resource usage (cardinality estimation)
      {
        sourceLabels: ['__name__', 'pod', 'namespace'],
        action: 'drop',
        regex: '(' + std.join('|',
                              [
                                'container_spec_.*',  // everything related to cgroup specification and thus static data (nodes*services*5)
                                'container_file_descriptors',  // file descriptors limits and global numbers are exposed via (nodes*services)
                                'container_sockets',  // used sockets in cgroup. Usually not important for system services (nodes*services)
                                'container_threads_max',  // max number of threads in cgroup. Usually for system services it is not limited (nodes*services)
                                'container_threads',  // used threads in cgroup. Usually not important for system services (nodes*services)
                                'container_start_time_seconds',  // container start. Possibly not needed for system services (nodes*services)
                                'container_last_seen',  // not needed as system services are always running (nodes*services)
                              ]) + ');;',
      },
      {
        sourceLabels: ['__name__', 'container'],
        action: 'drop',
        regex: '(' + std.join('|',
                              [
                                'container_blkio_device_usage_total',
                              ]) + ');.+',
      },
    ],
  },
},
// ScrapeConfig for the kubelet's probes endpoint. It extends
// scrapeConfigKubelet and only changes the name and the metrics path; all
// other settings, including metricRelabelings, are inherited.
scrapeConfigKubeletProbes: k8s.scrapeConfigKubelet {
  metadata+: {
    name: 'kubelet-probes',
  },
  spec+: {
    metricsPath: '/metrics/probes',
  },
},
/*scrapeConfigKubeletSLIs: k8s.scrapeConfigKubelet {
metadata+: {
name: 'kubelet-slis',
},
spec+: {
metricsPath: '/metrics/slis',
scrapeInterval: '5s',
scrapeTimeout: '5s',
},
},*/

serviceMonitorKubeControllerManager: {
apiVersion: 'monitoring.coreos.com/v1',
Expand Down
3 changes: 2 additions & 1 deletion jsonnet/kube-prometheus/main.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ local utils = import './lib/utils.libsonnet';
image: $.values.common.images.prometheusAdapter,
prometheusURL: 'http://prometheus-' + $.values.prometheus.name + '.' + $.values.prometheus.namespace + '.svc:9090/',
rangeIntervals+: {
kubelet: utils.rangeInterval($.kubernetesControlPlane.serviceMonitorKubelet.spec.endpoints[0].interval),
kubelet: utils.rangeInterval($.kubernetesControlPlane.scrapeConfigKubelet.spec.scrapeInterval),
nodeExporter: utils.rangeInterval($.nodeExporter.serviceMonitor.spec.endpoints[0].interval),
},
},
Expand All @@ -127,6 +127,7 @@ local utils = import './lib/utils.libsonnet';
kubernetesControlPlane: {
namespace: $.values.common.namespace,
mixin+: { ruleLabels: $.values.common.ruleLabels },
prometheusServiceAccountTokenSecretName: 'prometheus-' + $.values.prometheus.name + '-token',
},
},

Expand Down
4 changes: 3 additions & 1 deletion kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,13 @@ resources:
- ./manifests/kubeStateMetrics-serviceAccount.yaml
- ./manifests/kubeStateMetrics-serviceMonitor.yaml
- ./manifests/kubernetesControlPlane-prometheusRule.yaml
- ./manifests/kubernetesControlPlane-scrapeConfigKubelet.yaml
- ./manifests/kubernetesControlPlane-scrapeConfigKubeletCadvisor.yaml
- ./manifests/kubernetesControlPlane-scrapeConfigKubeletProbes.yaml
- ./manifests/kubernetesControlPlane-serviceMonitorApiserver.yaml
- ./manifests/kubernetesControlPlane-serviceMonitorCoreDNS.yaml
- ./manifests/kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml
- ./manifests/kubernetesControlPlane-serviceMonitorKubeScheduler.yaml
- ./manifests/kubernetesControlPlane-serviceMonitorKubelet.yaml
- ./manifests/nodeExporter-clusterRole.yaml
- ./manifests/nodeExporter-clusterRoleBinding.yaml
- ./manifests/nodeExporter-daemonset.yaml
Expand Down
69 changes: 69 additions & 0 deletions manifests/kubernetesControlPlane-scrapeConfigKubelet.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
labels:
app.kubernetes.io/name: kubelet
app.kubernetes.io/part-of: kube-prometheus
name: kubelet
namespace: monitoring
spec:
authorization:
credentials:
key: token
name: prometheus-k8s-token
type: Bearer
honorLabels: true
kubernetesSDConfigs:
- role: Node
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers|storage_db_total_size_in_bytes)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_
queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count)
sourceLabels:
- __name__
metricsPath: /metrics
relabelings:
- action: replace
sourceLabels:
- __metrics_path__
targetLabel: metrics_path
- action: replace
replacement: kube-system
targetLabel: namespace
- action: replace
sourceLabels:
- __meta_kubernetes_node_name
targetLabel: node
- replacement: kubelet
targetLabel: job
scheme: HTTPS
scrapeInterval: 30s
tlsConfig:
insecureSkipVerify: true
Loading