# charts/vault-monitoring/values.yaml

# Default values for vault-monitoring.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# -- Name override to be used by Helm
nameOverride: ''
# -- Full name override to be used by Helm
fullnameOverride: ''

vault:
  # -- Hostname of the Vault server
  serverName: vault.example.com
  # -- Port to connect to on the Vault server
  port: 443
  # -- Name of the Vault port used in the services
  portName: https
  # -- CA path to include in the configuration
  ca_path: '/etc/vault/ssl/ca.crt'
  # -- IP address of the Vault server to connect to
  ip: '10.1.2.3'
  # -- Scheme used for the connection
  scheme: https
  # -- When set, a secret is created that must be mounted in Prometheus
  # for the ServiceMonitor to work
  ca: ''
  auth:
    # -- Path where the Kubernetes auth is mounted on Vault
    mount_path: auth/kubernetes
    # -- Vault role used for the connection
    role: metrics
  service:
    # -- Type of the Vault service. Choose ExternalName to connect to an
    # external Vault server
    type: ExternalName
    # -- Selector of the Vault service used for endpoint selection.
    # Leave empty when using ExternalName
    selector: {}
  serviceMonitor:
    # -- Whether or not the serviceMonitor should be created
    create: true
    # -- Labels to set on the Vault serviceMonitor
    labels: {}
    # -- scrapeTimeout of the serviceMonitor
    scrapeTimeout: 30s
    # -- Set to true to enable authentication
    authentication: true
    # -- bearerTokenFile used for authentication. Keep empty for
    # unauthenticated access
    bearerTokenFile: '/etc/prometheus/config_out/.vault-token'

prometheusRules:
  # -- whether or not the prometheus alerts are enabled
  enabled: false
  # -- set namespace where the prometheusRules will be created
  namespace: ""
  # -- labels to set in the prometheus alerts
  labels: {}
  # -- set of prometheus alerts to define
  # @default -- list of predefined alerts
  rules:
    - alert: VaultAutopilotNodeHealthy
      # vault_autopilot_healthy is set to 1 if Autopilot considers all nodes
      # healthy, so a value below 1 held for one minute raises this alert.
      # https://www.vaultproject.io/docs/internals/telemetry#integrated-storage-raft-autopilot
      expr: 'vault_autopilot_healthy < 1'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Autopilot Vault Raft node unhealthy
        description: At least one of the Autopilot Vault Raft nodes is unhealthy
    - alert: VaultLeadershipLoss
      # Sums the one-hour increase of the leadership-lost count over all series
      # of the "vault-monitoring" job; fires once the total stays above 5 for 1m.
      expr: 'sum(increase(vault_core_leadership_lost_count{job="vault-monitoring"}[1h])) > 5'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: High frequency of Vault leadership losses
        description: There have been more than 5 Vault leadership losses in the past 1h
    - alert: VaultLeadershipStepDowns
      # Sums the one-hour increase of the step-down count over all series of the
      # "vault-monitoring" job; fires once the total stays above 5 for 1m.
      expr: 'sum(increase(vault_core_step_down_count{job="vault-monitoring"}[1h])) > 5'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: High frequency of Vault leadership step downs
        description: There have been more than 5 Vault leadership step downs in the past 1h
    - alert: VaultLeadershipSetupFailures
      # Counts leadership setup failures over the past hour for the
      # "vault-monitoring" job; fires once the total stays above 5 for 1m.
      # Vault documents leadership_setup_failed as a summary (a timing sample),
      # so the number of events is the `_count` series, as in the leadership-lost
      # and step-down alerts. The bare metric name selects the quantile samples,
      # on which increase() is not meaningful.
      expr: 'sum(increase(vault_core_leadership_setup_failed_count{job="vault-monitoring"}[1h])) > 5'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: High frequency of Vault leadership setup failures
        description: There have been more than 5 Vault leadership setup failures in the past 1h
    - alert: VaultRequestFailures
      # True while vault_audit_log_request_failure has grown within the trailing
      # 5m window; the condition must hold continuously for 15m before firing.
      expr: 'increase(vault_audit_log_request_failure[5m]) > 0'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: High frequency of failed Vault requests
        description: There has been an increased number of failed Vault requests in the last 15 minutes
    - alert: VaultResponseFailures
      # True while vault_audit_log_response_failure has grown within the trailing
      # 5m window; the condition must hold continuously for 15m before firing.
      expr: 'increase(vault_audit_log_response_failure[5m]) > 0'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: High frequency of failed Vault responses
        description: There has been an increased number of failed Vault responses in the last 15 minutes
    - alert: VaultTokenCreate
      # True while vault_token_create_count rose by more than 100 within the
      # trailing 5m window; must hold continuously for 15m before firing.
      expr: 'increase(vault_token_create_count[5m]) > 100'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: High frequency of created Vault token
        description: There has been an increased number of Vault token creation in the last 15 minutes
    - alert: VaultTokenStore
      # True while vault_token_store_count rose by more than 100 within the
      # trailing 5m window; must hold continuously for 15m before firing.
      expr: 'increase(vault_token_store_count[5m]) > 100'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: High frequency of stored Vault token
        description: There has been an increased number of Vault token storing in the last 15 minutes
    - alert: VaultLowFailureTolerance
      # Fires when vault_autopilot_failure_tolerance has been exactly 0 for 5m.
      expr: 'vault_autopilot_failure_tolerance == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Vault Raft Cluster Failure Tolerance Critical
        description: Vault Raft cluster has a failure tolerance of 0