# Access Control List for JupyterHub. It will be written to a file mounted at
# /etc/jupyterhub/acl/acl.yaml and parsed by custom authenticator logic.
acl.yaml: {}
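# As a sketch of what an overriding values file could provide here (the group
# names match what the hub's extraConfig further down expects; the usernames
# are hypothetical):
#
#   acl.yaml:
#     admins:
#       - alice
#     instructors:
#       - bob
#     participants:
#       - carol
#       - dave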

nfs:
  enabled: false
  serverIP: ""
  serverName: ""

gitRepoSync:
  enabled: false

nodeCacher:
  enabled: false

tags:
  # Controls whether Prometheus and Grafana should be installed as part of
  # this meta Helm chart. The chart is configured to use them if they are
  # available. It is also worth noting that access to Grafana relies on
  # JupyterHub's configurable proxy in this setup, so JupyterHub needs to be
  # running for Grafana to be accessible.
  metrics: true
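  # A tag like this can also be overridden at deploy time, for example
  # (the release and chart references here are placeholders):
  #
  #   helm upgrade <release> <chart> --set tags.metrics=false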

jupyterhub:
  hub:
    # NOTE: The readinessProbe is disabled because its bad default values can
    #       make a hub under load become unavailable.
    #
    #       ref: https://github.com/jupyterhub/zero-to-jupyterhub-k8s/issues/1732
    readinessProbe:
      enabled: false
    # ref: https://jupyterhub.readthedocs.io/en/stable/reference/services.html#properties-of-a-service
    services:
      grafana:
        # This makes the CHP proxy route /services/grafana to the grafana
        # service in the k8s namespace, which lets us make use of JupyterHub's
        # HTTPS setup without needing something like nginx-ingress +
        # cert-manager and additional ingress k8s resources.
        url: http://grafana
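        # With the service registered like this, Grafana should be reachable at
        # https://<jupyterhub-domain>/services/grafana/, where
        # <jupyterhub-domain> is a placeholder for whatever domain the hub
        # itself is served on.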
    db:
      pvc:
        storageClassName: ssd
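        # NOTE: This assumes the cluster defines a StorageClass named "ssd".
        #       As a rough sketch (hypothetical, assuming GKE's in-tree GCE PD
        #       provisioner), such a class could look like:
        #
        #         apiVersion: storage.k8s.io/v1
        #         kind: StorageClass
        #         metadata:
        #           name: ssd
        #         provisioner: kubernetes.io/gce-pd
        #         parameters:
        #           type: pd-ssd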
    extraConfig:
      00-acl-parsing: |
        import os
        import yaml


        def read_acl(path="/etc/jupyterhub/acl/acl.yaml"):
            """
            Load a mounted Access Control List (ACL) on startup from disk and
            return a dictionary with keys representing group names and with sets
            of usernames as values.
            """
            acl = {}
            if os.path.exists(path):
                print(f"Loading Access Control List (ACL) from {path}")
                with open(path) as f:
                    acl = yaml.safe_load(f)
                for group, usernames in acl.items():
                    acl[group] = {str(username).lower() for username in usernames}
            else:
                print(f"No Access Control List (ACL) at {path}")
            return acl


        acl = read_acl()
        c.Authenticator.admin_users = acl["admins"]
        c.Authenticator.whitelist = set.union(
            acl["admins"],
            acl["instructors"],
            acl["participants"],
        )
    resources:
      requests:
        cpu: 50m
        memory: 1Gi
      limits:
        cpu: 1000m
        memory: 1Gi
  proxy:
    chp:
      resources:
        requests:
          memory: 320Mi
          cpu: 50m
        limits:
          memory: 320Mi
          cpu: 500m
    traefik:
      resources:
        requests:
          memory: 512Mi
          cpu: 50m
        limits:
          memory: 512Mi
          cpu: 1000m

# Reference on the configuration options:
# https://github.com/helm/charts/blob/master/stable/grafana/values.yaml
grafana:
  fullnameOverride: grafana
  # NOTE: It can be useful to be able to render an image of a chart, but that
  #       requires a workaround configuration of the Grafana Helm chart at the
  #       moment.
  #
  #       workaround: https://github.com/helm/charts/issues/21959#issuecomment-640653320
  #       issue with workaround: https://github.com/grafana/grafana/issues/25716
  extraContainers: |
    - name: renderer
      image: grafana/grafana-image-renderer:latest
      resources:
        requests:
          memory: 128Mi
          cpu: 10m
        limits:
          memory: 512Mi
          cpu: 250m
  env:
    GF_RENDERING_SERVER_URL: http://localhost:8081/render
    GF_RENDERING_CALLBACK_URL: http://localhost:3000/services/grafana
  # NOTE: We need Recreate while persistence is backed by a PVC. If we used an
  #       external database, we could do a RollingUpdate instead.
  deploymentStrategy:
    type: Recreate
  persistence:
    type: pvc
    enabled: true
  service:
    annotations:
      prometheus.io/scrape: "true"
      prometheus.io/path: "/services/grafana/metrics"
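  # NOTE: The annotations above rely on the stable/prometheus chart's default
  #       scrape configuration, which, as far as I understand it, discovers
  #       services annotated with prometheus.io/scrape and uses
  #       prometheus.io/path as the metrics path. That is how Grafana's own
  #       metrics end up in Prometheus in this setup.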
  resources:
    limits:
      cpu: 1
      memory: 1Gi
    requests:
      cpu: 50m
      memory: 100Mi
  initChownData:
    resources:
      limits:
        cpu: 100m
        memory: 128Mi
      requests:
        cpu: 25m
        memory: 64Mi

# Reference on the configuration options:
# https://github.com/helm/charts/blob/master/stable/prometheus/values.yaml
prometheus:
  fullnameOverride: prometheus
  # The actual prometheus server that polls various sources for metrics etc.
  server:
    fullnameOverride: prometheus-server
    enabled: true
    # data retention period
    retention: 3y
    # NOTE: We prefer a StatefulSet while persistence is backed by a PVC. If we
    #       used an external database, we could use a Deployment with rolling
    #       updates instead. Until then, we should shut down one pod before
    #       starting another, which a StatefulSet does by default and which a
    #       Deployment with a Recreate upgrade strategy would also do.
    statefulSet:
      enabled: true
    persistentVolume:
      enabled: true
      size: 200Gi
      storageClass: ssd
    resources:
      limits:
        cpu: 2
        memory: 12Gi
      requests:
        cpu: 50m
        # IMPORTANT: This value was lowered to 100Mi from 12Gi after the course
        #            ended to allow prometheus to run on a cheaper node.
        memory: 100Mi
  # alertmanager is meant to be able to alert using email etc. Grafana can also
  # do this by itself, at least to some degree, as I understand it.
  alertmanager:
    fullnameOverride: prometheus-alertmanager
    enabled: false
  # kube-state-metrics exports information coming from the kubernetes api-server
  # about the state of kubernetes resources. It can list the state of pods etc.
  #
  # ref: https://github.com/helm/charts/blob/master/stable/prometheus/requirements.yaml
  # ref: https://github.com/helm/charts/tree/master/stable/kube-state-metrics
  kube-state-metrics:
    fullnameOverride: prometheus-kube-state-metrics
    resources:
      limits:
        cpu: 100m
        memory: 64Mi
      requests:
        cpu: 10m
        memory: 32Mi
  kubeStateMetrics:
    enabled: true
  nodeExporter:
    fullnameOverride: prometheus-node-exporter
    enabled: true
    # NOTE: We want to be able to scrape metrics on all nodes, even GPU nodes
    #       etc.
    tolerations:
      - operator: "Exists"
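    # The toleration above, with operator "Exists" and no key, matches every
    # taint, so the node-exporter pods can be scheduled even on tainted nodes,
    # e.g. a hypothetical GPU node tainted with something like
    # nvidia.com/gpu=present:NoSchedule.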
    resources:
      limits:
        cpu: 200m
        memory: 50Mi
      requests:
        cpu: 50m
        memory: 30Mi
  # pushgateway is meant to buffer metrics pushed to it from short-lived
  # sources and expose them to prometheus on their behalf later.
  pushgateway:
    fullnameOverride: prometheus-pushgateway
    enabled: false