rbac:
|
|
create: true
|
|
|
|
podSecurityPolicy:
|
|
enabled: false
|
|
|
|
imagePullSecrets:
|
|
# - name: "image-pull-secret"
|
|
|
|
## Define serviceAccount names for components. Defaults to component's fully qualified name.
|
|
##
|
|
serviceAccounts:
|
|
alertmanager:
|
|
create: true
|
|
name:
|
|
annotations: {}
|
|
nodeExporter:
|
|
create: true
|
|
name:
|
|
annotations: {}
|
|
pushgateway:
|
|
create: true
|
|
name:
|
|
annotations: {}
|
|
kubeStateMetrics:
|
|
create: true
|
|
name:
|
|
annotations: {}
|
|
server:
|
|
create: true
|
|
name:
|
|
annotations: {}
|
|
|
|
alertmanager:
|
|
## If false, alertmanager will not be installed
|
|
##
|
|
enabled: true
|
|
|
|
## Use a ClusterRole (and ClusterRoleBinding)
|
|
## - If set to false - we define a Role and RoleBinding in the defined namespaces ONLY
|
|
## This makes alertmanager work for users who do not have ClusterAdmin privileges but want alertmanager to operate on their own namespaces instead of cluster-wide.
|
|
useClusterRole: true
|
|
|
|
## Set to a rolename to use an existing role, skipping role creation, but still creating the serviceaccount and rolebinding to the rolename set here.
|
|
useExistingRole: false
|
|
|
|
## alertmanager container name
|
|
##
|
|
name: alertmanager
|
|
|
|
## alertmanager container image
|
|
##
|
|
image:
|
|
repository: prom/alertmanager
|
|
tag: v0.21.0
|
|
pullPolicy: IfNotPresent
|
|
|
|
## alertmanager priorityClassName
|
|
##
|
|
priorityClassName: ""
|
|
|
|
## Additional alertmanager container arguments
|
|
##
|
|
extraArgs: {}
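## Example (illustrative sketch; assumes each key/value is passed to the
## container as a --key=value flag, e.g. the alertmanager --log.level flag):
# log.level: debug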
|
|
|
|
## Additional InitContainers to initialize the pod
|
|
##
|
|
extraInitContainers: []
|
|
|
|
## The URL prefix at which the container can be accessed. Useful when the '-web.external-url' includes a slug,
## so that the various internal URLs remain accessible as they are in the default case.
|
|
## (Optional)
|
|
prefixURL: ""
|
|
|
|
## External URL which can access alertmanager
|
|
baseURL: "http://localhost:9093"
|
|
|
|
## Additional alertmanager container environment variable
|
|
## For instance to add a http_proxy
|
|
##
|
|
extraEnv: {}
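## Example (illustrative sketch; assumes extraEnv is rendered as a plain
## name/value map; the proxy address is a placeholder):
# HTTP_PROXY: "http://proxy.example.com:3128"
# HTTPS_PROXY: "http://proxy.example.com:3128"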
|
|
|
|
## Additional alertmanager Secret mounts
|
|
# Defines additional mounts with secrets. Secrets must be manually created in the namespace.
|
|
extraSecretMounts: []
|
|
# - name: secret-files
|
|
# mountPath: /etc/secrets
|
|
# subPath: ""
|
|
# secretName: alertmanager-secret-files
|
|
# readOnly: true
|
|
|
|
## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.alertmanager.configMapOverrideName}}
|
|
## Defining configMapOverrideName will cause templates/alertmanager-configmap.yaml
|
|
## to NOT generate a ConfigMap resource
|
|
##
|
|
configMapOverrideName: ""
|
|
|
|
## The name of a secret in the same kubernetes namespace which contains the Alertmanager config
|
|
## Defining configFromSecret will cause templates/alertmanager-configmap.yaml
|
|
## to NOT generate a ConfigMap resource
|
|
##
|
|
configFromSecret: ""
|
|
|
|
## The configuration file name to be loaded to alertmanager
|
|
## Must match the key within configuration loaded from ConfigMap/Secret
|
|
##
|
|
configFileName: alertmanager.yml
|
|
|
|
ingress:
|
|
## If true, alertmanager Ingress will be created
|
|
##
|
|
enabled: false
|
|
|
|
## alertmanager Ingress annotations
|
|
##
|
|
annotations: {}
|
|
# kubernetes.io/ingress.class: nginx
|
|
# kubernetes.io/tls-acme: 'true'
|
|
|
|
## alertmanager Ingress additional labels
|
|
##
|
|
extraLabels: {}
|
|
|
|
## alertmanager Ingress hostnames with optional path
|
|
## Must be provided if Ingress is enabled
|
|
##
|
|
hosts: []
|
|
# - alertmanager.domain.com
|
|
# - domain.com/alertmanager
|
|
|
|
## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
|
|
extraPaths: []
|
|
# - path: /*
|
|
# backend:
|
|
# serviceName: ssl-redirect
|
|
# servicePort: use-annotation
|
|
|
|
## alertmanager Ingress TLS configuration
|
|
## Secrets must be manually created in the namespace
|
|
##
|
|
tls: []
|
|
# - secretName: prometheus-alerts-tls
|
|
# hosts:
|
|
# - alertmanager.domain.com
|
|
|
|
## Alertmanager Deployment Strategy type
|
|
# strategy:
|
|
# type: Recreate
|
|
|
|
## Node tolerations for alertmanager scheduling to nodes with taints
|
|
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
|
|
##
|
|
tolerations: []
|
|
# - key: "key"
|
|
# operator: "Equal|Exists"
|
|
# value: "value"
|
|
# effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"
|
|
|
|
## Node labels for alertmanager pod assignment
|
|
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
|
|
##
|
|
nodeSelector: {}
|
|
|
|
## Pod affinity
|
|
##
|
|
affinity: {}
|
|
|
|
## PodDisruptionBudget settings
|
|
## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
|
|
##
|
|
podDisruptionBudget:
|
|
enabled: false
|
|
maxUnavailable: 1
|
|
|
|
## Use an alternate scheduler, e.g. "stork".
|
|
## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/
|
|
##
|
|
# schedulerName:
|
|
|
|
persistentVolume:
|
|
## If true, alertmanager will create/use a Persistent Volume Claim
|
|
## If false, use emptyDir
|
|
##
|
|
enabled: true
|
|
|
|
## alertmanager data Persistent Volume access modes
|
|
## Must match those of existing PV or dynamic provisioner
|
|
## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/
|
|
##
|
|
accessModes:
|
|
- ReadWriteOnce
|
|
|
|
## alertmanager data Persistent Volume Claim annotations
|
|
##
|
|
annotations: {}
|
|
|
|
## alertmanager data Persistent Volume existing claim name
|
|
## Requires alertmanager.persistentVolume.enabled: true
|
|
## If defined, PVC must be created manually before volume will be bound
|
|
existingClaim: ""
|
|
|
|
## alertmanager data Persistent Volume mount root path
|
|
##
|
|
mountPath: /data
|
|
|
|
## alertmanager data Persistent Volume size
|
|
##
|
|
size: 2Gi
|
|
|
|
## alertmanager data Persistent Volume Storage Class
|
|
## If defined, storageClassName: <storageClass>
|
|
## If set to "-", storageClassName: "", which disables dynamic provisioning
|
|
## If undefined (the default) or set to null, no storageClassName spec is
|
|
## set, choosing the default provisioner. (gp2 on AWS, standard on
|
|
## GKE, AWS & OpenStack)
|
|
##
|
|
# storageClass: "-"
|
|
|
|
## alertmanager data Persistent Volume Binding Mode
|
|
## If defined, volumeBindingMode: <volumeBindingMode>
|
|
## If undefined (the default) or set to null, no volumeBindingMode spec is
|
|
## set, choosing the default mode.
|
|
##
|
|
# volumeBindingMode: ""
|
|
|
|
## Subdirectory of alertmanager data Persistent Volume to mount
|
|
## Useful if the volume's root directory is not empty
|
|
##
|
|
subPath: ""
|
|
|
|
## Annotations to be added to alertmanager pods
|
|
##
|
|
podAnnotations: {}
|
|
## Tell prometheus to use a specific set of alertmanager pods
|
|
## instead of all alertmanager pods found in the same namespace
|
|
## Useful if you deploy multiple releases within the same namespace
|
|
##
|
|
## prometheus.io/probe: alertmanager-teamA
|
|
|
|
## Labels to be added to Prometheus AlertManager pods
|
|
##
|
|
podLabels: {}
|
|
|
|
## Specify whether a Pod Security Policy for alertmanager must be created
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
|
|
##
|
|
podSecurityPolicy:
|
|
annotations: {}
|
|
## Specify pod annotations
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
|
|
##
|
|
# seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
|
|
# seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
|
|
# apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
|
|
|
|
## Use a StatefulSet if replicaCount needs to be greater than 1 (see below)
|
|
##
|
|
replicaCount: 1
|
|
|
|
## Annotations to be added to deployment
|
|
##
|
|
deploymentAnnotations: {}
|
|
|
|
statefulSet:
|
|
## If true, use a statefulset instead of a deployment for pod management.
|
|
## This allows scaling replicas to more than 1 pod
|
|
##
|
|
enabled: false
|
|
|
|
annotations: {}
|
|
labels: {}
|
|
podManagementPolicy: OrderedReady
|
|
|
|
## Alertmanager headless service to use for the statefulset
|
|
##
|
|
headless:
|
|
annotations: {}
|
|
labels: {}
|
|
|
|
## Enable peer mesh service endpoints for the HA Alertmanager
|
|
## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md
|
|
enableMeshPeer: false
|
|
|
|
servicePort: 80
|
|
|
|
## alertmanager resource requests and limits
|
|
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
|
|
##
|
|
resources: {}
|
|
# limits:
|
|
# cpu: 10m
|
|
# memory: 32Mi
|
|
# requests:
|
|
# cpu: 10m
|
|
# memory: 32Mi
|
|
|
|
## Security context to be added to alertmanager pods
|
|
##
|
|
securityContext:
|
|
runAsUser: 65534
|
|
runAsNonRoot: true
|
|
runAsGroup: 65534
|
|
fsGroup: 65534
|
|
|
|
service:
|
|
annotations: {}
|
|
labels: {}
|
|
clusterIP: ""
|
|
|
|
## Enable peer mesh service endpoints for the HA Alertmanager
## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md
# enableMeshPeer: true
|
|
|
|
## List of IP addresses at which the alertmanager service is available
|
|
## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
|
|
##
|
|
externalIPs: []
|
|
|
|
loadBalancerIP: ""
|
|
loadBalancerSourceRanges: []
|
|
servicePort: 80
|
|
# nodePort: 30000
|
|
sessionAffinity: None
|
|
type: ClusterIP
|
|
|
|
## Monitors ConfigMap changes and POSTs to a URL
|
|
## Ref: https://github.com/jimmidyson/configmap-reload
|
|
##
|
|
configmapReload:
|
|
prometheus:
|
|
## If false, the configmap-reload container will not be deployed
|
|
##
|
|
enabled: true
|
|
|
|
## configmap-reload container name
|
|
##
|
|
name: configmap-reload
|
|
|
|
## configmap-reload container image
|
|
##
|
|
image:
|
|
repository: jimmidyson/configmap-reload
|
|
tag: v0.4.0
|
|
pullPolicy: IfNotPresent
|
|
|
|
## Additional configmap-reload container arguments
|
|
##
|
|
extraArgs: {}
|
|
## Additional configmap-reload volume directories
|
|
##
|
|
extraVolumeDirs: []
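## Example (illustrative sketch; assumes each entry becomes an additional
## --volume-dir flag and should match a mountPath such as the
## extraConfigmapMounts example below):
# - /etc/alerts.d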
|
|
|
|
|
|
## Additional configmap-reload mounts
|
|
##
|
|
extraConfigmapMounts: []
|
|
# - name: prometheus-alerts
|
|
# mountPath: /etc/alerts.d
|
|
# subPath: ""
|
|
# configMap: prometheus-alerts
|
|
# readOnly: true
|
|
|
|
|
|
## configmap-reload resource requests and limits
|
|
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
|
|
##
|
|
resources: {}
|
|
alertmanager:
|
|
## If false, the configmap-reload container will not be deployed
|
|
##
|
|
enabled: true
|
|
|
|
## configmap-reload container name
|
|
##
|
|
name: configmap-reload
|
|
|
|
## configmap-reload container image
|
|
##
|
|
image:
|
|
repository: jimmidyson/configmap-reload
|
|
tag: v0.4.0
|
|
pullPolicy: IfNotPresent
|
|
|
|
## Additional configmap-reload container arguments
|
|
##
|
|
extraArgs: {}
|
|
## Additional configmap-reload volume directories
|
|
##
|
|
extraVolumeDirs: []
|
|
|
|
|
|
## Additional configmap-reload mounts
|
|
##
|
|
extraConfigmapMounts: []
|
|
# - name: prometheus-alerts
|
|
# mountPath: /etc/alerts.d
|
|
# subPath: ""
|
|
# configMap: prometheus-alerts
|
|
# readOnly: true
|
|
|
|
|
|
## configmap-reload resource requests and limits
|
|
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
|
|
##
|
|
resources: {}
|
|
|
|
kubeStateMetrics:
|
|
## If false, kube-state-metrics will not be installed
|
|
##
|
|
enabled: true
|
|
|
|
## kube-state-metrics container name
|
|
##
|
|
name: kube-state-metrics
|
|
|
|
image:
|
|
repository: quay.io/coreos/kube-state-metrics
|
|
tag: v1.9.7
|
|
pullPolicy: IfNotPresent
|
|
|
|
podSecurityPolicy:
|
|
annotations: {}
|
|
additionalVolumes: []
|
|
|
|
imagePullSecrets: []
|
|
# - name: "image-pull-secret"
|
|
|
|
# If set to true, this will deploy kube-state-metrics as a StatefulSet and the data
|
|
# will be automatically sharded across <.Values.replicas> pods using the built-in
|
|
# autodiscovery feature: https://github.com/kubernetes/kube-state-metrics#automated-sharding
|
|
# This is an experimental feature and there are no stability guarantees.
|
|
autosharding:
|
|
enabled: false
|
|
|
|
replicas: 1
|
|
|
|
service:
|
|
port: 8080
|
|
# Default to clusterIP for backward compatibility
|
|
type: ClusterIP
|
|
nodePort: 0
|
|
loadBalancerIP: ""
|
|
annotations: {}
|
|
|
|
customLabels: {}
|
|
|
|
hostNetwork: false
|
|
|
|
securityContext:
|
|
enabled: true
|
|
runAsGroup: 65534
|
|
runAsUser: 65534
|
|
fsGroup: 65534
|
|
|
|
## Node labels for pod assignment
|
|
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
|
|
nodeSelector: {}
|
|
|
|
## Affinity settings for pod assignment
|
|
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
|
|
affinity: {}
|
|
|
|
## Tolerations for pod assignment
|
|
## Ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
|
|
tolerations: []
|
|
|
|
# Annotations to be added to the pod
|
|
podAnnotations: {}
|
|
|
|
## Assign a PriorityClassName to pods if set
|
|
# priorityClassName: ""
|
|
|
|
# Ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/
|
|
podDisruptionBudget: {}
|
|
|
|
# Available collectors for kube-state-metrics. By default all available
|
|
# collectors are enabled.
|
|
collectors:
|
|
certificatesigningrequests: true
|
|
configmaps: true
|
|
cronjobs: true
|
|
daemonsets: true
|
|
deployments: true
|
|
endpoints: true
|
|
horizontalpodautoscalers: true
|
|
ingresses: true
|
|
jobs: true
|
|
limitranges: true
|
|
mutatingwebhookconfigurations: true
|
|
namespaces: true
|
|
networkpolicies: true
|
|
nodes: true
|
|
persistentvolumeclaims: true
|
|
persistentvolumes: true
|
|
poddisruptionbudgets: true
|
|
pods: true
|
|
replicasets: true
|
|
replicationcontrollers: true
|
|
resourcequotas: true
|
|
secrets: true
|
|
services: true
|
|
statefulsets: true
|
|
storageclasses: true
|
|
validatingwebhookconfigurations: true
|
|
verticalpodautoscalers: false
|
|
volumeattachments: true
|
|
|
|
nodeExporter:
|
|
## If false, node-exporter will not be installed
|
|
##
|
|
enabled: true
|
|
|
|
## If true, node-exporter pods share the host network namespace
|
|
##
|
|
hostNetwork: false
|
|
|
|
## If true, node-exporter pods share the host PID namespace
|
|
##
|
|
hostPID: false
|
|
|
|
## node-exporter container name
|
|
##
|
|
name: node-exporter
|
|
|
|
## node-exporter container image
|
|
##
|
|
image:
|
|
repository: prom/node-exporter
|
|
tag: v1.0.1
|
|
pullPolicy: IfNotPresent
|
|
|
|
## Specify if a Pod Security Policy for node-exporter must be created
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
|
|
##
|
|
podSecurityPolicy:
|
|
annotations: {}
|
|
## Specify pod annotations
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
|
|
##
|
|
# seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
|
|
# seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
|
|
# apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
|
|
|
|
## node-exporter priorityClassName
|
|
##
|
|
priorityClassName: ""
|
|
|
|
## Custom Update Strategy
|
|
##
|
|
updateStrategy:
|
|
type: RollingUpdate
|
|
|
|
## Additional node-exporter container arguments
|
|
##
|
|
extraArgs: {}
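## Example (illustrative sketch; assumes keys map to node_exporter flags such
## as --collector.textfile.directory; the path matches the extraHostPathMounts
## example below):
# collector.textfile.directory: /srv/txt_collector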
|
|
|
|
## Additional InitContainers to initialize the pod
|
|
##
|
|
extraInitContainers: []
|
|
|
|
## Additional node-exporter hostPath mounts
|
|
##
|
|
extraHostPathMounts: []
|
|
# - name: textfile-dir
|
|
# mountPath: /srv/txt_collector
|
|
# hostPath: /var/lib/node-exporter
|
|
# readOnly: true
|
|
# mountPropagation: HostToContainer
|
|
|
|
extraConfigmapMounts: []
|
|
# - name: certs-configmap
|
|
# mountPath: /prometheus
|
|
# configMap: certs-configmap
|
|
# readOnly: true
|
|
|
|
## Node tolerations for node-exporter scheduling to nodes with taints
|
|
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
|
|
##
|
|
tolerations: []
|
|
# - key: "key"
|
|
# operator: "Equal|Exists"
|
|
# value: "value"
|
|
# effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"
|
|
|
|
## Node labels for node-exporter pod assignment
|
|
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
|
|
##
|
|
nodeSelector: {}
|
|
|
|
## Annotations to be added to node-exporter pods
|
|
##
|
|
podAnnotations: {}
|
|
|
|
## Labels to be added to node-exporter pods
|
|
##
|
|
pod:
|
|
labels: {}
|
|
|
|
## PodDisruptionBudget settings
|
|
## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
|
|
##
|
|
podDisruptionBudget:
|
|
enabled: false
|
|
maxUnavailable: 1
|
|
|
|
## node-exporter resource limits & requests
|
|
## Ref: https://kubernetes.io/docs/user-guide/compute-resources/
|
|
##
|
|
resources: {}
|
|
# limits:
|
|
# cpu: 200m
|
|
# memory: 50Mi
|
|
# requests:
|
|
# cpu: 100m
|
|
# memory: 30Mi
|
|
|
|
## Security context to be added to node-exporter pods
|
|
##
|
|
securityContext: {}
|
|
# runAsUser: 0
|
|
|
|
service:
|
|
annotations:
|
|
prometheus.io/scrape: "true"
|
|
labels: {}
|
|
|
|
# Exposed as a headless service:
|
|
# https://kubernetes.io/docs/concepts/services-networking/service/#headless-services
|
|
clusterIP: None
|
|
|
|
## List of IP addresses at which the node-exporter service is available
|
|
## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
|
|
##
|
|
externalIPs: []
|
|
|
|
hostPort: 9100
|
|
loadBalancerIP: ""
|
|
loadBalancerSourceRanges: []
|
|
servicePort: 9100
|
|
type: ClusterIP
|
|
|
|
server:
|
|
## Prometheus server container name
|
|
##
|
|
enabled: true
|
|
|
|
## Use a ClusterRole (and ClusterRoleBinding)
|
|
## - If set to false - we define a RoleBinding in the defined namespaces ONLY
|
|
##
|
|
## NB: because we need a Role with nonResourceURLs ("/metrics"), you must have someone with cluster-admin privileges define this role for you before running with this setting enabled.
|
|
## This makes prometheus work for users who do not have ClusterAdmin privileges but want prometheus to operate on their own namespaces instead of cluster-wide.
|
|
##
|
|
## You MUST also set namespaces to the ones you have access to and want monitored by Prometheus.
|
|
##
|
|
# useExistingClusterRoleName: nameofclusterrole
|
|
|
|
## namespaces to monitor (instead of monitoring all - clusterwide). Needed if you want to run without Cluster-admin privileges.
|
|
# namespaces:
|
|
# - yournamespace
|
|
|
|
name: server
|
|
sidecarContainers:
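## Example (illustrative sketch; assumes entries are rendered verbatim as
## additional containers in the server pod; image and command are placeholders):
# - name: logging-sidecar
#   image: busybox:1.32
#   command: ["sh", "-c", "tail -f /dev/null"]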
|
|
|
|
## Prometheus server container image
|
|
##
|
|
image:
|
|
repository: prom/prometheus
|
|
tag: v2.21.0
|
|
pullPolicy: IfNotPresent
|
|
|
|
## prometheus server priorityClassName
|
|
##
|
|
priorityClassName: ""
|
|
|
|
## EnableServiceLinks indicates whether information about services should be injected
|
|
## into pod's environment variables, matching the syntax of Docker links.
|
|
## WARNING: the field is unsupported and will be skipped in K8s prior to v1.13.0.
|
|
##
|
|
enableServiceLinks: true
|
|
|
|
## The URL prefix at which the container can be accessed. Useful when the '-web.external-url' includes a slug,
## so that the various internal URLs remain accessible as they are in the default case.
|
|
## (Optional)
|
|
prefixURL: ""
|
|
|
|
## External URL at which the Prometheus server can be accessed
## May be the same as the Ingress host name
|
|
baseURL: ""
|
|
|
|
## Additional server container environment variables
|
|
##
|
|
## You specify this manually like you would a raw deployment manifest.
|
|
## This means you can bind in environment variables from secrets.
|
|
##
|
|
## e.g. static environment variable:
|
|
## - name: DEMO_GREETING
|
|
## value: "Hello from the environment"
|
|
##
|
|
## e.g. secret environment variable:
|
|
## - name: USERNAME
|
|
## valueFrom:
|
|
## secretKeyRef:
|
|
## name: mysecret
|
|
## key: username
|
|
env: []
|
|
|
|
extraFlags:
|
|
- web.enable-lifecycle
|
|
## web.enable-admin-api flag controls access to the administrative HTTP API which includes functionality such as
|
|
## deleting time series. This is disabled by default.
|
|
# - web.enable-admin-api
|
|
##
|
|
## storage.tsdb.no-lockfile flag disables the lock file in the data directory
|
|
# - storage.tsdb.no-lockfile
|
|
##
|
|
## storage.tsdb.wal-compression flag enables compression of the write-ahead log (WAL)
|
|
# - storage.tsdb.wal-compression
|
|
|
|
## Path to a configuration file on prometheus server container FS
|
|
configPath: /etc/config/prometheus.yml
|
|
|
|
global:
|
|
## How frequently to scrape targets by default
|
|
##
|
|
scrape_interval: 1m
|
|
## How long until a scrape request times out
|
|
##
|
|
scrape_timeout: 10s
|
|
## How frequently to evaluate rules
|
|
##
|
|
evaluation_interval: 1m
|
|
## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write
|
|
##
|
|
remoteWrite: []
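## Example (illustrative sketch following the Prometheus remote_write schema;
## the endpoint URL and secret path are placeholders):
# - url: https://remote-storage.example.com/api/v1/write
#   basic_auth:
#     username: prometheus
#     password_file: /etc/secrets/remote-write-password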
|
|
## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_read
|
|
##
|
|
remoteRead: []
|
|
|
|
## Additional Prometheus server container arguments
|
|
##
|
|
extraArgs: {}
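## Example (illustrative sketch; assumes each key/value is passed to the
## container as a --key=value flag, mirroring Prometheus flags such as
## --log.level and --query.timeout):
# log.level: debug
# query.timeout: 2m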
|
|
|
|
## Additional InitContainers to initialize the pod
|
|
##
|
|
extraInitContainers: []
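## Example (illustrative sketch; entries are plain Kubernetes container specs;
## the image and command here are placeholders):
# - name: init-sleep
#   image: busybox:1.32
#   command: ["sh", "-c", "echo initializing && sleep 5"]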
|
|
|
|
## Additional Prometheus server Volume mounts
|
|
##
|
|
extraVolumeMounts: []
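## Example (illustrative sketch; pairs with the extraVolumes example below):
# - name: extra-rules
#   mountPath: /etc/extra-rules
#   readOnly: true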
|
|
|
|
## Additional Prometheus server Volumes
|
|
##
|
|
extraVolumes: []
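## Example (illustrative sketch; the ConfigMap name is a placeholder and pairs
## with the extraVolumeMounts example above):
# - name: extra-rules
#   configMap:
#     name: prometheus-extra-rules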
|
|
|
|
## Additional Prometheus server hostPath mounts
|
|
##
|
|
extraHostPathMounts: []
|
|
# - name: certs-dir
|
|
# mountPath: /etc/kubernetes/certs
|
|
# subPath: ""
|
|
# hostPath: /etc/kubernetes/certs
|
|
# readOnly: true
|
|
|
|
extraConfigmapMounts: []
|
|
# - name: certs-configmap
|
|
# mountPath: /prometheus
|
|
# subPath: ""
|
|
# configMap: certs-configmap
|
|
# readOnly: true
|
|
|
|
## Additional Prometheus server Secret mounts
|
|
# Defines additional mounts with secrets. Secrets must be manually created in the namespace.
|
|
extraSecretMounts: []
|
|
# - name: secret-files
|
|
# mountPath: /etc/secrets
|
|
# subPath: ""
|
|
# secretName: prom-secret-files
|
|
# readOnly: true
|
|
|
|
## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.server.configMapOverrideName}}
|
|
## Defining configMapOverrideName will cause templates/server-configmap.yaml
|
|
## to NOT generate a ConfigMap resource
|
|
##
|
|
configMapOverrideName: ""
|
|
|
|
ingress:
|
|
## If true, Prometheus server Ingress will be created
|
|
##
|
|
enabled: false
|
|
|
|
## Prometheus server Ingress annotations
|
|
##
|
|
annotations: {}
|
|
# kubernetes.io/ingress.class: nginx
|
|
# kubernetes.io/tls-acme: 'true'
|
|
|
|
## Prometheus server Ingress additional labels
|
|
##
|
|
extraLabels: {}
|
|
|
|
## Prometheus server Ingress hostnames with optional path
|
|
## Must be provided if Ingress is enabled
|
|
##
|
|
hosts: []
|
|
# - prometheus.domain.com
|
|
# - domain.com/prometheus
|
|
|
|
## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
|
|
extraPaths: []
|
|
# - path: /*
|
|
# backend:
|
|
# serviceName: ssl-redirect
|
|
# servicePort: use-annotation
|
|
|
|
## Prometheus server Ingress TLS configuration
|
|
## Secrets must be manually created in the namespace
|
|
##
|
|
tls: []
|
|
# - secretName: prometheus-server-tls
|
|
# hosts:
|
|
# - prometheus.domain.com
|
|
|
|
## Server Deployment Strategy type
|
|
# strategy:
|
|
# type: Recreate
|
|
|
|
## hostAliases allows adding entries to /etc/hosts inside the containers
|
|
hostAliases: []
|
|
# - ip: "127.0.0.1"
|
|
# hostnames:
|
|
# - "example.com"
|
|
|
|
## Node tolerations for server scheduling to nodes with taints
|
|
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
|
|
##
|
|
tolerations: []
|
|
# - key: "key"
|
|
# operator: "Equal|Exists"
|
|
# value: "value"
|
|
# effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"
|
|
|
|
## Node labels for Prometheus server pod assignment
|
|
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
|
|
##
|
|
nodeSelector: {}
|
|
|
|
## Pod affinity
|
|
##
|
|
affinity: {}
|
|
|
|
## PodDisruptionBudget settings
|
|
## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
|
|
##
|
|
podDisruptionBudget:
|
|
enabled: false
|
|
maxUnavailable: 1
|
|
|
|
## Use an alternate scheduler, e.g. "stork".
|
|
## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/
|
|
##
|
|
# schedulerName:
|
|
|
|
persistentVolume:
|
|
## If true, Prometheus server will create/use a Persistent Volume Claim
|
|
## If false, use emptyDir
|
|
##
|
|
enabled: true
|
|
|
|
## Prometheus server data Persistent Volume access modes
|
|
## Must match those of existing PV or dynamic provisioner
|
|
## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/
|
|
##
|
|
accessModes:
|
|
- ReadWriteOnce
|
|
|
|
## Prometheus server data Persistent Volume annotations
|
|
##
|
|
annotations: {}
|
|
|
|
## Prometheus server data Persistent Volume existing claim name
|
|
## Requires server.persistentVolume.enabled: true
|
|
## If defined, PVC must be created manually before volume will be bound
|
|
existingClaim: ""
|
|
|
|
## Prometheus server data Persistent Volume mount root path
|
|
##
|
|
mountPath: /data
|
|
|
|
## Prometheus server data Persistent Volume size
|
|
##
|
|
size: 8Gi
|
|
|
|
## Prometheus server data Persistent Volume Storage Class
|
|
## If defined, storageClassName: <storageClass>
|
|
## If set to "-", storageClassName: "", which disables dynamic provisioning
|
|
## If undefined (the default) or set to null, no storageClassName spec is
|
|
## set, choosing the default provisioner. (gp2 on AWS, standard on
|
|
## GKE, AWS & OpenStack)
|
|
##
|
|
# storageClass: "-"
|
|
|
|
## Prometheus server data Persistent Volume Binding Mode
|
|
## If defined, volumeBindingMode: <volumeBindingMode>
|
|
## If undefined (the default) or set to null, no volumeBindingMode spec is
|
|
## set, choosing the default mode.
|
|
##
|
|
# volumeBindingMode: ""
|
|
|
|
## Subdirectory of Prometheus server data Persistent Volume to mount
|
|
## Useful if the volume's root directory is not empty
|
|
##
|
|
subPath: ""
|
|
|
|
emptyDir:
|
|
sizeLimit: ""
|
|
|
|
## Annotations to be added to Prometheus server pods
|
|
##
|
|
podAnnotations: {}
|
|
# iam.amazonaws.com/role: prometheus
|
|
|
|
## Labels to be added to Prometheus server pods
|
|
##
|
|
podLabels: {}
|
|
|
|
## Prometheus AlertManager configuration
|
|
##
|
|
alertmanagers: []
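## Example (illustrative sketch; assumes the list is inserted as Prometheus
## alerting.alertmanagers configuration; the target below is a placeholder for
## your release's Alertmanager service):
# - static_configs:
#     - targets:
#         - my-release-prometheus-alertmanager:80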
|
|
|
|
## Specify whether a Pod Security Policy for the Prometheus server must be created
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
|
|
##
|
|
podSecurityPolicy:
|
|
annotations: {}
|
|
## Specify pod annotations
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
|
|
##
|
|
# seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
|
|
# seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
|
|
# apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
|
|
|
|
## Use a StatefulSet if replicaCount needs to be greater than 1 (see below)
|
|
##
|
|
replicaCount: 1
|
|
|
|
## Annotations to be added to deployment
|
|
##
|
|
deploymentAnnotations: {}
|
|
|
|
statefulSet:
|
|
## If true, use a statefulset instead of a deployment for pod management.
|
|
## This allows scaling replicas to more than 1 pod
|
|
##
|
|
enabled: false
|
|
|
|
annotations: {}
|
|
labels: {}
|
|
podManagementPolicy: OrderedReady
|
|
|
|
## Prometheus server headless service to use for the statefulset
|
|
##
|
|
headless:
|
|
annotations: {}
|
|
labels: {}
|
|
servicePort: 80
|
|
## Enable gRPC port on service to allow auto discovery with thanos-querier
|
|
gRPC:
|
|
enabled: false
|
|
servicePort: 10901
|
|
# nodePort: 10901
|
|
|
|
## Prometheus server readiness and liveness probe initial delay and timeout
|
|
## Ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/
|
|
##
|
|
readinessProbeInitialDelay: 30
|
|
readinessProbePeriodSeconds: 5
|
|
readinessProbeTimeout: 30
|
|
readinessProbeFailureThreshold: 3
|
|
readinessProbeSuccessThreshold: 1
|
|
livenessProbeInitialDelay: 30
|
|
livenessProbePeriodSeconds: 15
|
|
livenessProbeTimeout: 30
|
|
livenessProbeFailureThreshold: 3
|
|
livenessProbeSuccessThreshold: 1
|
|
|
|
## Prometheus server resource requests and limits
|
|
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
|
|
##
|
|
resources: {}
|
|
# limits:
|
|
# cpu: 500m
|
|
# memory: 512Mi
|
|
# requests:
|
|
# cpu: 500m
|
|
# memory: 512Mi
|
|
|
|
## Vertical Pod Autoscaler config
|
|
## Ref: https://github.com/kubernetes/autoscaler/tree/master/vertical-pod-autoscaler
|
|
verticalAutoscaler:
|
|
## If true, a VPA object will be created for the controller (either StatefulSet or Deployment, based on the above configs)
|
|
enabled: false
|
|
# updateMode: "Auto"
|
|
# containerPolicies:
|
|
# - containerName: 'prometheus-server'
|
|
|
|
## Security context to be added to server pods
|
|
##
|
|
securityContext:
|
|
runAsUser: 65534
|
|
runAsNonRoot: true
|
|
runAsGroup: 65534
|
|
fsGroup: 65534
|
|
|
|
service:
|
|
annotations: {}
|
|
labels: {}
|
|
clusterIP: ""
|
|
|
|
## List of IP addresses at which the Prometheus server service is available
|
|
## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
|
|
##
|
|
externalIPs: []
|
|
|
|
loadBalancerIP: ""
|
|
loadBalancerSourceRanges: []
|
|
servicePort: 9090
|
|
sessionAffinity: None
|
|
type: ClusterIP
|
|
|
|
## Enable gRPC port on service to allow auto discovery with thanos-querier
|
|
gRPC:
|
|
enabled: false
|
|
servicePort: 10901
|
|
# nodePort: 10901
|
|
|
|
## If using a statefulSet (statefulSet.enabled=true), configure the
|
|
## service to connect to a specific replica to have a consistent view
|
|
## of the data.
|
|
statefulsetReplica:
|
|
enabled: false
|
|
replica: 0
|
|
|
|
## Prometheus server pod termination grace period
|
|
##
|
|
terminationGracePeriodSeconds: 300
|
|
|
|
## Prometheus data retention period (default if not specified is 15 days)
|
|
##
|
|
retention: "15d"
|
|
|
|
pushgateway:
|
|
## If false, pushgateway will not be installed
|
|
##
|
|
enabled: true
|
|
|
|
## Use an alternate scheduler, e.g. "stork".
|
|
## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/
|
|
##
|
|
# schedulerName:
|
|
|
|
## pushgateway container name
|
|
##
|
|
name: pushgateway
|
|
|
|
## pushgateway container image
|
|
##
|
|
image:
|
|
repository: prom/pushgateway
|
|
tag: v1.2.0
|
|
pullPolicy: IfNotPresent
|
|
|
|
## pushgateway priorityClassName
|
|
##
|
|
priorityClassName: ""
|
|
|
|
## Additional pushgateway container arguments
|
|
##
|
|
## for example: persistence.file: /data/pushgateway.data
|
|
extraArgs: {}
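## Example (illustrative sketch; assumes each key/value is passed as a
## --key=value flag, enabling on-disk persistence together with
## pushgateway.persistentVolume.enabled):
# persistence.file: /data/pushgateway.data
# persistence.interval: 5m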
|
|
|
|
## Additional InitContainers to initialize the pod
|
|
##
|
|
extraInitContainers: []
|
|
|
|
ingress:
|
|
## If true, pushgateway Ingress will be created
|
|
##
|
|
enabled: false
|
|
|
|
## pushgateway Ingress annotations
|
|
##
|
|
annotations: {}
|
|
# kubernetes.io/ingress.class: nginx
|
|
# kubernetes.io/tls-acme: 'true'
|
|
|
|
## pushgateway Ingress hostnames with optional path
|
|
## Must be provided if Ingress is enabled
|
|
##
|
|
hosts: []
|
|
# - pushgateway.domain.com
|
|
# - domain.com/pushgateway
|
|
|
|
## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
|
|
extraPaths: []
|
|
# - path: /*
|
|
# backend:
|
|
# serviceName: ssl-redirect
|
|
# servicePort: use-annotation
|
|
|
|
## pushgateway Ingress TLS configuration
|
|
## Secrets must be manually created in the namespace
|
|
##
|
|
tls: []
|
|
# - secretName: prometheus-alerts-tls
|
|
# hosts:
|
|
# - pushgateway.domain.com
|
|
|
|
## Node tolerations for pushgateway scheduling to nodes with taints
|
|
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
|
|
##
|
|
tolerations: []
|
|
# - key: "key"
|
|
# operator: "Equal|Exists"
|
|
# value: "value"
|
|
# effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"
|
|
|
|
## Node labels for pushgateway pod assignment
|
|
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
|
|
##
|
|
nodeSelector: {}
|
|
|
|
## Annotations to be added to pushgateway pods
|
|
##
|
|
podAnnotations: {}
|
|
|
|
## Labels to be added to pushgateway pods
|
|
##
|
|
podLabels: {}
|
|
|
|
## Specify whether a Pod Security Policy for pushgateway must be created
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
|
|
##
|
|
podSecurityPolicy:
|
|
annotations: {}
|
|
## Specify pod annotations
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
|
|
##
|
|
# seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
|
|
# seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
|
|
# apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
|
|
|
|
replicaCount: 1
|
|
|
|
## Annotations to be added to deployment
|
|
##
|
|
deploymentAnnotations: {}
|
|
|
|
## PodDisruptionBudget settings
|
|
## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
|
|
##
|
|
podDisruptionBudget:
|
|
enabled: false
|
|
maxUnavailable: 1
|
|
|
|
## pushgateway resource requests and limits
|
|
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
|
|
##
|
|
resources: {}
|
|
# limits:
|
|
# cpu: 10m
|
|
# memory: 32Mi
|
|
# requests:
|
|
# cpu: 10m
|
|
# memory: 32Mi
|
|
|
|
## Security context to be added to push-gateway pods
|
|
##
|
|
securityContext:
|
|
runAsUser: 65534
|
|
runAsNonRoot: true
|
|
|
|
service:
|
|
annotations:
|
|
prometheus.io/probe: pushgateway
|
|
labels: {}
|
|
clusterIP: ""
|
|
|
|
## List of IP addresses at which the pushgateway service is available
|
|
## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
|
|
##
|
|
externalIPs: []
|
|
|
|
loadBalancerIP: ""
|
|
loadBalancerSourceRanges: []
|
|
servicePort: 9091
|
|
type: ClusterIP
|
|
|
|
## pushgateway Deployment Strategy type
|
|
# strategy:
|
|
# type: Recreate
|
|
|
|
persistentVolume:
|
|
## If true, pushgateway will create/use a Persistent Volume Claim
|
|
## If false, use emptyDir
|
|
##
|
|
enabled: false
|
|
|
|
## pushgateway data Persistent Volume access modes
|
|
## Must match those of existing PV or dynamic provisioner
|
|
## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/
|
|
##
|
|
accessModes:
|
|
- ReadWriteOnce
|
|
|
|
## pushgateway data Persistent Volume Claim annotations
|
|
##
|
|
annotations: {}
|
|
|
|
## pushgateway data Persistent Volume existing claim name
|
|
## Requires pushgateway.persistentVolume.enabled: true
|
|
## If defined, PVC must be created manually before volume will be bound
|
|
existingClaim: ""
|
|
|
|
## pushgateway data Persistent Volume mount root path
|
|
##
|
|
mountPath: /data
|
|
|
|
## pushgateway data Persistent Volume size
|
|
##
|
|
size: 2Gi
|
|
|
|
## pushgateway data Persistent Volume Storage Class
|
|
## If defined, storageClassName: <storageClass>
|
|
## If set to "-", storageClassName: "", which disables dynamic provisioning
|
|
## If undefined (the default) or set to null, no storageClassName spec is
|
|
## set, choosing the default provisioner. (gp2 on AWS, standard on
|
|
## GKE, AWS & OpenStack)
|
|
##
|
|
# storageClass: "-"
|
|
|
|
## pushgateway data Persistent Volume Binding Mode
|
|
## If defined, volumeBindingMode: <volumeBindingMode>
|
|
## If undefined (the default) or set to null, no volumeBindingMode spec is
|
|
## set, choosing the default mode.
|
|
##
|
|
# volumeBindingMode: ""
|
|
|
|
## Subdirectory of pushgateway data Persistent Volume to mount
|
|
## Useful if the volume's root directory is not empty
|
|
##
|
|
subPath: ""
|
|
|
|
|
|
## alertmanager ConfigMap entries
|
|
##
|
|
alertmanagerFiles:
|
|
alertmanager.yml:
|
|
global:
|
|
resolve_timeout: 30s
|
|
route:
|
|
group_by: ["alertname"]
|
|
group_wait: 5s
|
|
group_interval: 10s
|
|
repeat_interval: 999h
|
|
receiver: "default"
|
|
routes:
|
|
- receiver: "default"
|
|
group_by: []
|
|
match_re:
|
|
alertname: .*
|
|
continue: true
|
|
- receiver: "watchdog"
|
|
group_by: ["alertname", "instance"]
|
|
match_re:
|
|
alertname: Watchdog
|
|
continue: false
|
|
- receiver: "by-cluster-service"
|
|
group_by: ["alertname", "cluster", "service"]
|
|
match_re:
|
|
alertname: .*
|
|
continue: true
|
|
- receiver: "by-name"
|
|
group_by: [alertname]
|
|
match_re:
|
|
alertname: .*
|
|
continue: true
|
|
- receiver: "by-cluster"
|
|
group_by: [cluster]
|
|
match_re:
|
|
alertname: .*
|
|
continue: true
|
|
|
|
inhibit_rules:
|
|
- source_match:
|
|
severity: "critical"
|
|
target_match:
|
|
severity: "warning"
|
|
# Apply inhibition if the alertname and cluster are the same in both
|
|
equal: ["alertname", "cluster"]
|
|
|
|
receivers:
|
|
- name: "default"
|
|
- name: "watchdog"
|
|
- name: "by-cluster-service"
|
|
- name: "by-name"
|
|
- name: "by-cluster"
|
|
|
|
## Prometheus server ConfigMap entries
|
|
##
|
|
serverFiles:
|
|
|
|
## Alerts configuration
|
|
## Ref: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
|
|
|
|
#alerting_rules.yml: {}
|
|
alerting_rules.yml:
|
|
groups:
|
|
- name: prometheus
|
|
rules:
|
|
- alert: PrometheusJobMissing
|
|
expr: absent(up{job="prometheus"})
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus job missing (instance {{ $labels.instance }})"
|
|
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTargetMissing
|
|
expr: up == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus target missing (instance {{ $labels.instance }})"
|
|
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusAllTargetsMissing
|
|
expr: count by (job) (up) == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus all targets missing (instance {{ $labels.instance }})"
|
|
description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusConfigurationReloadFailure
|
|
expr: prometheus_config_last_reload_successful != 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})"
|
|
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTooManyRestarts
|
|
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
|
|
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusAlertmanagerConfigurationReloadFailure
|
|
expr: alertmanager_config_last_reload_successful != 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})"
|
|
description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusAlertmanagerConfigNotSynced
|
|
expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus AlertManager config not synced (instance {{ $labels.instance }})"
|
|
description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusNotConnectedToAlertmanager
|
|
expr: prometheus_notifications_alertmanagers_discovered < 1
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
|
|
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusRuleEvaluationFailures
|
|
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTemplateTextExpansionFailures
|
|
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusRuleEvaluationSlow
|
|
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus rule evaluation slow (instance {{ $labels.instance }})"
|
|
description: "Prometheus rule evaluation took more time than the scheduled interval. I indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusNotificationsBacklog
|
|
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
|
|
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusAlertmanagerNotificationFailing
|
|
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
|
|
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTargetEmpty
|
|
expr: prometheus_sd_discovered_targets == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus target empty (instance {{ $labels.instance }})"
|
|
description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTargetScrapingSlow
|
|
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
|
|
description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusLargeScrape
|
|
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus large scrape (instance {{ $labels.instance }})"
|
|
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTargetScrapeDuplicate
|
|
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus target scrape duplicate (instance {{ $labels.instance }})"
|
|
description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTsdbCheckpointCreationFailures
|
|
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTsdbCheckpointDeletionFailures
|
|
expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTsdbCompactionsFailed
|
|
expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTsdbHeadTruncationsFailed
|
|
expr: increase(prometheus_tsdb_head_truncations_failed_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTsdbReloadFailures
|
|
expr: increase(prometheus_tsdb_reloads_failures_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus TSDB reload failures (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTsdbWalCorruptions
|
|
expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTsdbWalTruncationsFailed
|
|
expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
|
|
- name: node-exporter
|
|
rules:
|
|
- alert: HostOutOfMemory
|
|
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host out of memory (instance {{ $labels.instance }})"
|
|
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostMemoryUnderMemoryPressure
|
|
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
|
|
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostUnusualNetworkThroughputIn
|
|
expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
|
|
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostUnusualNetworkThroughputOut
|
|
expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
|
|
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostUnusualDiskReadRate
|
|
expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host unusual disk read rate (instance {{ $labels.instance }})"
|
|
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostUnusualDiskWriteRate
|
|
expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host unusual disk write rate (instance {{ $labels.instance }})"
|
|
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
# Please add ignored mount points to the node_exporter parameters, e.g.:
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)"
|
|
- alert: HostOutOfDiskSpace
|
|
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host out of disk space (instance {{ $labels.instance }})"
|
|
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostDiskWillFillIn4Hours
|
|
expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
|
|
description: "Disk will fill in 4 hours at current write rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostOutOfInodes
|
|
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint ="/rootfs"} * 100 < 10
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host out of inodes (instance {{ $labels.instance }})"
|
|
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostUnusualDiskReadLatency
|
|
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host unusual disk read latency (instance {{ $labels.instance }})"
|
|
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostUnusualDiskWriteLatency
|
|
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host unusual disk write latency (instance {{ $labels.instance }})"
|
|
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostHighCpuLoad
|
|
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host high CPU load (instance {{ $labels.instance }})"
|
|
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
# 1000 context switches is an arbitrary number.
|
|
# Alert threshold depends on nature of application.
|
|
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
|
|
- alert: HostContextSwitching
|
|
expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 5000
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host context switching (instance {{ $labels.instance }})"
|
|
description: "Context switching is growing on node (> 5000 / s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostSwapIsFillingUp
|
|
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host swap is filling up (instance {{ $labels.instance }})"
|
|
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostSystemdServiceCrashed
|
|
expr: node_systemd_unit_state{state="failed"} == 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host SystemD service crashed (instance {{ $labels.instance }})"
|
|
description: "SystemD service crashed\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
|
|
|
|
- alert: HostKernelVersionDeviations
|
|
expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host kernel version deviations (instance {{ $labels.instance }})"
|
|
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostOomKillDetected
|
|
expr: increase(node_vmstat_oom_kill[5m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host OOM kill detected (instance {{ $labels.instance }})"
|
|
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
|
|
- name: cadvisor
|
|
rules:
|
|
- alert: ContainerKilled
|
|
expr: time() - container_last_seen > 60
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container killed (instance {{ $labels.instance }})"
|
|
description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
# cAdvisor can sometimes consume a lot of CPU, so this alert will fire constantly.
|
|
# If you want to exclude it from this alert, just use: container_cpu_usage_seconds_total{name!=""}
|
|
- alert: ContainerCpuUsage
|
|
expr: (sum(rate(container_cpu_usage_seconds_total{image!=""}[3m])) BY (instance, name) * 100) > 80
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container CPU usage (instance {{ $labels.instance }})"
|
|
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
# See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
|
|
- alert: ContainerMemoryUsage
|
|
expr: (sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container Memory usage (instance {{ $labels.instance }})"
|
|
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: ContainerVolumeUsage
|
|
expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container Volume usage (instance {{ $labels.instance }})"
|
|
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: ContainerVolumeIoUsage
|
|
expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container Volume IO usage (instance {{ $labels.instance }})"
|
|
description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: ContainerHighThrottleRate
|
|
expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container high throttle rate (instance {{ $labels.instance }})"
|
|
description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- name: k8s.rules
|
|
rules:
|
|
- expr: |
|
|
sum(rate(container_cpu_usage_seconds_total{image!="", container!="POD",namespace!=""}[5m])) by (namespace)
|
|
record: namespace:container_cpu_usage_seconds_total:sum_rate
|
|
- expr: |
|
|
sum by (namespace, pod, container) (
|
|
rate(container_cpu_usage_seconds_total{image!="", container!="POD",namespace!=""}[5m])
|
|
)
|
|
record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
|
|
- expr: |
|
|
sum(container_memory_usage_bytes{image!="", container!="POD",namespace!=""}) by (namespace)
|
|
record: namespace:container_memory_usage_bytes:sum
|
|
- expr: |
|
|
sum by (namespace, label_name) (
|
|
sum(rate(container_cpu_usage_seconds_total{image!="", container!="POD"}[5m])) by (namespace, pod)
|
|
* on (namespace, pod) group_left(label_name)
|
|
label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
|
|
)
|
|
record: namespace_name:container_cpu_usage_seconds_total:sum_rate
|
|
- expr: |
|
|
sum by (namespace, label_name) (
|
|
sum(container_memory_usage_bytes{image!="", container!="POD"}) by (pod, namespace)
|
|
* on (namespace, pod) group_left(label_name)
|
|
label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
|
|
)
|
|
record: namespace_name:container_memory_usage_bytes:sum
|
|
- expr: |
|
|
sum by (namespace, label_name) (
|
|
sum(kube_pod_container_resource_requests_memory_bytes{component="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
|
|
* on (namespace, pod) group_left(label_name)
|
|
label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
|
|
)
|
|
record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
|
|
- expr: |
|
|
sum by (namespace, label_name) (
|
|
sum(kube_pod_container_resource_requests_cpu_cores{component="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
|
|
* on (namespace, pod) group_left(label_name)
|
|
label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
|
|
)
|
|
record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
|
|
- expr: |
|
|
sum(
|
|
label_replace(
|
|
label_replace(
|
|
kube_pod_owner{component="kube-state-metrics", owner_kind="ReplicaSet"},
|
|
"replicaset", "$1", "owner_name", "(.*)"
|
|
) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{component="kube-state-metrics"},
|
|
"workload", "$1", "owner_name", "(.*)"
|
|
)
|
|
) by (namespace, workload, pod)
|
|
labels:
|
|
workload_type: deployment
|
|
record: mixin_pod_workload
|
|
- expr: |
|
|
sum(
|
|
label_replace(
|
|
kube_pod_owner{component="kube-state-metrics", owner_kind="DaemonSet"},
|
|
"workload", "$1", "owner_name", "(.*)"
|
|
)
|
|
) by (namespace, workload, pod)
|
|
labels:
|
|
workload_type: daemonset
|
|
record: mixin_pod_workload
|
|
- expr: |
|
|
sum(
|
|
label_replace(
|
|
kube_pod_owner{component="kube-state-metrics", owner_kind="StatefulSet"},
|
|
"workload", "$1", "owner_name", "(.*)"
|
|
)
|
|
) by (namespace, workload, pod)
|
|
labels:
|
|
workload_type: statefulset
|
|
record: mixin_pod_workload
|
|
|
|
- name: kube-scheduler.rules
|
|
rules:
|
|
- expr: |
|
|
histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.99"
|
|
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.99"
|
|
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.99"
|
|
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.9"
|
|
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.9"
|
|
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.9"
|
|
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.5"
|
|
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.5"
|
|
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.5"
|
|
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
|
|
|
|
- name: kube-apiserver.rules
|
|
rules:
|
|
- expr: |
|
|
histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.99"
|
|
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.9"
|
|
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.5"
|
|
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
|
|
|
|
- name: node.rules
|
|
rules:
|
|
- expr: sum(min(kube_pod_info) by (node))
|
|
record: ':kube_pod_info_node_count:'
|
|
- expr: |
|
|
max(label_replace(kube_pod_info{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
|
|
record: 'node_namespace_pod:kube_pod_info:'
|
|
- expr: |
|
|
count by (node) (sum by (node, cpu) (
|
|
node_cpu_seconds_total{component="node-exporter"}
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
))
|
|
record: node:node_num_cpu:sum
|
|
- expr: |
|
|
1 - avg(rate(node_cpu_seconds_total{component="node-exporter",mode="idle"}[1m]))
|
|
record: :node_cpu_utilization:avg1m
|
|
- expr: |
|
|
1 - avg by (node) (
|
|
rate(node_cpu_seconds_total{component="node-exporter",mode="idle"}[1m])
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:)
|
|
record: node:node_cpu_utilization:avg1m
|
|
- expr: |
|
|
node:node_cpu_utilization:avg1m
|
|
*
|
|
node:node_num_cpu:sum
|
|
/
|
|
scalar(sum(node:node_num_cpu:sum))
|
|
record: node:cluster_cpu_utilization:ratio
|
|
- expr: |
|
|
sum(node_load1{component="node-exporter"})
|
|
/
|
|
sum(node:node_num_cpu:sum)
|
|
record: ':node_cpu_saturation_load1:'
|
|
- expr: |
|
|
sum by (node) (
|
|
node_load1{component="node-exporter"}
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
/
|
|
node:node_num_cpu:sum
|
|
record: 'node:node_cpu_saturation_load1:'
|
|
- expr: |
|
|
1 -
|
|
sum(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"})
|
|
/
|
|
sum(node_memory_MemTotal_bytes{component="node-exporter"})
|
|
record: ':node_memory_utilization:'
|
|
- expr: |
|
|
sum(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"})
|
|
record: :node_memory_MemFreeCachedBuffers_bytes:sum
|
|
- expr: |
|
|
sum(node_memory_MemTotal_bytes{component="node-exporter"})
|
|
record: :node_memory_MemTotal_bytes:sum
|
|
- expr: |
|
|
sum by (node) (
|
|
(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"})
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: node:node_memory_bytes_available:sum
|
|
- expr: |
|
|
sum by (node) (
|
|
node_memory_MemTotal_bytes{component="node-exporter"}
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: node:node_memory_bytes_total:sum
|
|
- expr: |
|
|
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
|
|
/
|
|
node:node_memory_bytes_total:sum
|
|
record: node:node_memory_utilization:ratio
|
|
- expr: |
|
|
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
|
|
/
|
|
scalar(sum(node:node_memory_bytes_total:sum))
|
|
record: node:cluster_memory_utilization:ratio
|
|
- expr: |
|
|
1e3 * sum(
|
|
(rate(node_vmstat_pgpgin{component="node-exporter"}[1m])
|
|
+ rate(node_vmstat_pgpgout{component="node-exporter"}[1m]))
|
|
)
|
|
record: :node_memory_swap_io_bytes:sum_rate
|
|
- expr: |
|
|
1 -
|
|
sum by (node) (
|
|
(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"})
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
/
|
|
sum by (node) (
|
|
node_memory_MemTotal_bytes{component="node-exporter"}
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: 'node:node_memory_utilization:'
|
|
- expr: |
|
|
1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
|
|
record: 'node:node_memory_utilization_2:'
|
|
- expr: |
|
|
1e3 * sum by (node) (
|
|
(rate(node_vmstat_pgpgin{component="node-exporter"}[1m])
|
|
+ rate(node_vmstat_pgpgout{component="node-exporter"}[1m]))
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: node:node_memory_swap_io_bytes:sum_rate
|
|
- expr: |
|
|
avg(irate(node_disk_io_time_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
|
|
record: :node_disk_utilization:avg_irate
|
|
- expr: |
|
|
avg by (node) (
|
|
irate(node_disk_io_time_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: node:node_disk_utilization:avg_irate
|
|
- expr: |
|
|
avg(irate(node_disk_io_time_weighted_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
|
|
record: :node_disk_saturation:avg_irate
|
|
- expr: |
|
|
avg by (node) (
|
|
irate(node_disk_io_time_weighted_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: node:node_disk_saturation:avg_irate
|
|
- expr: |
|
|
max by (namespace, nodename, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
|
|
- node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
|
/ node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
|
record: 'node:node_filesystem_usage:'
|
|
- expr: |
|
|
max by (namespace, nodename, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
|
record: 'node:node_filesystem_avail:'
|
|
- expr: |
|
|
sum(irate(node_network_receive_bytes_total{component="node-exporter",device!~"veth.+"}[1m])) +
|
|
sum(irate(node_network_transmit_bytes_total{component="node-exporter",device!~"veth.+"}[1m]))
|
|
record: :node_net_utilization:sum_irate
|
|
- expr: |
|
|
sum by (node) (
|
|
(irate(node_network_receive_bytes_total{component="node-exporter",device!~"veth.+"}[1m]) +
|
|
irate(node_network_transmit_bytes_total{component="node-exporter",device!~"veth.+"}[1m]))
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: node:node_net_utilization:sum_irate
|
|
- expr: |
|
|
sum(irate(node_network_receive_drop_total{component="node-exporter",device!~"veth.+"}[1m])) +
|
|
sum(irate(node_network_transmit_drop_total{component="node-exporter",device!~"veth.+"}[1m]))
|
|
record: :node_net_saturation:sum_irate
|
|
- expr: |
|
|
sum by (node) (
|
|
(irate(node_network_receive_drop_total{component="node-exporter",device!~"veth.+"}[1m]) +
|
|
irate(node_network_transmit_drop_total{component="node-exporter",device!~"veth.+"}[1m]))
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: node:node_net_saturation:sum_irate
|
|
- expr: |
|
|
max(
|
|
max(
|
|
kube_pod_info{component="kube-state-metrics", host_ip!=""}
|
|
) by (node, host_ip)
|
|
* on (host_ip) group_right (node)
|
|
label_replace(
|
|
(max(node_filesystem_files{component="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
|
|
)
|
|
) by (node)
|
|
record: 'node:node_inodes_total:'
|
|
- expr: |
|
|
max(
|
|
max(
|
|
kube_pod_info{component="kube-state-metrics", host_ip!=""}
|
|
) by (node, host_ip)
|
|
* on (host_ip) group_right (node)
|
|
label_replace(
|
|
(max(node_filesystem_files_free{component="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
|
|
)
|
|
) by (node)
|
|
record: 'node:node_inodes_free:'
|
|
- name: cluster.rules
|
|
rules:
|
|
# Total number of CPU cores in the cluster.
|
|
- expr: |
|
|
sum(node:node_num_cpu:sum)
|
|
record: cluster:cpu_total
|
|
# Cluster-wide CPU usage rate in percent.
|
|
- expr: |
|
|
sum(node:cluster_cpu_utilization:ratio * 100)
|
|
record: cluster:cpu_usage_rate
|
|
# Cluster-wide total RAM in bytes.
|
|
- expr: |
|
|
sum(node:node_memory_bytes_total:sum)
|
|
record: cluster:memory_total_bytes
|
|
# Cluster-wide RAM usage in bytes.
|
|
- expr: |
|
|
sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum)
|
|
record: cluster:memory_usage_bytes
|
|
# Cluster-wide RAM usage rate in percent.
|
|
- expr: |
|
|
(sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum)) / scalar(sum(node:node_memory_bytes_total:sum)) * 100
|
|
record: cluster:memory_usage_rate
|
|
- name: kube-prometheus-node-recording.rules
|
|
rules:
|
|
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
|
|
(instance)
|
|
record: instance:node_cpu:rate:sum
|
|
- expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
|
|
BY (instance)
|
|
record: instance:node_filesystem_usage:sum
|
|
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
|
|
record: instance:node_network_receive_bytes:rate:sum
|
|
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
|
|
record: instance:node_network_transmit_bytes:rate:sum
|
|
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT
|
|
(cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
|
|
BY (instance, cpu)) BY (instance)
|
|
record: instance:node_cpu:ratio
|
|
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
|
|
record: cluster:node_cpu:sum_rate5m
|
|
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total)
|
|
BY (instance, cpu))
|
|
record: cluster:node_cpu:ratio
|
|
- name: kubernetes-absent
|
|
rules:
|
|
- alert: KubeAPIDown
|
|
annotations:
|
|
message: KubeAPI has disappeared from Prometheus target discovery.
|
|
expr: |
|
|
absent(up{job="kubernetes-apiservers"} == 1)
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
|
|
- alert: NodeExporterDown
|
|
annotations:
|
|
message: NodeExporter has disappeared from Prometheus target discovery.
|
|
expr: |
|
|
absent(up{component="node-exporter"} == 1)
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: PrometheusDown
|
|
annotations:
|
|
message: Prometheus has disappeared from Prometheus target discovery.
|
|
expr: |
|
|
absent(up{job="prometheus"} == 1)
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: cAdvisorDown
|
|
annotations:
|
|
message: cAdvisor has disappeared from Prometheus target discovery.
|
|
expr: |
|
|
absent(up{job="kubernetes-nodes-cadvisor"} == 1)
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- name: kubernetes-apps
|
|
rules:
|
|
- alert: KubePodCrashLooping
|
|
annotations:
|
|
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
|
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
|
|
expr: |
|
|
rate(kube_pod_container_status_restarts_total{component="kube-state-metrics"}[15m]) * 60 * 5 > 0
|
|
for: 1h
|
|
labels:
|
|
severity: critical
|
|
- alert: KubePodNotReady
|
|
annotations:
|
|
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
|
state for longer than an hour.
|
|
expr: |
|
|
sum by (namespace, pod) (kube_pod_status_phase{component="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
|
|
for: 1h
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeDeploymentGenerationMismatch
|
|
annotations:
|
|
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
|
}} does not match; this indicates that the Deployment has failed but has
|
|
not been rolled back.
|
|
expr: |
|
|
kube_deployment_status_observed_generation{component="kube-state-metrics"}
|
|
!=
|
|
kube_deployment_metadata_generation{component="kube-state-metrics"}
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeDeploymentReplicasMismatch
|
|
annotations:
|
|
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
|
|
matched the expected number of replicas for longer than an hour.
|
|
expr: |
|
|
kube_deployment_spec_replicas{component="kube-state-metrics"}
|
|
!=
|
|
kube_deployment_status_replicas_available{component="kube-state-metrics"}
|
|
for: 1h
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeStatefulSetReplicasMismatch
|
|
annotations:
|
|
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
|
|
not matched the expected number of replicas for longer than 15 minutes.
|
|
expr: |
|
|
kube_statefulset_status_replicas_ready{component="kube-state-metrics"}
|
|
!=
|
|
kube_statefulset_status_replicas{component="kube-state-metrics"}
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeStatefulSetGenerationMismatch
|
|
annotations:
|
|
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
|
}} does not match; this indicates that the StatefulSet has failed but has
|
|
not been rolled back.
|
|
expr: |
|
|
kube_statefulset_status_observed_generation{component="kube-state-metrics"}
|
|
!=
|
|
kube_statefulset_metadata_generation{component="kube-state-metrics"}
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeStatefulSetUpdateNotRolledOut
|
|
annotations:
|
|
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
|
|
has not been rolled out.
|
|
expr: |
|
|
max without (revision) (
|
|
kube_statefulset_status_current_revision{component="kube-state-metrics"}
|
|
unless
|
|
kube_statefulset_status_update_revision{component="kube-state-metrics"}
|
|
)
|
|
*
|
|
(
|
|
kube_statefulset_replicas{component="kube-state-metrics"}
|
|
!=
|
|
kube_statefulset_status_replicas_updated{component="kube-state-metrics"}
|
|
)
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeDaemonSetRolloutStuck
|
|
annotations:
|
|
message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace
|
|
}}/{{ $labels.daemonset }} are scheduled and ready.
|
|
expr: |
|
|
kube_daemonset_status_number_ready{component="kube-state-metrics"}
|
|
/
|
|
kube_daemonset_status_desired_number_scheduled{component="kube-state-metrics"} * 100 < 100
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeDaemonSetNotScheduled
|
|
annotations:
|
|
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
|
}} are not scheduled.'
|
|
expr: |
|
|
kube_daemonset_status_desired_number_scheduled{component="kube-state-metrics"}
|
|
-
|
|
kube_daemonset_status_current_number_scheduled{component="kube-state-metrics"} > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeDaemonSetMisScheduled
|
|
annotations:
|
|
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
|
}} are running where they are not supposed to run.'
|
|
expr: |
|
|
kube_daemonset_status_number_misscheduled{component="kube-state-metrics"} > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeCronJobRunning
|
|
annotations:
|
|
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more
|
|
than 1h to complete.
|
|
expr: |
|
|
time() - kube_cronjob_next_schedule_time{component="kube-state-metrics"} > 3600
|
|
for: 1h
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeJobCompletion
|
|
annotations:
|
|
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
|
|
than one hour to complete.
|
|
expr: |
|
|
kube_job_spec_completions{component="kube-state-metrics"} - kube_job_status_succeeded{component="kube-state-metrics"} > 0
|
|
for: 1h
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeJobFailed
|
|
annotations:
|
|
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
|
|
expr: |
|
|
kube_job_status_failed{component="kube-state-metrics"} > 0
|
|
for: 1h
|
|
labels:
|
|
severity: warning
|
|
|
|
- name: kubernetes-resources
|
|
rules:
|
|
- alert: KubeCPUOvercommit
|
|
annotations:
|
|
message: Cluster has overcommitted CPU resource requests for Pods and cannot
|
|
tolerate node failure.
|
|
expr: |
|
|
sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
|
|
/
|
|
sum(node:node_num_cpu:sum)
|
|
>
|
|
(count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeMemOvercommit
|
|
annotations:
|
|
message: Cluster has overcommitted memory resource requests for Pods and cannot
|
|
tolerate node failure.
|
|
expr: |
|
|
sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
|
|
/
|
|
sum(node_memory_MemTotal_bytes)
|
|
>
|
|
(count(node:node_num_cpu:sum)-1)
|
|
/
|
|
count(node:node_num_cpu:sum)
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeCPUOvercommit
|
|
annotations:
|
|
message: Cluster has overcommitted CPU resource requests for Namespaces.
|
|
expr: |
|
|
sum(kube_resourcequota{component="kube-state-metrics", type="hard", resource="cpu"})
|
|
/
|
|
sum(node:node_num_cpu:sum)
|
|
> 1.5
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeMemOvercommit
|
|
annotations:
|
|
message: Cluster has overcommitted memory resource requests for Namespaces.
|
|
expr: |
|
|
sum(kube_resourcequota{component="kube-state-metrics", type="hard", resource="memory"})
|
|
/
|
|
sum(node_memory_MemTotal_bytes{component="node-exporter"})
|
|
> 1.5
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeQuotaExceeded
|
|
annotations:
|
|
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
|
|
}}% of its {{ $labels.resource }} quota.
|
|
expr: |
|
|
100 * kube_resourcequota{component="kube-state-metrics", type="used"}
|
|
/ ignoring(instance, job, type)
|
|
(kube_resourcequota{component="kube-state-metrics", type="hard"} > 0)
|
|
> 90
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- name: kubernetes-storage
|
|
rules:
|
|
- alert: KubePersistentVolumeUsageCritical
|
|
annotations:
|
|
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
|
}} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value
|
|
}}% free.
|
|
expr: |
|
|
100 * kubelet_volume_stats_available_bytes{job="kubelet"}
|
|
/
|
|
kubelet_volume_stats_capacity_bytes{job="kubelet"}
|
|
< 3
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubePersistentVolumeFullInFourDays
|
|
annotations:
|
|
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
|
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
|
|
days. Currently {{ printf "%0.2f" $value }}% is available.
|
|
expr: |
|
|
100 * (
|
|
kubelet_volume_stats_available_bytes{job="kubelet"}
|
|
/
|
|
kubelet_volume_stats_capacity_bytes{job="kubelet"}
|
|
) < 15
|
|
and
|
|
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubePersistentVolumeErrors
|
|
annotations:
|
|
message: The persistent volume {{ $labels.persistentvolume }} has status {{
|
|
$labels.phase }}.
|
|
expr: |
|
|
kube_persistentvolume_status_phase{phase=~"Failed|Pending",component="kube-state-metrics"} > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- name: kubernetes-system
|
|
rules:
|
|
- alert: KubeNodeNotReady
|
|
annotations:
|
|
message: '{{ $labels.node }} has been unready for more than an hour.'
|
|
expr: |
|
|
kube_node_status_condition{component="kube-state-metrics",condition="Ready",status="true"} == 0
|
|
for: 1h
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeVersionMismatch
|
|
annotations:
|
|
message: There are {{ $value }} different semantic versions of Kubernetes
|
|
components running.
|
|
expr: |
|
|
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!="kube-dns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
|
|
for: 1h
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeClientErrors
|
|
annotations:
|
|
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
|
|
}}' is experiencing {{ printf "%0.0f" $value }}% errors.
|
|
expr: |
|
|
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
|
|
/
|
|
sum(rate(rest_client_requests_total[5m])) by (instance, job))
|
|
* 100 > 1
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeClientErrors
|
|
annotations:
|
|
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
|
|
}}' is experiencing {{ printf "%0.0f" $value }} errors / second.
|
|
expr: |
|
|
sum(rate(ksm_scrape_error_total{component="kube-state-metrics"}[5m])) by (instance, job) > 0.1
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeletTooManyPods
|
|
annotations:
|
|
message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close
|
|
to the limit of 110.
|
|
expr: |
|
|
kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeAPILatencyHigh
|
|
annotations:
|
|
message: The API server has a 99th percentile latency of {{ $value }} seconds
|
|
for {{ $labels.verb }} {{ $labels.resource }}.
|
|
expr: |
|
|
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="kubernetes-apiservers",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeAPILatencyHigh
|
|
annotations:
|
|
message: The API server has a 99th percentile latency of {{ $value }} seconds
|
|
for {{ $labels.verb }} {{ $labels.resource }}.
|
|
expr: |
|
|
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="kubernetes-apiservers",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeAPIErrorsHigh
|
|
annotations:
|
|
message: API server is returning errors for {{ $value }}% of requests.
|
|
expr: |
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m]))
|
|
/
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) * 100 > 3
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeAPIErrorsHigh
|
|
annotations:
|
|
message: API server is returning errors for {{ $value }}% of requests.
|
|
expr: |
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m]))
|
|
/
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) * 100 > 1
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeAPIErrorsHigh
|
|
annotations:
|
|
message: API server is returning errors for {{ $value }}% of requests for
|
|
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
|
|
expr: |
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
|
|
/
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) by (resource,subresource,verb) * 100 > 10
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeAPIErrorsHigh
|
|
annotations:
|
|
message: API server is returning errors for {{ $value }}% of requests for
|
|
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
|
|
expr: |
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
|
|
/
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) by (resource,subresource,verb) * 100 > 5
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeClientCertificateExpiration
|
|
annotations:
|
|
message: A client certificate used to authenticate to the apiserver is expiring
|
|
in less than 7.0 days.
|
|
expr: |
|
|
apiserver_client_certificate_expiration_seconds_count{job="kubernetes-apiservers"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 604800
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeClientCertificateExpiration
|
|
annotations:
|
|
message: A client certificate used to authenticate to the apiserver is expiring
|
|
in less than 24.0 hours.
|
|
expr: |
|
|
apiserver_client_certificate_expiration_seconds_count{job="kubernetes-apiservers"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 86400
|
|
labels:
|
|
severity: critical
|
|
- name: alertmanager.rules
|
|
rules:
|
|
- alert: AlertmanagerConfigInconsistent
|
|
annotations:
|
|
message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
|
|
is out of sync.
|
|
expr: |
|
|
count_values("config_hash", alertmanager_config_hash{job="prometheus-alertmanager"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- alert: AlertmanagerFailedReload
|
|
annotations:
|
|
message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
|
|
}}/{{ $labels.pod}}.
|
|
expr: |
|
|
alertmanager_config_last_reload_successful{job="prometheus-alertmanager"} == 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: AlertmanagerMembersInconsistent
|
|
annotations:
|
|
message: Alertmanager has not found all other members of the cluster.
|
|
expr: |
|
|
alertmanager_cluster_members{job="prometheus-alertmanager"}
|
|
!= on (service) GROUP_LEFT()
|
|
count by (service) (alertmanager_cluster_members{job="prometheus-alertmanager"})
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- name: etcd.rules
|
|
rules:
|
|
- alert: EtcdDown
|
|
annotations:
|
|
message: Etcd instance is down on node {{ $labels.node }}.
|
|
expr: satellite_etcd_up == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- alert: EtcdUnhealthy
|
|
annotations:
|
|
message: Etcd cluster is unhealthy.
|
|
expr: satellite_etcd_health == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
- name: sysctl.rules
|
|
rules:
|
|
- alert: BrNetfilterMissing
|
|
annotations:
|
|
message: Bridge netfilter is disabled on node {{ $labels.node }}
|
|
runbook_url: https://gravitational.com/gravity/docs/requirements/#br_netfilter-module
|
|
expr: max_over_time(satellite_sysctl_br_netfilter[1h]) unless satellite_sysctl_br_netfilter or satellite_sysctl_br_netfilter == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- alert: IPv4ForwardingMissing
|
|
annotations:
|
|
message: IPv4 forwarding is disabled on node {{ $labels.node }}
|
|
runbook_url: https://gravitational.com/gravity/docs/faq/#ipv4-forwarding
|
|
expr: max_over_time(satellite_sysctl_ipv4_forwarding[1h]) unless satellite_sysctl_ipv4_forwarding or satellite_sysctl_ipv4_forwarding == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- name: docker.rules
|
|
rules:
|
|
- alert: DockerDown
|
|
annotations:
|
|
message: Docker daemon is down on host {{ $labels.node }}
|
|
expr: satellite_docker_health == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- name: systemd.rules
|
|
rules:
|
|
- alert: SystemdDegraded
|
|
annotations:
|
|
message: Systemd is degraded on host {{ $labels.node }}
|
|
expr: satellite_systemd_health == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- alert: SystemdUnitDegraded
|
|
annotations:
|
|
message: Systemd unit {{ $labels.unit_name }} is degraded on host {{ $labels.node }}
|
|
expr: satellite_systemd_unit_health == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- name: general.rules
|
|
rules:
|
|
- alert: TargetDown
|
|
annotations:
|
|
message: '{{ $value }}% of the {{ $labels.job }} targets are down.'
|
|
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: Watchdog
|
|
annotations:
|
|
message: |
|
|
This is an alert meant to ensure that the entire alerting pipeline is functional.
|
|
This alert is always firing, therefore it should always be firing in Alertmanager
|
|
and always fire against a receiver. There are integrations with various notification
|
|
mechanisms that send a notification when this alert is not firing. For example the
|
|
"DeadMansSnitch" integration in PagerDuty.
|
|
expr: vector(1)
|
|
labels:
|
|
severity: none
|
|
- name: kube-prometheus-node-alerting.rules
|
|
rules:
|
|
- alert: NodeDiskRunningFull
|
|
annotations:
|
|
message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
|
|
}}/{{ $labels.pod }} will be full within the next 24 hours.
|
|
expr: |
|
|
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
|
|
for: 30m
|
|
labels:
|
|
severity: warning
|
|
- alert: NodeDiskRunningFull
|
|
annotations:
|
|
message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
|
|
}}/{{ $labels.pod }} will be full within the next 2 hours.
|
|
expr: |
|
|
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
- name: node-network
|
|
rules:
|
|
- alert: NetworkReceiveErrors
|
|
annotations:
|
|
message: Network interface "{{ $labels.device }}" showing receive errors on
|
|
node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
|
|
expr: |
|
|
rate(node_network_receive_errs_total{component="node-exporter",device!~"veth.+"}[2m]) > 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
- alert: NetworkTransmitErrors
|
|
annotations:
|
|
message: Network interface "{{ $labels.device }}" showing transmit errors
|
|
on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
|
|
expr: |
|
|
rate(node_network_transmit_errs_total{component="node-exporter",device!~"veth.+"}[2m]) > 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
- alert: NodeNetworkInterfaceFlapping
|
|
annotations:
|
|
message: Network interface "{{ $labels.device }}" changing its up status
|
|
often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
|
|
expr: |
|
|
changes(node_network_up{component="node-exporter",device!~"veth.+"}[2m]) > 2
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
- name: prometheus.rules
|
|
rules:
|
|
- alert: PrometheusConfigReloadFailed
|
|
annotations:
|
|
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
|
|
summary: Reloading Prometheus' configuration failed
|
|
expr: |
|
|
prometheus_config_last_reload_successful{job="prometheus"} == 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusNotificationQueueRunningFull
|
|
annotations:
|
|
description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
|
|
$labels.pod}}
|
|
summary: Prometheus' alert notification queue is running full
|
|
expr: |
|
|
predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus"}
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusErrorSendingAlerts
|
|
annotations:
|
|
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
|
|
$labels.pod}} to Alertmanager {{$labels.alertmanager}}
|
|
summary: Errors while sending alerts from Prometheus
|
|
expr: |
|
|
rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.01
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusErrorSendingAlerts
|
|
annotations:
|
|
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
|
|
$labels.pod}} to Alertmanager {{$labels.alertmanager}}
|
|
summary: Errors while sending alerts from Prometheus
|
|
expr: |
|
|
rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.03
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
- alert: PrometheusNotConnectedToAlertmanagers
|
|
annotations:
|
|
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
|
|
to any Alertmanagers
|
|
summary: Prometheus is not connected to any Alertmanagers
|
|
expr: |
|
|
prometheus_notifications_alertmanagers_discovered{job="prometheus"} < 1
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusTSDBReloadsFailing
|
|
annotations:
|
|
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
|
|
reload failures over the last two hours.'
|
|
summary: Prometheus has issues reloading data blocks from disk
|
|
expr: |
|
|
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[2h]) > 0
|
|
for: 12h
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusTSDBCompactionsFailing
|
|
annotations:
|
|
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
|
|
compaction failures over the last two hours.'
|
|
summary: Prometheus has issues compacting sample blocks
|
|
expr: |
|
|
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[2h]) > 0
|
|
for: 12h
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusTSDBWALCorruptions
|
|
annotations:
|
|
description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
|
|
log (WAL).'
|
|
summary: Prometheus write-ahead log is corrupted
|
|
expr: |
|
|
prometheus_tsdb_wal_corruptions_total{job="prometheus"} > 0
|
|
for: 4h
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusNotIngestingSamples
|
|
annotations:
|
|
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting
|
|
samples.
|
|
summary: Prometheus isn't ingesting samples
|
|
expr: |
|
|
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusTargetScrapesDuplicate
|
|
annotations:
|
|
description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected
|
|
due to duplicate timestamps but different values'
|
|
summary: Prometheus has many samples rejected
|
|
expr: |
|
|
increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
# groups:
|
|
# - name: Instances
|
|
# rules:
|
|
# - alert: InstanceDown
|
|
# expr: up == 0
|
|
# for: 5m
|
|
# labels:
|
|
# severity: page
|
|
# annotations:
|
|
# description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'
|
|
# summary: 'Instance {{ $labels.instance }} down'
|
|
## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use alerting_rules.yml
|
|
alerts: {}
|
|
|
|
## Records configuration
|
|
## Ref: https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/
|
|
recording_rules.yml: {}
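## A minimal sketch of how recording_rules.yml could be populated (hypothetical
## group, record and metric names; the empty map above is the chart default):
##
# recording_rules.yml:
#   groups:
#     - name: example.rules
#       rules:
#         - record: job:http_requests_total:rate5m
#           expr: sum(rate(http_requests_total[5m])) by (job)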
|
|
## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use recording_rules.yml
|
|
rules: {}
|
|
|
|
prometheus.yml:
|
|
global:
|
|
evaluation_interval: 30s
|
|
scrape_interval: 20s
|
|
scrape_timeout: 10s
|
|
rule_files:
|
|
- /etc/config/recording_rules.yml
|
|
- /etc/config/alerting_rules.yml
|
|
## The two files below are DEPRECATED and will be removed from this default values file
|
|
- /etc/config/rules
|
|
- /etc/config/alerts
|
|
|
|
scrape_configs:
|
|
- job_name: prometheus
|
|
static_configs:
|
|
- targets:
|
|
- localhost:9090
|
|
|
|
# A scrape configuration for running Prometheus on a Kubernetes cluster.
|
|
# This uses separate scrape configs for cluster components (i.e. API server, node)
|
|
# and services to allow each to use different authentication configs.
|
|
#
|
|
# Kubernetes labels will be added as Prometheus labels on metrics via the
|
|
# `labelmap` relabeling action.
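#
# For example (hypothetical label, not part of this chart), a Kubernetes label
# `app: my-service` on a scraped object would surface on its metrics as the
# Prometheus label `app="my-service"` after the `labelmap` rules below.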
|
|
|
|
# Scrape config for API servers.
|
|
#
|
|
# Kubernetes exposes API servers as endpoints to the default/kubernetes
|
|
# service so this uses `endpoints` role and uses relabelling to only keep
|
|
# the endpoints associated with the default/kubernetes service using the
|
|
# default named port `https`. This works for single API server deployments as
|
|
# well as HA API server deployments.
|
|
- job_name: 'kubernetes-apiservers'
|
|
|
|
kubernetes_sd_configs:
|
|
- role: endpoints
|
|
|
|
# Default to scraping over https. If required, just disable this or change to
|
|
# `http`.
|
|
scheme: https
|
|
|
|
# This TLS & bearer token file config is used to connect to the actual scrape
|
|
# endpoints for cluster components. This is separate to discovery auth
|
|
# configuration because discovery & scraping are two separate concerns in
|
|
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
|
|
# the cluster. Otherwise, more config options have to be provided within the
|
|
# <kubernetes_sd_config>.
|
|
tls_config:
|
|
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
|
# If your node certificates are self-signed or use a different CA to the
|
|
# master CA, then disable certificate verification below. Note that
|
|
# certificate verification is an integral part of a secure infrastructure
|
|
# so this should only be disabled in a controlled environment. You can
|
|
# disable certificate verification with the setting below.
|
|
#
|
|
insecure_skip_verify: true
|
|
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
|
|
|
# Keep only the default/kubernetes service endpoints for the https port. This
|
|
# will add targets for each API server which Kubernetes adds an endpoint to
|
|
# the default/kubernetes service.
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
|
|
action: keep
|
|
regex: default;kubernetes;https
|
|
|
|
- job_name: 'kubernetes-nodes'
|
|
|
|
# Default to scraping over https. If required, just disable this or change to
|
|
# `http`.
|
|
scheme: https
|
|
|
|
# This TLS & bearer token file config is used to connect to the actual scrape
|
|
# endpoints for cluster components. This is separate to discovery auth
|
|
# configuration because discovery & scraping are two separate concerns in
|
|
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
|
|
# the cluster. Otherwise, more config options have to be provided within the
|
|
# <kubernetes_sd_config>.
|
|
tls_config:
|
|
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
|
# If your node certificates are self-signed or use a different CA to the
|
|
# master CA, then disable certificate verification below. Note that
|
|
# certificate verification is an integral part of a secure infrastructure
|
|
# so this should only be disabled in a controlled environment. You can
|
|
# disable certificate verification with the setting below.
|
|
#
|
|
insecure_skip_verify: true
|
|
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
|
|
|
kubernetes_sd_configs:
|
|
- role: node
|
|
|
|
relabel_configs:
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_node_label_(.+)
|
|
- target_label: __address__
|
|
replacement: kubernetes.default.svc:443
|
|
- source_labels: [__meta_kubernetes_node_name]
|
|
regex: (.+)
|
|
target_label: __metrics_path__
|
|
replacement: /api/v1/nodes/$1/proxy/metrics
- job_name: 'kubernetes-nodes-cadvisor'
|
|
|
|
# Default to scraping over https. If required, just disable this or change to
|
|
# `http`.
|
|
scheme: https
|
|
|
|
# This TLS & bearer token file config is used to connect to the actual scrape
|
|
# endpoints for cluster components. This is separate to discovery auth
|
|
# configuration because discovery & scraping are two separate concerns in
|
|
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
|
|
# the cluster. Otherwise, more config options have to be provided within the
|
|
# <kubernetes_sd_config>.
|
|
tls_config:
|
|
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
|
# If your node certificates are self-signed or use a different CA to the
|
|
# master CA, then disable certificate verification below. Note that
|
|
# certificate verification is an integral part of a secure infrastructure
|
|
# so this should only be disabled in a controlled environment. You can
|
|
# disable certificate verification with the setting below.
|
|
#
|
|
insecure_skip_verify: true
|
|
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
|
|
|
kubernetes_sd_configs:
|
|
- role: node
|
|
|
|
# This configuration will work only on kubelet 1.7.3+
|
|
# As the scrape endpoints for cAdvisor have changed
|
|
# if you are using an older version you need to change the replacement to
|
|
# replacement: /api/v1/nodes/$1:4194/proxy/metrics
|
|
# more info here https://github.com/coreos/prometheus-operator/issues/633
|
|
relabel_configs:
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_node_label_(.+)
|
|
- target_label: __address__
|
|
replacement: kubernetes.default.svc:443
|
|
- source_labels: [__meta_kubernetes_node_name]
|
|
regex: (.+)
|
|
target_label: __metrics_path__
|
|
replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
|
|
|
|
# Scrape config for service endpoints.
|
|
#
|
|
# The relabeling allows the actual service scrape endpoint to be configured
|
|
# via the following annotations:
|
|
#
|
|
# * `prometheus.io/scrape`: Only scrape services that have a value of `true`
|
|
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
|
|
# to set this to `https` & most likely set the `tls_config` of the scrape config.
|
|
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
|
|
# * `prometheus.io/port`: If the metrics are exposed on a different port to the
|
|
# service then set this appropriately.
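#
# A minimal sketch of a Service annotated for this job (hypothetical name and
# port; adjust them to your own workload):
#
# apiVersion: v1
# kind: Service
# metadata:
#   name: my-app
#   annotations:
#     prometheus.io/scrape: "true"
#     prometheus.io/scheme: "http"
#     prometheus.io/path: "/metrics"
#     prometheus.io/port: "8080"
# spec:
#   ports:
#     - name: metrics
#       port: 8080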
|
|
- job_name: 'kubernetes-service-endpoints'
|
|
|
|
kubernetes_sd_configs:
|
|
- role: endpoints
|
|
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
|
|
action: keep
|
|
regex: true
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
|
|
action: replace
|
|
target_label: __scheme__
|
|
regex: (https?)
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
|
|
action: replace
|
|
target_label: __metrics_path__
|
|
regex: (.+)
|
|
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
|
|
action: replace
|
|
target_label: __address__
|
|
regex: ([^:]+)(?::\d+)?;(\d+)
|
|
replacement: $1:$2
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_service_label_(.+)
|
|
- source_labels: [__meta_kubernetes_namespace]
|
|
action: replace
|
|
target_label: kubernetes_namespace
|
|
- source_labels: [__meta_kubernetes_service_name]
|
|
action: replace
|
|
target_label: kubernetes_name
|
|
- source_labels: [__meta_kubernetes_pod_node_name]
|
|
action: replace
|
|
target_label: kubernetes_node
|
|
|
|
# Scrape config for slow service endpoints; same as above, but with a larger
|
|
# timeout and a larger interval
|
|
#
|
|
# The relabeling allows the actual service scrape endpoint to be configured
|
|
# via the following annotations:
|
|
#
|
|
# * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true`
|
|
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
|
|
# to set this to `https` & most likely set the `tls_config` of the scrape config.
|
|
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
|
|
# * `prometheus.io/port`: If the metrics are exposed on a different port to the
|
|
# service then set this appropriately.
|
|
- job_name: 'kubernetes-service-endpoints-slow'
|
|
|
|
scrape_interval: 5m
|
|
scrape_timeout: 30s
|
|
|
|
kubernetes_sd_configs:
|
|
- role: endpoints
|
|
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape_slow]
|
|
action: keep
|
|
regex: true
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
|
|
action: replace
|
|
target_label: __scheme__
|
|
regex: (https?)
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
|
|
action: replace
|
|
target_label: __metrics_path__
|
|
regex: (.+)
|
|
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
|
|
action: replace
|
|
target_label: __address__
|
|
regex: ([^:]+)(?::\d+)?;(\d+)
|
|
replacement: $1:$2
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_service_label_(.+)
|
|
- source_labels: [__meta_kubernetes_namespace]
|
|
action: replace
|
|
target_label: kubernetes_namespace
|
|
- source_labels: [__meta_kubernetes_service_name]
|
|
action: replace
|
|
target_label: kubernetes_name
|
|
- source_labels: [__meta_kubernetes_pod_node_name]
|
|
action: replace
|
|
target_label: kubernetes_node
|
|
|
|
- job_name: 'prometheus-pushgateway'
|
|
honor_labels: true
|
|
|
|
kubernetes_sd_configs:
|
|
- role: service
|
|
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
|
|
action: keep
|
|
regex: pushgateway
|
|
|
|
# Example scrape config for probing services via the Blackbox Exporter.
|
|
#
|
|
# The relabeling allows the actual service scrape endpoint to be configured
|
|
# via the following annotations:
|
|
#
|
|
# * `prometheus.io/probe`: Only probe services that have a value of `true`
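#
# A minimal sketch of a Service opted in to probing (hypothetical name; the
# `blackbox` replacement address below assumes a blackbox exporter service
# reachable under that name in the cluster):
#
# apiVersion: v1
# kind: Service
# metadata:
#   name: my-frontend
#   annotations:
#     prometheus.io/probe: "true"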
|
|
- job_name: 'kubernetes-services'
|
|
|
|
metrics_path: /probe
|
|
params:
|
|
module: [http_2xx]
|
|
|
|
kubernetes_sd_configs:
|
|
- role: service
|
|
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
|
|
action: keep
|
|
regex: true
|
|
- source_labels: [__address__]
|
|
target_label: __param_target
|
|
- target_label: __address__
|
|
replacement: blackbox
|
|
- source_labels: [__param_target]
|
|
target_label: instance
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_service_label_(.+)
|
|
- source_labels: [__meta_kubernetes_namespace]
|
|
target_label: kubernetes_namespace
|
|
- source_labels: [__meta_kubernetes_service_name]
|
|
target_label: kubernetes_name
|
|
|
|
# Example scrape config for pods
|
|
#
|
|
# The relabeling allows the actual pod scrape endpoint to be configured via the
|
|
# following annotations:
|
|
#
|
|
# * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
|
|
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
|
|
# * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
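#
# A minimal sketch of the pod template metadata that opts a pod in to this job
# (hypothetical path and port; point them at wherever the container serves metrics):
#
# template:
#   metadata:
#     annotations:
#       prometheus.io/scrape: "true"
#       prometheus.io/path: "/metrics"
#       prometheus.io/port: "9102"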
|
|
- job_name: 'kubernetes-pods'
|
|
|
|
kubernetes_sd_configs:
|
|
- role: pod
|
|
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
|
|
action: keep
|
|
regex: true
|
|
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
|
action: replace
|
|
target_label: __metrics_path__
|
|
regex: (.+)
|
|
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
|
action: replace
|
|
regex: ([^:]+)(?::\d+)?;(\d+)
|
|
replacement: $1:$2
|
|
target_label: __address__
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_pod_label_(.+)
|
|
- source_labels: [__meta_kubernetes_namespace]
|
|
action: replace
|
|
target_label: kubernetes_namespace
|
|
- source_labels: [__meta_kubernetes_pod_name]
|
|
action: replace
|
|
target_label: kubernetes_pod_name
|
|
- source_labels: [__meta_kubernetes_pod_phase]
|
|
regex: Pending|Succeeded|Failed
|
|
action: drop
|
|
|
|
# Example scrape config for pods which should be scraped slower. A useful example
|
|
# would be stackdriver-exporter, which queries an API on every scrape of the pod
|
|
#
|
|
# The relabeling allows the actual pod scrape endpoint to be configured via the
|
|
# following annotations:
|
|
#
|
|
# * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true`
|
|
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
|
|
# * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
|
|
- job_name: 'kubernetes-pods-slow'
|
|
|
|
scrape_interval: 5m
|
|
scrape_timeout: 30s
|
|
|
|
kubernetes_sd_configs:
|
|
- role: pod
|
|
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow]
|
|
action: keep
|
|
regex: true
|
|
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
|
action: replace
|
|
target_label: __metrics_path__
|
|
regex: (.+)
|
|
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
|
action: replace
|
|
regex: ([^:]+)(?::\d+)?;(\d+)
|
|
replacement: $1:$2
|
|
target_label: __address__
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_pod_label_(.+)
|
|
- source_labels: [__meta_kubernetes_namespace]
|
|
action: replace
|
|
target_label: kubernetes_namespace
|
|
- source_labels: [__meta_kubernetes_pod_name]
|
|
action: replace
|
|
target_label: kubernetes_pod_name
|
|
- source_labels: [__meta_kubernetes_pod_phase]
|
|
regex: Pending|Succeeded|Failed
|
|
action: drop
|
|
|
|
# Adds additional scrape configs to prometheus.yml.
|
|
# It must be a string, so you have to add a | after extraScrapeConfigs:
|
|
# example adds prometheus-blackbox-exporter scrape config
|
|
extraScrapeConfigs:
|
|
# - job_name: 'prometheus-blackbox-exporter'
|
|
# metrics_path: /probe
|
|
# params:
|
|
# module: [http_2xx]
|
|
# static_configs:
|
|
# - targets:
|
|
# - https://example.com
|
|
# relabel_configs:
|
|
# - source_labels: [__address__]
|
|
# target_label: __param_target
|
|
# - source_labels: [__param_target]
|
|
# target_label: instance
|
|
# - target_label: __address__
|
|
# replacement: prometheus-blackbox-exporter:9115
|
|
|
|
# Adds option to add alert_relabel_configs to avoid duplicate alerts in alertmanager
|
|
# useful in H/A prometheus with different external labels but the same alerts
|
|
alertRelabelConfigs:
|
|
# alert_relabel_configs:
|
|
# - source_labels: [dc]
|
|
# regex: (.+)\d+
|
|
# target_label: dc
|
|
|
|
networkPolicy:
|
|
## Enable creation of NetworkPolicy resources.
|
|
##
|
|
enabled: true
|
|
|
|
# Force namespace of namespaced resources
|
|
forceNamespace: null
|