You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

3213 lines
118 KiB
YAML

rbac:
create: true
podSecurityPolicy:
enabled: false
imagePullSecrets:
# - name: "image-pull-secret"
## Define serviceAccount names for components. Defaults to component's fully qualified name.
##
serviceAccounts:
alertmanager:
create: true
name:
annotations: {}
nodeExporter:
create: true
name:
annotations: {}
pushgateway:
create: true
name:
annotations: {}
kubeStateMetrics:
create: true
name:
annotations: {}
server:
create: true
name:
annotations: {}
alertmanager:
## If false, alertmanager will not be installed
##
enabled: true
## Use a ClusterRole (and ClusterRoleBinding)
## - If set to false - we define a Role and RoleBinding in the defined namespaces ONLY
## This makes alertmanager work - for users who do not have ClusterAdmin privs, but wants alertmanager to operate on their own namespaces, instead of clusterwide.
useClusterRole: true
## Set to a rolename to use existing role - skipping role creating - but still doing serviceaccount and rolebinding to the rolename set here.
useExistingRole: false
## alertmanager container name
##
name: alertmanager
## alertmanager container image
##
image:
repository: prom/alertmanager
tag: v0.21.0
pullPolicy: IfNotPresent
## alertmanager priorityClassName
##
priorityClassName: ""
## Additional alertmanager container arguments
##
extraArgs: {}
## Additional InitContainers to initialize the pod
##
extraInitContainers: []
## The URL prefix at which the container can be accessed. Useful in the case the '-web.external-url' includes a slug
## so that the various internal URLs are still able to access as they are in the default case.
## (Optional)
prefixURL: ""
## External URL which can access alertmanager
baseURL: "http://localhost:9093"
## Additional alertmanager container environment variable
## For instance to add a http_proxy
##
extraEnv: {}
## Additional alertmanager Secret mounts
# Defines additional mounts with secrets. Secrets must be manually created in the namespace.
extraSecretMounts: []
# - name: secret-files
# mountPath: /etc/secrets
# subPath: ""
# secretName: alertmanager-secret-files
# readOnly: true
## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.alertmanager.configMapOverrideName}}
## Defining configMapOverrideName will cause templates/alertmanager-configmap.yaml
## to NOT generate a ConfigMap resource
##
configMapOverrideName: ""
## The name of a secret in the same kubernetes namespace which contains the Alertmanager config
## Defining configFromSecret will cause templates/alertmanager-configmap.yaml
## to NOT generate a ConfigMap resource
##
configFromSecret: ""
## The configuration file name to be loaded to alertmanager
## Must match the key within configuration loaded from ConfigMap/Secret
##
configFileName: alertmanager.yml
ingress:
## If true, alertmanager Ingress will be created
##
enabled: false
## alertmanager Ingress annotations
##
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: 'true'
## alertmanager Ingress additional labels
##
extraLabels: {}
## alertmanager Ingress hostnames with optional path
## Must be provided if Ingress is enabled
##
hosts: []
# - alertmanager.domain.com
# - domain.com/alertmanager
## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
extraPaths: []
# - path: /*
# backend:
# serviceName: ssl-redirect
# servicePort: use-annotation
## alertmanager Ingress TLS configuration
## Secrets must be manually created in the namespace
##
tls: []
# - secretName: prometheus-alerts-tls
# hosts:
# - alertmanager.domain.com
## Alertmanager Deployment Strategy type
# strategy:
# type: Recreate
## Node tolerations for alertmanager scheduling to nodes with taints
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
##
tolerations: []
# - key: "key"
# operator: "Equal|Exists"
# value: "value"
# effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"
## Node labels for alertmanager pod assignment
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
##
nodeSelector: {}
## Pod affinity
##
affinity: {}
## PodDisruptionBudget settings
## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
##
podDisruptionBudget:
enabled: false
maxUnavailable: 1
## Use an alternate scheduler, e.g. "stork".
## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/
##
# schedulerName:
persistentVolume:
## If true, alertmanager will create/use a Persistent Volume Claim
## If false, use emptyDir
##
enabled: true
## alertmanager data Persistent Volume access modes
## Must match those of existing PV or dynamic provisioner
## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/
##
accessModes:
- ReadWriteOnce
## alertmanager data Persistent Volume Claim annotations
##
annotations: {}
## alertmanager data Persistent Volume existing claim name
## Requires alertmanager.persistentVolume.enabled: true
## If defined, PVC must be created manually before volume will be bound
existingClaim: ""
## alertmanager data Persistent Volume mount root path
##
mountPath: /data
## alertmanager data Persistent Volume size
##
size: 2Gi
## alertmanager data Persistent Volume Storage Class
## If defined, storageClassName: <storageClass>
## If set to "-", storageClassName: "", which disables dynamic provisioning
## If undefined (the default) or set to null, no storageClassName spec is
## set, choosing the default provisioner. (gp2 on AWS, standard on
## GKE, AWS & OpenStack)
##
# storageClass: "-"
## alertmanager data Persistent Volume Binding Mode
## If defined, volumeBindingMode: <volumeBindingMode>
## If undefined (the default) or set to null, no volumeBindingMode spec is
## set, choosing the default mode.
##
# volumeBindingMode: ""
## Subdirectory of alertmanager data Persistent Volume to mount
## Useful if the volume's root directory is not empty
##
subPath: ""
## Annotations to be added to alertmanager pods
##
podAnnotations: {}
## Tell prometheus to use a specific set of alertmanager pods
## instead of all alertmanager pods found in the same namespace
## Useful if you deploy multiple releases within the same namespace
##
## prometheus.io/probe: alertmanager-teamA
## Labels to be added to Prometheus AlertManager pods
##
podLabels: {}
## Specify if a Pod Security Policy for node-exporter must be created
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
##
podSecurityPolicy:
annotations: {}
## Specify pod annotations
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
##
# seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
# seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
# apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
## Use a StatefulSet if replicaCount needs to be greater than 1 (see below)
##
replicaCount: 1
## Annotations to be added to deployment
##
deploymentAnnotations: {}
statefulSet:
## If true, use a statefulset instead of a deployment for pod management.
## This allows to scale replicas to more than 1 pod
##
enabled: false
annotations: {}
labels: {}
podManagementPolicy: OrderedReady
## Alertmanager headless service to use for the statefulset
##
headless:
annotations: {}
labels: {}
## Enabling peer mesh service end points for enabling the HA alert manager
## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md
enableMeshPeer: false
servicePort: 80
## alertmanager resource requests and limits
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
##
resources: {}
# limits:
# cpu: 10m
# memory: 32Mi
# requests:
# cpu: 10m
# memory: 32Mi
## Security context to be added to alertmanager pods
##
securityContext:
runAsUser: 65534
runAsNonRoot: true
runAsGroup: 65534
fsGroup: 65534
service:
annotations: {}
labels: {}
clusterIP: ""
## Enabling peer mesh service end points for enabling the HA alert manager
## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md
# enableMeshPeer : true
## List of IP addresses at which the alertmanager service is available
## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
##
externalIPs: []
loadBalancerIP: ""
loadBalancerSourceRanges: []
servicePort: 80
# nodePort: 30000
sessionAffinity: None
type: ClusterIP
## Monitors ConfigMap changes and POSTs to a URL
## Ref: https://github.com/jimmidyson/configmap-reload
##
configmapReload:
prometheus:
## If false, the configmap-reload container will not be deployed
##
enabled: true
## configmap-reload container name
##
name: configmap-reload
## configmap-reload container image
##
image:
repository: jimmidyson/configmap-reload
tag: v0.4.0
pullPolicy: IfNotPresent
## Additional configmap-reload container arguments
##
extraArgs: {}
## Additional configmap-reload volume directories
##
extraVolumeDirs: []
## Additional configmap-reload mounts
##
extraConfigmapMounts: []
# - name: prometheus-alerts
# mountPath: /etc/alerts.d
# subPath: ""
# configMap: prometheus-alerts
# readOnly: true
## configmap-reload resource requests and limits
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
##
resources: {}
alertmanager:
## If false, the configmap-reload container will not be deployed
##
enabled: true
## configmap-reload container name
##
name: configmap-reload
## configmap-reload container image
##
image:
repository: jimmidyson/configmap-reload
tag: v0.4.0
pullPolicy: IfNotPresent
## Additional configmap-reload container arguments
##
extraArgs: {}
## Additional configmap-reload volume directories
##
extraVolumeDirs: []
## Additional configmap-reload mounts
##
extraConfigmapMounts: []
# - name: prometheus-alerts
# mountPath: /etc/alerts.d
# subPath: ""
# configMap: prometheus-alerts
# readOnly: true
## configmap-reload resource requests and limits
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
##
resources: {}
kubeStateMetrics:
## If false, kube-state-metrics will not be installed
##
enabled: true
## kube-state-metrics container name
##
name: kube-state-metrics
image:
repository: quay.io/coreos/kube-state-metrics
tag: v1.9.7
pullPolicy: IfNotPresent
podSecurityPolicy:
annotations: {}
additionalVolumes: []
imagePullSecrets: []
# - name: "image-pull-secret"
# If set to true, this will deploy kube-state-metrics as a StatefulSet and the data
# will be automatically sharded across <.Values.replicas> pods using the built-in
# autodiscovery feature: https://github.com/kubernetes/kube-state-metrics#automated-sharding
# This is an experimental feature and there are no stability guarantees.
autosharding:
enabled: false
replicas: 1
service:
port: 8080
# Default to clusterIP for backward compatibility
type: ClusterIP
nodePort: 0
loadBalancerIP: ""
annotations: {}
customLabels: {}
hostNetwork: false
securityContext:
enabled: true
runAsGroup: 65534
runAsUser: 65534
fsGroup: 65534
## Node labels for pod assignment
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
nodeSelector: {}
## Affinity settings for pod assignment
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
affinity: {}
## Tolerations for pod assignment
## Ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
tolerations: []
# Annotations to be added to the pod
podAnnotations: {}
## Assign a PriorityClassName to pods if set
# priorityClassName: ""
# Ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/
podDisruptionBudget: {}
# Available collectors for kube-state-metrics. By default all available
# collectors are enabled.
collectors:
certificatesigningrequests: true
configmaps: true
cronjobs: true
daemonsets: true
deployments: true
endpoints: true
horizontalpodautoscalers: true
ingresses: true
jobs: true
limitranges: true
mutatingwebhookconfigurations: true
namespaces: true
networkpolicies: true
nodes: true
persistentvolumeclaims: true
persistentvolumes: true
poddisruptionbudgets: true
pods: true
replicasets: true
replicationcontrollers: true
resourcequotas: true
secrets: true
services: true
statefulsets: true
storageclasses: true
validatingwebhookconfigurations: true
verticalpodautoscalers: false
volumeattachments: true
nodeExporter:
## If false, node-exporter will not be installed
##
enabled: true
## If true, node-exporter pods share the host network namespace
##
hostNetwork: false
## If true, node-exporter pods share the host PID namespace
##
hostPID: false
## node-exporter container name
##
name: node-exporter
## node-exporter container image
##
image:
repository: prom/node-exporter
tag: v1.0.1
pullPolicy: IfNotPresent
## Specify if a Pod Security Policy for node-exporter must be created
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
##
podSecurityPolicy:
annotations: {}
## Specify pod annotations
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
##
# seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
# seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
# apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
## node-exporter priorityClassName
##
priorityClassName: ""
## Custom Update Strategy
##
updateStrategy:
type: RollingUpdate
## Additional node-exporter container arguments
##
extraArgs: {}
## Additional InitContainers to initialize the pod
##
extraInitContainers: []
## Additional node-exporter hostPath mounts
##
extraHostPathMounts: []
# - name: textfile-dir
# mountPath: /srv/txt_collector
# hostPath: /var/lib/node-exporter
# readOnly: true
# mountPropagation: HostToContainer
extraConfigmapMounts: []
# - name: certs-configmap
# mountPath: /prometheus
# configMap: certs-configmap
# readOnly: true
## Node tolerations for node-exporter scheduling to nodes with taints
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
##
tolerations: []
# - key: "key"
# operator: "Equal|Exists"
# value: "value"
# effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"
## Node labels for node-exporter pod assignment
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
##
nodeSelector: {}
## Annotations to be added to node-exporter pods
##
podAnnotations: {}
## Labels to be added to node-exporter pods
##
pod:
labels: {}
## PodDisruptionBudget settings
## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
##
podDisruptionBudget:
enabled: false
maxUnavailable: 1
## node-exporter resource limits & requests
## Ref: https://kubernetes.io/docs/user-guide/compute-resources/
##
resources: {}
# limits:
# cpu: 200m
# memory: 50Mi
# requests:
# cpu: 100m
# memory: 30Mi
## Security context to be added to node-exporter pods
##
securityContext: {}
# runAsUser: 0
service:
annotations:
prometheus.io/scrape: "true"
labels: {}
# Exposed as a headless service:
# https://kubernetes.io/docs/concepts/services-networking/service/#headless-services
clusterIP: None
## List of IP addresses at which the node-exporter service is available
## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
##
externalIPs: []
hostPort: 9100
loadBalancerIP: ""
loadBalancerSourceRanges: []
servicePort: 9100
type: ClusterIP
server:
## Prometheus server container name
##
enabled: true
## Use a ClusterRole (and ClusterRoleBinding)
## - If set to false - we define a RoleBinding in the defined namespaces ONLY
##
## NB: because we need a Role with nonResourceURL's ("/metrics") - you must get someone with Cluster-admin privileges to define this role for you, before running with this setting enabled.
## This makes prometheus work - for users who do not have ClusterAdmin privs, but wants prometheus to operate on their own namespaces, instead of clusterwide.
##
## You MUST also set namespaces to the ones you have access to and want monitored by Prometheus.
##
# useExistingClusterRoleName: nameofclusterrole
## namespaces to monitor (instead of monitoring all - clusterwide). Needed if you want to run without Cluster-admin privileges.
# namespaces:
# - yournamespace
name: server
sidecarContainers:
## Prometheus server container image
##
image:
repository: prom/prometheus
tag: v2.21.0
pullPolicy: IfNotPresent
## prometheus server priorityClassName
##
priorityClassName: ""
## EnableServiceLinks indicates whether information about services should be injected
## into pod's environment variables, matching the syntax of Docker links.
## WARNING: the field is unsupported and will be skipped in K8s prior to v1.13.0.
##
enableServiceLinks: true
## The URL prefix at which the container can be accessed. Useful in the case the '-web.external-url' includes a slug
## so that the various internal URLs are still able to access as they are in the default case.
## (Optional)
prefixURL: ""
## External URL which can access alertmanager
## Maybe same with Ingress host name
baseURL: ""
## Additional server container environment variables
##
## You specify this manually like you would a raw deployment manifest.
## This means you can bind in environment variables from secrets.
##
## e.g. static environment variable:
## - name: DEMO_GREETING
## value: "Hello from the environment"
##
## e.g. secret environment variable:
## - name: USERNAME
## valueFrom:
## secretKeyRef:
## name: mysecret
## key: username
env: []
extraFlags:
- web.enable-lifecycle
## web.enable-admin-api flag controls access to the administrative HTTP API which includes functionality such as
## deleting time series. This is disabled by default.
# - web.enable-admin-api
##
## storage.tsdb.no-lockfile flag controls BD locking
# - storage.tsdb.no-lockfile
##
## storage.tsdb.wal-compression flag enables compression of the write-ahead log (WAL)
# - storage.tsdb.wal-compression
## Path to a configuration file on prometheus server container FS
configPath: /etc/config/prometheus.yml
global:
## How frequently to scrape targets by default
##
scrape_interval: 20s
## How long until a scrape request times out
##
scrape_timeout: 10s
## How frequently to evaluate rules
##
evaluation_interval: 30s
## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write
##
remoteWrite: []
## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_read
##
remoteRead: []
## Additional Prometheus server container arguments
##
extraArgs: {}
## Additional InitContainers to initialize the pod
##
extraInitContainers: []
## Additional Prometheus server Volume mounts
##
extraVolumeMounts: []
## Additional Prometheus server Volumes
##
extraVolumes: []
## Additional Prometheus server hostPath mounts
##
extraHostPathMounts: []
# - name: certs-dir
# mountPath: /etc/kubernetes/certs
# subPath: ""
# hostPath: /etc/kubernetes/certs
# readOnly: true
extraConfigmapMounts: []
# - name: certs-configmap
# mountPath: /prometheus
# subPath: ""
# configMap: certs-configmap
# readOnly: true
## Additional Prometheus server Secret mounts
# Defines additional mounts with secrets. Secrets must be manually created in the namespace.
extraSecretMounts: []
# - name: secret-files
# mountPath: /etc/secrets
# subPath: ""
# secretName: prom-secret-files
# readOnly: true
## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.server.configMapOverrideName}}
## Defining configMapOverrideName will cause templates/server-configmap.yaml
## to NOT generate a ConfigMap resource
##
configMapOverrideName: ""
ingress:
## If true, Prometheus server Ingress will be created
##
enabled: false
## Prometheus server Ingress annotations
##
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: 'true'
## Prometheus server Ingress additional labels
##
extraLabels: {}
## Prometheus server Ingress hostnames with optional path
## Must be provided if Ingress is enabled
##
hosts: []
# - prometheus.domain.com
# - domain.com/prometheus
## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
extraPaths: []
# - path: /*
# backend:
# serviceName: ssl-redirect
# servicePort: use-annotation
## Prometheus server Ingress TLS configuration
## Secrets must be manually created in the namespace
##
tls: []
# - secretName: prometheus-server-tls
# hosts:
# - prometheus.domain.com
## Server Deployment Strategy type
# strategy:
# type: Recreate
## hostAliases allows adding entries to /etc/hosts inside the containers
hostAliases: []
# - ip: "127.0.0.1"
# hostnames:
# - "example.com"
## Node tolerations for server scheduling to nodes with taints
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
##
tolerations: []
# - key: "key"
# operator: "Equal|Exists"
# value: "value"
# effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"
## Node labels for Prometheus server pod assignment
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
##
nodeSelector: {}
## Pod affinity
##
affinity: {}
## PodDisruptionBudget settings
## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
##
podDisruptionBudget:
enabled: false
maxUnavailable: 1
## Use an alternate scheduler, e.g. "stork".
## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/
##
# schedulerName:
persistentVolume:
## If true, Prometheus server will create/use a Persistent Volume Claim
## If false, use emptyDir
##
enabled: true
## Prometheus server data Persistent Volume access modes
## Must match those of existing PV or dynamic provisioner
## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/
##
accessModes:
- ReadWriteOnce
## Prometheus server data Persistent Volume annotations
##
annotations: {}
## Prometheus server data Persistent Volume existing claim name
## Requires server.persistentVolume.enabled: true
## If defined, PVC must be created manually before volume will be bound
existingClaim: ""
## Prometheus server data Persistent Volume mount root path
##
mountPath: /data
## Prometheus server data Persistent Volume size
##
size: 8Gi
## Prometheus server data Persistent Volume Storage Class
## If defined, storageClassName: <storageClass>
## If set to "-", storageClassName: "", which disables dynamic provisioning
## If undefined (the default) or set to null, no storageClassName spec is
## set, choosing the default provisioner. (gp2 on AWS, standard on
## GKE, AWS & OpenStack)
##
# storageClass: "-"
## Prometheus server data Persistent Volume Binding Mode
## If defined, volumeBindingMode: <volumeBindingMode>
## If undefined (the default) or set to null, no volumeBindingMode spec is
## set, choosing the default mode.
##
# volumeBindingMode: ""
## Subdirectory of Prometheus server data Persistent Volume to mount
## Useful if the volume's root directory is not empty
##
subPath: ""
emptyDir:
sizeLimit: ""
## Annotations to be added to Prometheus server pods
##
podAnnotations: {}
# iam.amazonaws.com/role: prometheus
## Labels to be added to Prometheus server pods
##
podLabels: {}
## Prometheus AlertManager configuration
##
alertmanagers: []
## Specify if a Pod Security Policy for node-exporter must be created
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
##
podSecurityPolicy:
annotations: {}
## Specify pod annotations
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
##
# seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
# seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
# apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
## Use a StatefulSet if replicaCount needs to be greater than 1 (see below)
##
replicaCount: 1
## Annotations to be added to deployment
##
deploymentAnnotations: {}
statefulSet:
## If true, use a statefulset instead of a deployment for pod management.
## This allows to scale replicas to more than 1 pod
##
enabled: false
annotations: {}
labels: {}
podManagementPolicy: OrderedReady
## Alertmanager headless service to use for the statefulset
##
headless:
annotations: {}
labels: {}
servicePort: 80
## Enable gRPC port on service to allow auto discovery with thanos-querier
gRPC:
enabled: false
servicePort: 10901
# nodePort: 10901
## Prometheus server readiness and liveness probe initial delay and timeout
## Ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/
##
readinessProbeInitialDelay: 30
readinessProbePeriodSeconds: 5
readinessProbeTimeout: 30
readinessProbeFailureThreshold: 3
readinessProbeSuccessThreshold: 1
livenessProbeInitialDelay: 30
livenessProbePeriodSeconds: 15
livenessProbeTimeout: 30
livenessProbeFailureThreshold: 3
livenessProbeSuccessThreshold: 1
## Prometheus server resource requests and limits
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
##
resources: {}
# limits:
# cpu: 500m
# memory: 512Mi
# requests:
# cpu: 500m
# memory: 512Mi
## Vertical Pod Autoscaler config
## Ref: https://github.com/kubernetes/autoscaler/tree/master/vertical-pod-autoscaler
verticalAutoscaler:
## If true a VPA object will be created for the controller (either StatefulSet or Deployemnt, based on above configs)
enabled: false
# updateMode: "Auto"
# containerPolicies:
# - containerName: 'prometheus-server'
## Security context to be added to server pods
##
securityContext:
runAsUser: 65534
runAsNonRoot: true
runAsGroup: 65534
fsGroup: 65534
service:
annotations: {}
labels: {}
clusterIP: ""
## List of IP addresses at which the Prometheus server service is available
## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
##
externalIPs: []
loadBalancerIP: ""
loadBalancerSourceRanges: []
servicePort: 9090
sessionAffinity: None
type: ClusterIP
## Enable gRPC port on service to allow auto discovery with thanos-querier
gRPC:
enabled: false
servicePort: 10901
# nodePort: 10901
## If using a statefulSet (statefulSet.enabled=true), configure the
## service to connect to a specific replica to have a consistent view
## of the data.
statefulsetReplica:
enabled: false
replica: 0
## Prometheus server pod termination grace period
##
terminationGracePeriodSeconds: 300
## Prometheus data retention period (default if not specified is 15 days)
##
retention: "15d"
pushgateway:
## If false, pushgateway will not be installed
##
enabled: true
## Use an alternate scheduler, e.g. "stork".
## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/
##
# schedulerName:
## pushgateway container name
##
name: pushgateway
## pushgateway container image
##
image:
repository: prom/pushgateway
tag: v1.2.0
pullPolicy: IfNotPresent
## pushgateway priorityClassName
##
priorityClassName: ""
## Additional pushgateway container arguments
##
## for example: persistence.file: /data/pushgateway.data
extraArgs: {}
## Additional InitContainers to initialize the pod
##
extraInitContainers: []
ingress:
## If true, pushgateway Ingress will be created
##
enabled: false
## pushgateway Ingress annotations
##
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: 'true'
## pushgateway Ingress hostnames with optional path
## Must be provided if Ingress is enabled
##
hosts: []
# - pushgateway.domain.com
# - domain.com/pushgateway
## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
extraPaths: []
# - path: /*
# backend:
# serviceName: ssl-redirect
# servicePort: use-annotation
## pushgateway Ingress TLS configuration
## Secrets must be manually created in the namespace
##
tls: []
# - secretName: prometheus-alerts-tls
# hosts:
# - pushgateway.domain.com
## Node tolerations for pushgateway scheduling to nodes with taints
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
##
tolerations: []
# - key: "key"
# operator: "Equal|Exists"
# value: "value"
# effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"
## Node labels for pushgateway pod assignment
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
##
nodeSelector: {}
## Annotations to be added to pushgateway pods
##
podAnnotations: {}
## Labels to be added to pushgateway pods
##
podLabels: {}
## Specify if a Pod Security Policy for node-exporter must be created
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
##
podSecurityPolicy:
annotations: {}
## Specify pod annotations
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
##
# seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
# seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
# apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
replicaCount: 1
## Annotations to be added to deployment
##
deploymentAnnotations: {}
## PodDisruptionBudget settings
## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
##
podDisruptionBudget:
enabled: false
maxUnavailable: 1
## pushgateway resource requests and limits
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
##
resources: {}
# limits:
# cpu: 10m
# memory: 32Mi
# requests:
# cpu: 10m
# memory: 32Mi
## Security context to be added to push-gateway pods
##
securityContext:
runAsUser: 65534
runAsNonRoot: true
service:
annotations:
prometheus.io/probe: pushgateway
labels: {}
clusterIP: ""
## List of IP addresses at which the pushgateway service is available
## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
##
externalIPs: []
loadBalancerIP: ""
loadBalancerSourceRanges: []
servicePort: 9091
type: ClusterIP
## pushgateway Deployment Strategy type
# strategy:
# type: Recreate
persistentVolume:
## If true, pushgateway will create/use a Persistent Volume Claim
## If false, use emptyDir
##
enabled: false
## pushgateway data Persistent Volume access modes
## Must match those of existing PV or dynamic provisioner
## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/
##
accessModes:
- ReadWriteOnce
## pushgateway data Persistent Volume Claim annotations
##
annotations: {}
## pushgateway data Persistent Volume existing claim name
## Requires pushgateway.persistentVolume.enabled: true
## If defined, PVC must be created manually before volume will be bound
existingClaim: ""
## pushgateway data Persistent Volume mount root path
##
mountPath: /data
## pushgateway data Persistent Volume size
##
size: 2Gi
## pushgateway data Persistent Volume Storage Class
## If defined, storageClassName: <storageClass>
## If set to "-", storageClassName: "", which disables dynamic provisioning
## If undefined (the default) or set to null, no storageClassName spec is
## set, choosing the default provisioner. (gp2 on AWS, standard on
## GKE, AWS & OpenStack)
##
# storageClass: "-"
## pushgateway data Persistent Volume Binding Mode
## If defined, volumeBindingMode: <volumeBindingMode>
## If undefined (the default) or set to null, no volumeBindingMode spec is
## set, choosing the default mode.
##
# volumeBindingMode: ""
## Subdirectory of pushgateway data Persistent Volume to mount
## Useful if the volume's root directory is not empty
##
subPath: ""
## alertmanager ConfigMap entries
##
alertmanagerFiles:
alertmanager.yml:
global:
resolve_timeout: 30s
route:
group_by: ["alertname"]
group_wait: 5s
group_interval: 10s
repeat_interval: 999h
receiver: "default"
routes:
- receiver: "default"
group_by: []
match_re:
alertname: .*
continue: true
- receiver: "watchdog"
group_by: ["alertname", "instance"]
match_re:
alertname: Watchdog
continue: false
- receiver: "by-cluster-service"
group_by: ["alertname", "cluster", "service"]
match_re:
alertname: .*
continue: true
- receiver: "by-name"
group_by: [alertname]
match_re:
alertname: .*
continue: true
- receiver: "by-cluster"
group_by: [cluster]
match_re:
alertname: .*
continue: true
inhibit_rules:
- source_match:
severity: "critical"
target_match:
severity: "warning"
# Apply inhibition if the alertname and cluster is the same in both
equal: ["alertname", "cluster"]
receivers:
- name: "default"
- name: "watchdog"
- name: "by-cluster-service"
- name: "by-name"
- name: "by-cluster"
## Prometheus server ConfigMap entries
##
serverFiles:
## Alerts configuration
## Ref: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
#alerting_rules.yml: {}
alerting_rules.yml:
groups:
- name: promethus
rules:
- alert: PrometheusJobMissing
expr: absent(up{job="prometheus"})
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus job missing (instance {{ $labels.instance }})"
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTargetMissing
expr: up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus target missing (instance {{ $labels.instance }})"
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusAllTargetsMissing
expr: count by (job) (up) == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus all targets missing (instance {{ $labels.instance }})"
description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})"
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusAlertmanagerConfigurationReloadFailure
expr: alertmanager_config_last_reload_successful != 1
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})"
description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusAlertmanagerConfigNotSynced
expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus AlertManager config not synced (instance {{ $labels.instance }})"
description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusNotConnectedToAlertmanager
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTemplateTextExpansionFailures
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusRuleEvaluationSlow
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus rule evaluation slow (instance {{ $labels.instance }})"
description: "Prometheus rule evaluation took more time than the scheduled interval. I indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTargetEmpty
expr: prometheus_sd_discovered_targets == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus target empty (instance {{ $labels.instance }})"
description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus large scrape (instance {{ $labels.instance }})"
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTargetScrapeDuplicate
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus target scrape duplicate (instance {{ $labels.instance }})"
description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbCheckpointCreationFailures
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbCheckpointDeletionFailures
expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbHeadTruncationsFailed
expr: increase(prometheus_tsdb_head_truncations_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbReloadFailures
expr: increase(prometheus_tsdb_reloads_failures_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB reload failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: node-exporter
rules:
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of memory (instance {{ $labels.instance }})"
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualDiskReadRate
expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual disk read rate (instance {{ $labels.instance }})"
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualDiskWriteRate
expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual disk write rate (instance {{ $labels.instance }})"
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)"
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of disk space (instance {{ $labels.instance }})"
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostDiskWillFillIn4Hours
expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
description: "Disk will fill in 4 hours at current write rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfInodes
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint ="/rootfs"} * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of inodes (instance {{ $labels.instance }})"
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual disk read latency (instance {{ $labels.instance }})"
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual disk write latency (instance {{ $labels.instance }})"
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Host high CPU load (instance {{ $labels.instance }})"
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# 1000 context switches is an arbitrary number.
# Alert threshold depends on nature of application.
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- alert: HostContextSwitching
expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 5000
for: 5m
labels:
severity: warning
annotations:
summary: "Host context switching (instance {{ $labels.instance }})"
description: "Context switching is growing on node (> 5000 / s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Host swap is filling up (instance {{ $labels.instance }})"
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: node_systemd_unit_state{state="failed"} == 1
for: 5m
labels:
severity: warning
annotations:
summary: "Host SystemD service crashed (instance {{ $labels.instance }})"
description: "SystemD service crashed\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "Host kernel version deviations (instance {{ $labels.instance }})"
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[5m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host OOM kill detected (instance {{ $labels.instance }})"
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: cadvisor
rules:
- alert: ContainerKilled
expr: time() - container_last_seen > 60
for: 5m
labels:
severity: warning
annotations:
summary: "Container killed (instance {{ $labels.instance }})"
description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# cAdvisor can sometimes consume a lot of CPU, so this alert will fire constantly.
# If you want to exclude it from this alert, just use: container_cpu_usage_seconds_total{name!=""}
- alert: ContainerCpuUsage
expr: (sum(rate(container_cpu_usage_seconds_total{image!=""}[3m])) BY (instance, name) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Container CPU usage (instance {{ $labels.instance }})"
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
- alert: ContainerMemoryUsage
expr: (sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Container Memory usage (instance {{ $labels.instance }})"
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ContainerVolumeUsage
expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Container Volume usage (instance {{ $labels.instance }})"
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ContainerVolumeIoUsage
expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Container Volume IO usage (instance {{ $labels.instance }})"
description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ContainerHighThrottleRate
expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "Container high throttle rate (instance {{ $labels.instance }})"
description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: k8s.rules
rules:
- expr: |
sum(rate(container_cpu_usage_seconds_total{image!="", container!="POD",namespace!=""}[5m])) by (namespace)
record: namespace:container_cpu_usage_seconds_total:sum_rate
- expr: |
sum by (namespace, pod, container) (
rate(container_cpu_usage_seconds_total{image!="", container!="POD",namespace!=""}[5m])
)
record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
- expr: |
sum(container_memory_usage_bytes{image!="", container!="POD",namespace!=""}) by (namespace)
record: namespace:container_memory_usage_bytes:sum
- expr: |
sum by (namespace, label_name) (
sum(rate(container_cpu_usage_seconds_total{image!="", container!="POD"}[5m])) by (namespace, pod)
* on (namespace, pod) group_left(label_name)
label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
)
record: namespace_name:container_cpu_usage_seconds_total:sum_rate
- expr: |
sum by (namespace, label_name) (
sum(container_memory_usage_bytes{image!="", container!="POD"}) by (pod, namespace)
* on (namespace, pod) group_left(label_name)
label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
)
record: namespace_name:container_memory_usage_bytes:sum
- expr: |
sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests_memory_bytes{component="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
* on (namespace, pod) group_left(label_name)
label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
)
record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
- expr: |
sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests_cpu_cores{component="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
* on (namespace, pod) group_left(label_name)
label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
)
record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
- expr: |
sum(
label_replace(
label_replace(
kube_pod_owner{component="kube-state-metrics", owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)"
) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{component="kube-state-metrics"},
"workload", "$1", "owner_name", "(.*)"
)
) by (namespace, workload, pod)
labels:
workload_type: deployment
record: mixin_pod_workload
- expr: |
sum(
label_replace(
kube_pod_owner{component="kube-state-metrics", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
)
) by (namespace, workload, pod)
labels:
workload_type: daemonset
record: mixin_pod_workload
- expr: |
sum(
label_replace(
kube_pod_owner{component="kube-state-metrics", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
)
) by (namespace, workload, pod)
labels:
workload_type: statefulset
record: mixin_pod_workload
- name: kube-scheduler.rules
rules:
- expr: |
histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.99"
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
- expr: |
histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.99"
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
- expr: |
histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.99"
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.9"
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.9"
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.9"
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.5"
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.5"
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.5"
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
- name: kube-apiserver.rules
rules:
- expr: |
histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.99"
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.9"
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.5"
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
- name: node.rules
rules:
- expr: sum(min(kube_pod_info) by (node))
record: ':kube_pod_info_node_count:'
- expr: |
max(label_replace(kube_pod_info{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
record: 'node_namespace_pod:kube_pod_info:'
- expr: |
count by (node) (sum by (node, cpu) (
node_cpu_seconds_total{component="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
))
record: node:node_num_cpu:sum
- expr: |
1 - avg(rate(node_cpu_seconds_total{component="node-exporter",mode="idle"}[1m]))
record: :node_cpu_utilization:avg1m
- expr: |
1 - avg by (node) (
rate(node_cpu_seconds_total{component="node-exporter",mode="idle"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:)
record: node:node_cpu_utilization:avg1m
- expr: |
node:node_cpu_utilization:avg1m
*
node:node_num_cpu:sum
/
scalar(sum(node:node_num_cpu:sum))
record: node:cluster_cpu_utilization:ratio
- expr: |
sum(node_load1{component="node-exporter"})
/
sum(node:node_num_cpu:sum)
record: ':node_cpu_saturation_load1:'
- expr: |
sum by (node) (
node_load1{component="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
/
node:node_num_cpu:sum
record: 'node:node_cpu_saturation_load1:'
- expr: |
1 -
sum(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"})
/
sum(node_memory_MemTotal_bytes{component="node-exporter"})
record: ':node_memory_utilization:'
- expr: |
sum(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"})
record: :node_memory_MemFreeCachedBuffers_bytes:sum
- expr: |
sum(node_memory_MemTotal_bytes{component="node-exporter"})
record: :node_memory_MemTotal_bytes:sum
- expr: |
sum by (node) (
(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"})
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_memory_bytes_available:sum
- expr: |
sum by (node) (
node_memory_MemTotal_bytes{component="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_memory_bytes_total:sum
- expr: |
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
/
node:node_memory_bytes_total:sum
record: node:node_memory_utilization:ratio
- expr: |
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
/
scalar(sum(node:node_memory_bytes_total:sum))
record: node:cluster_memory_utilization:ratio
- expr: |
1e3 * sum(
(rate(node_vmstat_pgpgin{component="node-exporter"}[1m])
+ rate(node_vmstat_pgpgout{component="node-exporter"}[1m]))
)
record: :node_memory_swap_io_bytes:sum_rate
- expr: |
1 -
sum by (node) (
(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"})
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
/
sum by (node) (
node_memory_MemTotal_bytes{component="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: 'node:node_memory_utilization:'
- expr: |
1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
record: 'node:node_memory_utilization_2:'
- expr: |
1e3 * sum by (node) (
(rate(node_vmstat_pgpgin{component="node-exporter"}[1m])
+ rate(node_vmstat_pgpgout{component="node-exporter"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_memory_swap_io_bytes:sum_rate
- expr: |
avg(irate(node_disk_io_time_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
record: :node_disk_utilization:avg_irate
- expr: |
avg by (node) (
irate(node_disk_io_time_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_disk_utilization:avg_irate
- expr: |
avg(irate(node_disk_io_time_weighted_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
record: :node_disk_saturation:avg_irate
- expr: |
avg by (node) (
irate(node_disk_io_time_weighted_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_disk_saturation:avg_irate
- expr: |
max by (namespace, nodename, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
- node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
/ node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
record: 'node:node_filesystem_usage:'
- expr: |
max by (namespace, nodename, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
record: 'node:node_filesystem_avail:'
- expr: |
sum(irate(node_network_receive_bytes_total{component="node-exporter",device!~"veth.+"}[1m])) +
sum(irate(node_network_transmit_bytes_total{component="node-exporter",device!~"veth.+"}[1m]))
record: :node_net_utilization:sum_irate
- expr: |
sum by (node) (
(irate(node_network_receive_bytes_total{component="node-exporter",device!~"veth.+"}[1m]) +
irate(node_network_transmit_bytes_total{component="node-exporter",device!~"veth.+"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_net_utilization:sum_irate
- expr: |
sum(irate(node_network_receive_drop_total{component="node-exporter",device!~"veth.+"}[1m])) +
sum(irate(node_network_transmit_drop_total{component="node-exporter",device!~"veth.+"}[1m]))
record: :node_net_saturation:sum_irate
- expr: |
sum by (node) (
(irate(node_network_receive_drop_total{component="node-exporter",device!~"veth.+"}[1m]) +
irate(node_network_transmit_drop_total{component="node-exporter",device!~"veth.+"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_net_saturation:sum_irate
- expr: |
max(
max(
kube_pod_info{component="kube-state-metrics", host_ip!=""}
) by (node, host_ip)
* on (host_ip) group_right (node)
label_replace(
(max(node_filesystem_files{component="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
)
) by (node)
record: 'node:node_inodes_total:'
- expr: |
max(
max(
kube_pod_info{component="kube-state-metrics", host_ip!=""}
) by (node, host_ip)
* on (host_ip) group_right (node)
label_replace(
(max(node_filesystem_files_free{component="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
)
) by (node)
record: 'node:node_inodes_free:'
- name: cluster.rules
rules:
# Total number of CPU cores in the cluster.
- expr: |
sum(node:node_num_cpu:sum)
record: cluster:cpu_total
# Cluster-wide CPU usage rate in percent.
- expr: |
sum(node:cluster_cpu_utilization:ratio * 100)
record: cluster:cpu_usage_rate
# Cluster-wide total RAM in bytes.
- expr: |
sum(node:node_memory_bytes_total:sum)
record: cluster:memory_total_bytes
# Cluster-wide RAM usage in bytes.
- expr: |
sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum)
record: cluster:memory_usage_bytes
# Cluster-wide RAM usage rate in percent.
- expr: |
(sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum)) / scalar(sum(node:node_memory_bytes_total:sum)) * 100
record: cluster:memory_usage_rate
- name: kube-prometheus-node-recording.rules
rules:
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
(instance)
record: instance:node_cpu:rate:sum
- expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
BY (instance)
record: instance:node_filesystem_usage:sum
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
record: instance:node_network_transmit_bytes:rate:sum
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT
(cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
BY (instance, cpu)) BY (instance)
record: instance:node_cpu:ratio
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
record: cluster:node_cpu:sum_rate5m
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
BY (instance, cpu))
record: cluster:node_cpu:ratio
- name: kubernetes-absent
rules:
- alert: KubeAPIDown
annotations:
message: KubeAPI has disappeared from Prometheus target discovery.
expr: |
absent(up{job="kubernetes-apiservers"} == 1)
for: 15m
labels:
severity: critical
- alert: NodeExporterDown
annotations:
message: NodeExporter has disappeared from Prometheus target discovery.
expr: |
absent(up{component="node-exporter"} == 1)
for: 15m
labels:
severity: critical
- alert: PrometheusDown
annotations:
message: Prometheus has disappeared from Prometheus target discovery.
expr: |
absent(up{job="prometheus"} == 1)
for: 15m
labels:
severity: critical
- alert: cAdvisoDown
annotations:
message: cAdviso has disappeared from Prometheus target discovery.
expr: |
absent(up{job="kubernetes-nodes-cadvisor"} == 1)
for: 15m
labels:
severity: critical
- name: kubernetes-apps
rules:
- alert: KubePodCrashLooping
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
expr: |
rate(kube_pod_container_status_restarts_total{component="kube-state-metrics"}[15m]) * 60 * 5 > 0
for: 1h
labels:
severity: critical
- alert: KubePodNotReady
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than an hour.
expr: |
sum by (namespace, pod) (kube_pod_status_phase{component="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
for: 1h
labels:
severity: critical
- alert: KubeDeploymentGenerationMismatch
annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match, this indicates that the Deployment has failed but has
not been rolled back.
expr: |
kube_deployment_status_observed_generation{component="kube-state-metrics"}
!=
kube_deployment_metadata_generation{component="kube-state-metrics"}
for: 15m
labels:
severity: critical
- alert: KubeDeploymentReplicasMismatch
annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
matched the expected number of replicas for longer than an hour.
expr: |
kube_deployment_spec_replicas{component="kube-state-metrics"}
!=
kube_deployment_status_replicas_available{component="kube-state-metrics"}
for: 1h
labels:
severity: critical
- alert: KubeStatefulSetReplicasMismatch
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
not matched the expected number of replicas for longer than 15 minutes.
expr: |
kube_statefulset_status_replicas_ready{component="kube-state-metrics"}
!=
kube_statefulset_status_replicas{component="kube-state-metrics"}
for: 15m
labels:
severity: critical
- alert: KubeStatefulSetGenerationMismatch
annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has
not been rolled back.
expr: |
kube_statefulset_status_observed_generation{component="kube-state-metrics"}
!=
kube_statefulset_metadata_generation{component="kube-state-metrics"}
for: 15m
labels:
severity: critical
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
expr: |
max without (revision) (
kube_statefulset_status_current_revision{component="kube-state-metrics"}
unless
kube_statefulset_status_update_revision{component="kube-state-metrics"}
)
*
(
kube_statefulset_replicas{component="kube-state-metrics"}
!=
kube_statefulset_status_replicas_updated{component="kube-state-metrics"}
)
for: 15m
labels:
severity: critical
- alert: KubeDaemonSetRolloutStuck
annotations:
message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace
}}/{{ $labels.daemonset }} are scheduled and ready.
expr: |
kube_daemonset_status_number_ready{component="kube-state-metrics"}
/
kube_daemonset_status_desired_number_scheduled{component="kube-state-metrics"} * 100 < 100
for: 15m
labels:
severity: critical
- alert: KubeDaemonSetNotScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
expr: |
kube_daemonset_status_desired_number_scheduled{component="kube-state-metrics"}
-
kube_daemonset_status_current_number_scheduled{component="kube-state-metrics"} > 0
for: 10m
labels:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
expr: |
kube_daemonset_status_number_misscheduled{component="kube-state-metrics"} > 0
for: 10m
labels:
severity: warning
- alert: KubeCronJobRunning
annotations:
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more
than 1h to complete.
expr: |
time() - kube_cronjob_next_schedule_time{component="kube-state-metrics"} > 3600
for: 1h
labels:
severity: warning
- alert: KubeJobCompletion
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
than one hour to complete.
expr: |
kube_job_spec_completions{component="kube-state-metrics"} - kube_job_status_succeeded{component="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
- alert: KubeJobFailed
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
expr: |
kube_job_status_failed{component="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
- name: kubernetes-resources
rules:
- alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for Pods and cannot
tolerate node failure.
expr: |
sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
/
sum(node:node_num_cpu:sum)
>
(count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
for: 5m
labels:
severity: warning
- alert: KubeMemOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for Pods and cannot
tolerate node failure.
expr: |
sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
/
sum(node_memory_MemTotal_bytes)
>
(count(node:node_num_cpu:sum)-1)
/
count(node:node_num_cpu:sum)
for: 5m
labels:
severity: warning
- alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for Namespaces.
expr: |
sum(kube_resourcequota{component="kube-state-metrics", type="hard", resource="cpu"})
/
sum(node:node_num_cpu:sum)
> 1.5
for: 5m
labels:
severity: warning
- alert: KubeMemOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for Namespaces.
expr: |
sum(kube_resourcequota{component="kube-state-metrics", type="hard", resource="memory"})
/
sum(node_memory_MemTotal_bytes{component="node-exporter"})
> 1.5
for: 5m
labels:
severity: warning
- alert: KubeQuotaExceeded
annotations:
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
}}% of its {{ $labels.resource }} quota.
expr: |
100 * kube_resourcequota{component="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{component="kube-state-metrics", type="hard"} > 0)
> 90
for: 15m
labels:
severity: warning
- name: kubernetes-storage
rules:
- alert: KubePersistentVolumeUsageCritical
annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value
}}% free.
expr: |
100 * kubelet_volume_stats_available_bytes{job="kubelet"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet"}
< 3
for: 1m
labels:
severity: critical
- alert: KubePersistentVolumeFullInFourDays
annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
days. Currently {{ printf "%0.2f" $value }}% is available.
expr: |
100 * (
kubelet_volume_stats_available_bytes{job="kubelet"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet"}
) < 15
and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
for: 5m
labels:
severity: critical
- alert: KubePersistentVolumeErrors
annotations:
message: The persistent volume {{ $labels.persistentvolume }} has status {{
$labels.phase }}.
expr: |
kube_persistentvolume_status_phase{phase=~"Failed|Pending",component="kube-state-metrics"} > 0
for: 5m
labels:
severity: critical
- name: kubernetes-system
rules:
- alert: KubeNodeNotReady
annotations:
message: '{{ $labels.node }} has been unready for more than an hour.'
expr: |
kube_node_status_condition{component="kube-state-metrics",condition="Ready",status="true"} == 0
for: 1h
labels:
severity: warning
- alert: KubeVersionMismatch
annotations:
message: There are {{ $value }} different semantic versions of Kubernetes
components running.
expr: |
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!="kube-dns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
for: 1h
labels:
severity: warning
- alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }}% errors.'
expr: |
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
/
sum(rate(rest_client_requests_total[5m])) by (instance, job))
* 100 > 1
for: 15m
labels:
severity: warning
- alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }} errors / second.
expr: |
sum(rate(ksm_scrape_error_total{component="kube-state-metrics"}[5m])) by (instance, job) > 0.1
for: 15m
labels:
severity: warning
- alert: KubeletTooManyPods
annotations:
message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close
to the limit of 110.
expr: |
kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
for: 15m
labels:
severity: warning
- alert: KubeAPILatencyHigh
annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds
for {{ $labels.verb }} {{ $labels.resource }}.
expr: |
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="kubernetes-apiservers",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
for: 10m
labels:
severity: warning
- alert: KubeAPILatencyHigh
annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds
for {{ $labels.verb }} {{ $labels.resource }}.
expr: |
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="kubernetes-apiservers",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests.
expr: |
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m]))
/
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) * 100 > 3
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests.
expr: |
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m]))
/
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) * 100 > 1
for: 10m
labels:
severity: warning
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests for
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
expr: |
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
/
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) by (resource,subresource,verb) * 100 > 10
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests for
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
expr: |
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
/
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) by (resource,subresource,verb) * 100 > 5
for: 10m
labels:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring
in less than 7.0 days.
expr: |
apiserver_client_certificate_expiration_seconds_count{job="kubernetes-apiservers"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 604800
labels:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring
in less than 24.0 hours.
expr: |
apiserver_client_certificate_expiration_seconds_count{job="kubernetes-apiservers"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 86400
labels:
severity: critical
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
annotations:
message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
are out of sync.
expr: |
count_values("config_hash", alertmanager_config_hash{job="prometheus-alertmanager"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
for: 5m
labels:
severity: critical
- alert: AlertmanagerFailedReload
annotations:
message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
}}/{{ $labels.pod}}.
expr: |
alertmanager_config_last_reload_successful{job="prometheus-alertmanager"} == 0
for: 10m
labels:
severity: warning
- alert: AlertmanagerMembersInconsistent
annotations:
message: Alertmanager has not found all other members of the cluster.
expr: |
alertmanager_cluster_members{job="prometheus-alertmanager"}
!= on (service) GROUP_LEFT()
count by (service) (alertmanager_cluster_members{job="prometheus-alertmanager"})
for: 5m
labels:
severity: critical
- name: etcd.rules
rules:
- alert: EtcdDown
annotations:
message: Etcd instance is down on node {{ $labels.node }}.
expr: satellite_etcd_up == 0
for: 5m
labels:
severity: critical
- alert: EtcdUnhealthy
annotations:
message: Etcd cluster is unhealthy.
expr: satellite_etcd_health == 0
for: 1m
labels:
severity: critical
- name: sysctl.rules
rules:
- alert: BrNetfilterMissing
annotations:
message: Bridge netfilter is disabled on node {{ $labels.node }}
runbook_url: https://gravitational.com/gravity/docs/requirements/#br_netfilter-module
expr: max_over_time(satellite_sysctl_br_netfilter[1h]) unless satellite_sysctl_br_netfilter or satellite_sysctl_br_netfilter == 0
for: 5m
labels:
severity: critical
- alert: IPv4ForwardingMissing
annotations:
message: IPv4 forwarding is disabled on node {{ $labels.node }}
runbook_url: https://gravitational.com/gravity/docs/faq/#ipv4-forwarding
expr: max_over_time(satellite_sysctl_ipv4_forwarding[1h]) unless satellite_sysctl_ipv4_forwarding or satellite_sysctl_ipv4_forwarding == 0
for: 5m
labels:
severity: critical
- name: docker.rules
rules:
- alert: DockerDown
annotations:
message: Docker daemon is down on host {{ $labels.node }}
expr: satellite_docker_health == 0
for: 5m
labels:
severity: critical
- name: systemd.rules
rules:
- alert: SystemdDegraded
annotations:
message: Systemd on host {{ $labels.node }}
expr: satellite_systemd_health == 0
for: 5m
labels:
severity: critical
- alert: SystemdUnitDegraded
annotations:
message: Systemd unit {{ $labels.unit_name }} is degraded on host {{ $labels.node }}
expr: satellite_systemd_unit_health == 0
for: 5m
labels:
severity: critical
- name: general.rules
rules:
- alert: TargetDown
annotations:
message: '{{ $value }}% of the {{ $labels.job }} targets are down.'
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
for: 10m
labels:
severity: warning
- alert: Watchdog
annotations:
message: |
This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
expr: vector(1)
labels:
severity: none
- name: kube-prometheus-node-alerting.rules
rules:
- alert: NodeDiskRunningFull
annotations:
message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
}}/{{ $labels.pod }} will be full within the next 24 hours.
expr: |
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
for: 30m
labels:
severity: warning
- alert: NodeDiskRunningFull
annotations:
message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
}}/{{ $labels.pod }} will be full within the next 2 hours.
expr: |
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
for: 10m
labels:
severity: critical
- name: node-network
rules:
- alert: NetworkReceiveErrors
annotations:
message: Network interface "{{ $labels.device }}" showing receive errors on
node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
expr: |
rate(node_network_receive_errs_total{component="node-exporter",device!~"veth.+"}[2m]) > 0
for: 2m
labels:
severity: warning
- alert: NetworkTransmitErrors
annotations:
message: Network interface "{{ $labels.device }}" showing transmit errors
on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
expr: |
rate(node_network_transmit_errs_total{component="node-exporter",device!~"veth.+"}[2m]) > 0
for: 2m
labels:
severity: warning
- alert: NodeNetworkInterfaceFlapping
annotations:
message: Network interface "{{ $labels.device }}" changing it's up status
often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
expr: |
changes(node_network_up{component="node-exporter",device!~"veth.+"}[2m]) > 2
for: 2m
labels:
severity: warning
- name: prometheus.rules
rules:
- alert: PrometheusConfigReloadFailed
annotations:
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
summary: Reloading Prometheus' configuration failed
expr: |
prometheus_config_last_reload_successful{job="prometheus"} == 0
for: 10m
labels:
severity: warning
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
$labels.pod}}
summary: Prometheus' alert notification queue is running full
expr: |
predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus"}
for: 10m
labels:
severity: warning
- alert: PrometheusErrorSendingAlerts
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
summary: Errors while sending alert from Prometheus
expr: |
rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.01
for: 10m
labels:
severity: warning
- alert: PrometheusErrorSendingAlerts
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
summary: Errors while sending alerts from Prometheus
expr: |
rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.03
for: 10m
labels:
severity: critical
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
to any Alertmanagers
summary: Prometheus is not connected to any Alertmanagers
expr: |
prometheus_notifications_alertmanagers_discovered{job="prometheus"} < 1
for: 10m
labels:
severity: warning
- alert: PrometheusTSDBReloadsFailing
annotations:
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
reload failures over the last four hours.'
summary: Prometheus has issues reloading data blocks from disk
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[2h]) > 0
for: 12h
labels:
severity: warning
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
compaction failures over the last four hours.'
summary: Prometheus has issues compacting sample blocks
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[2h]) > 0
for: 12h
labels:
severity: warning
- alert: PrometheusTSDBWALCorruptions
annotations:
description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
log (WAL).'
summary: Prometheus write-ahead log is corrupted
expr: |
prometheus_tsdb_wal_corruptions_total{job="prometheus"} > 0
for: 4h
labels:
severity: warning
- alert: PrometheusNotIngestingSamples
annotations:
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting
samples.
summary: Prometheus isn't ingesting samples
expr: |
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0
for: 10m
labels:
severity: warning
- alert: PrometheusTargetScrapesDuplicate
annotations:
description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected
due to duplicate timestamps but different values'
summary: Prometheus has many samples rejected
expr: |
increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
for: 10m
labels:
severity: warning
# groups:
# - name: Instances
# rules:
# - alert: InstanceDown
# expr: up == 0
# for: 5m
# labels:
# severity: page
# annotations:
# description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'
# summary: 'Instance {{ $labels.instance }} down'
## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use alerting_rules.yml
alerts: {}
## Records configuration
## Ref: https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/
recording_rules.yml: {}
## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use recording_rules.yml
rules: {}
prometheus.yml:
rule_files:
- /etc/config/recording_rules.yml
- /etc/config/alerting_rules.yml
## Below two files are DEPRECATED will be removed from this default values file
- /etc/config/rules
- /etc/config/alerts
scrape_configs:
- job_name: prometheus
static_configs:
- targets:
- localhost:9090
# A scrape configuration for running Prometheus on a Kubernetes cluster.
# This uses separate scrape configs for cluster components (i.e. API server, node)
# and services to allow each to use different authentication configs.
#
# Kubernetes labels will be added as Prometheus labels on metrics via the
# `labelmap` relabeling action.
# Scrape config for API servers.
#
# Kubernetes exposes API servers as endpoints to the default/kubernetes
# service so this uses `endpoints` role and uses relabelling to only keep
# the endpoints associated with the default/kubernetes service using the
# default named port `https`. This works for single API server deployments as
# well as HA API server deployments.
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
# Default to scraping over https. If required, just disable this or change to
# `http`.
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# If your node certificates are self-signed or use a different CA to the
# master CA, then disable certificate verification below. Note that
# certificate verification is an integral part of a secure infrastructure
# so this should only be disabled in a controlled environment. You can
# disable certificate verification by uncommenting the line below.
#
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# Keep only the default/kubernetes service endpoints for the https port. This
# will add targets for each API server which Kubernetes adds an endpoint to
# the default/kubernetes service.
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
- job_name: 'kubernetes-nodes'
# Default to scraping over https. If required, just disable this or change to
# `http`.
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# If your node certificates are self-signed or use a different CA to the
# master CA, then disable certificate verification below. Note that
# certificate verification is an integral part of a secure infrastructure
# so this should only be disabled in a controlled environment. You can
# disable certificate verification by uncommenting the line below.
#
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/$1/proxy/metrics
- job_name: 'kubernetes-nodes-cadvisor'
# Default to scraping over https. If required, just disable this or change to
# `http`.
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# If your node certificates are self-signed or use a different CA to the
# master CA, then disable certificate verification below. Note that
# certificate verification is an integral part of a secure infrastructure
# so this should only be disabled in a controlled environment. You can
# disable certificate verification by uncommenting the line below.
#
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
# This configuration will work only on kubelet 1.7.3+
# As the scrape endpoints for cAdvisor have changed
# if you are using older version you need to change the replacement to
# replacement: /api/v1/nodes/$1:4194/proxy/metrics
# more info here https://github.com/coreos/prometheus-operator/issues/633
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
# Scrape config for service endpoints.
#
# The relabeling allows the actual service scrape endpoint to be configured
# via the following annotations:
#
# * `prometheus.io/scrape`: Only scrape services that have a value of `true`
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
# to set this to `https` & most likely set the `tls_config` of the scrape config.
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
# * `prometheus.io/port`: If the metrics are exposed on a different port to the
# service then set this appropriately.
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
- source_labels: [__meta_kubernetes_pod_node_name]
action: replace
target_label: kubernetes_node
# Scrape config for slow service endpoints; same as above, but with a larger
# timeout and a larger interval
#
# The relabeling allows the actual service scrape endpoint to be configured
# via the following annotations:
#
# * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true`
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
# to set this to `https` & most likely set the `tls_config` of the scrape config.
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
# * `prometheus.io/port`: If the metrics are exposed on a different port to the
# service then set this appropriately.
- job_name: 'kubernetes-service-endpoints-slow'
scrape_interval: 5m
scrape_timeout: 30s
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape_slow]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
- source_labels: [__meta_kubernetes_pod_node_name]
action: replace
target_label: kubernetes_node
- job_name: 'prometheus-pushgateway'
honor_labels: true
kubernetes_sd_configs:
- role: service
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
action: keep
regex: pushgateway
# Example scrape config for probing services via the Blackbox Exporter.
#
# The relabeling allows the actual service scrape endpoint to be configured
# via the following annotations:
#
# * `prometheus.io/probe`: Only probe services that have a value of `true`
- job_name: 'kubernetes-services'
metrics_path: /probe
params:
module: [http_2xx]
kubernetes_sd_configs:
- role: service
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
action: keep
regex: true
- source_labels: [__address__]
target_label: __param_target
- target_label: __address__
replacement: blackbox
- source_labels: [__param_target]
target_label: instance
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
target_label: kubernetes_name
# Example scrape config for pods
#
# The relabeling allows the actual pod scrape endpoint to be configured via the
# following annotations:
#
# * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
# * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
- source_labels: [__meta_kubernetes_pod_phase]
regex: Pending|Succeeded|Failed
action: drop
# Example Scrape config for pods which should be scraped slower. An useful example
# would be stackriver-exporter which querys an API on every scrape of the pod
#
# The relabeling allows the actual pod scrape endpoint to be configured via the
# following annotations:
#
# * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true`
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
# * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
- job_name: 'kubernetes-pods-slow'
scrape_interval: 5m
scrape_timeout: 30s
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
- source_labels: [__meta_kubernetes_pod_phase]
regex: Pending|Succeeded|Failed
action: drop
# adds additional scrape configs to prometheus.yml
# must be a string so you have to add a | after extraScrapeConfigs:
# example adds prometheus-blackbox-exporter scrape config
extraScrapeConfigs:
# - job_name: 'prometheus-blackbox-exporter'
# metrics_path: /probe
# params:
# module: [http_2xx]
# static_configs:
# - targets:
# - https://example.com
# relabel_configs:
# - source_labels: [__address__]
# target_label: __param_target
# - source_labels: [__param_target]
# target_label: instance
# - target_label: __address__
# replacement: prometheus-blackbox-exporter:9115
# Adds option to add alert_relabel_configs to avoid duplicate alerts in alertmanager
# useful in H/A prometheus with different external labels but the same alerts
alertRelabelConfigs:
# alert_relabel_configs:
# - source_labels: [dc]
# regex: (.+)\d+
# target_label: dc
networkPolicy:
## Enable creation of NetworkPolicy resources.
##
enabled: true
# Force namespace of namespaced resources
forceNamespace: null