rbac:
|
|
create: true
|
|
|
|
podSecurityPolicy:
|
|
enabled: false
|
|
|
|
imagePullSecrets:
|
|
# - name: "image-pull-secret"
|
|
|
|
## Define serviceAccount names for components. Defaults to component's fully qualified name.
|
|
##
|
|
serviceAccounts:
|
|
alertmanager:
|
|
create: true
|
|
name:
|
|
annotations: {}
|
|
nodeExporter:
|
|
create: true
|
|
name:
|
|
annotations: {}
|
|
pushgateway:
|
|
create: true
|
|
name:
|
|
annotations: {}
|
|
kubeStateMetrics:
|
|
create: true
|
|
name:
|
|
annotations: {}
|
|
server:
|
|
create: true
|
|
name:
|
|
annotations: {}
|
|
|
|
alertmanager:
|
|
## If false, alertmanager will not be installed
|
|
##
|
|
enabled: true
|
|
|
|
## Use a ClusterRole (and ClusterRoleBinding)
|
|
## - If set to false - we define a Role and RoleBinding in the defined namespaces ONLY
|
|
## This makes alertmanager work for users who do not have ClusterAdmin privileges but want alertmanager to operate on their own namespaces instead of cluster-wide.
|
|
useClusterRole: true
|
|
|
|
## Set to a rolename to use an existing role, skipping role creation, but still creating the serviceaccount and rolebinding to the rolename set here.
|
|
useExistingRole: false
|
|
|
|
## alertmanager container name
|
|
##
|
|
name: alertmanager
|
|
|
|
## alertmanager container image
|
|
##
|
|
image:
|
|
repository: prom/alertmanager
|
|
tag: v0.21.0
|
|
pullPolicy: IfNotPresent
|
|
|
|
## alertmanager priorityClassName
|
|
##
|
|
priorityClassName: ""
|
|
|
|
## Additional alertmanager container arguments
|
|
##
|
|
extraArgs: {}
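## Example (illustrative sketch; assumes each key/value is passed to the
## container as a --key=value flag, e.g. the alertmanager --log.level flag):
# log.level: debug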
|
|
|
|
## Additional InitContainers to initialize the pod
|
|
##
|
|
extraInitContainers: []
|
|
|
|
## The URL prefix at which the container can be accessed. Useful when the '-web.external-url' includes a slug,
## so that the various internal URLs remain accessible as they are in the default case.
|
|
## (Optional)
|
|
prefixURL: ""
|
|
|
|
## External URL which can access alertmanager
|
|
baseURL: "http://localhost:9093"
|
|
|
|
## Additional alertmanager container environment variable
|
|
## For instance to add a http_proxy
|
|
##
|
|
extraEnv: {}
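## Example (illustrative sketch; assumes extraEnv is rendered as a plain
## name/value map; the proxy address is a placeholder):
# HTTP_PROXY: "http://proxy.example.com:3128"
# HTTPS_PROXY: "http://proxy.example.com:3128"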
|
|
|
|
## Additional alertmanager Secret mounts
|
|
# Defines additional mounts with secrets. Secrets must be manually created in the namespace.
|
|
extraSecretMounts: []
|
|
# - name: secret-files
|
|
# mountPath: /etc/secrets
|
|
# subPath: ""
|
|
# secretName: alertmanager-secret-files
|
|
# readOnly: true
|
|
|
|
## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.alertmanager.configMapOverrideName}}
|
|
## Defining configMapOverrideName will cause templates/alertmanager-configmap.yaml
|
|
## to NOT generate a ConfigMap resource
|
|
##
|
|
configMapOverrideName: ""
|
|
|
|
## The name of a secret in the same kubernetes namespace which contains the Alertmanager config
|
|
## Defining configFromSecret will cause templates/alertmanager-configmap.yaml
|
|
## to NOT generate a ConfigMap resource
|
|
##
|
|
configFromSecret: ""
|
|
|
|
## The configuration file name to be loaded to alertmanager
|
|
## Must match the key within configuration loaded from ConfigMap/Secret
|
|
##
|
|
configFileName: alertmanager.yml
|
|
|
|
ingress:
|
|
## If true, alertmanager Ingress will be created
|
|
##
|
|
enabled: false
|
|
|
|
## alertmanager Ingress annotations
|
|
##
|
|
annotations: {}
|
|
# kubernetes.io/ingress.class: nginx
|
|
# kubernetes.io/tls-acme: 'true'
|
|
|
|
## alertmanager Ingress additional labels
|
|
##
|
|
extraLabels: {}
|
|
|
|
## alertmanager Ingress hostnames with optional path
|
|
## Must be provided if Ingress is enabled
|
|
##
|
|
hosts: []
|
|
# - alertmanager.domain.com
|
|
# - domain.com/alertmanager
|
|
|
|
## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
|
|
extraPaths: []
|
|
# - path: /*
|
|
# backend:
|
|
# serviceName: ssl-redirect
|
|
# servicePort: use-annotation
|
|
|
|
## alertmanager Ingress TLS configuration
|
|
## Secrets must be manually created in the namespace
|
|
##
|
|
tls: []
|
|
# - secretName: prometheus-alerts-tls
|
|
# hosts:
|
|
# - alertmanager.domain.com
|
|
|
|
## Alertmanager Deployment Strategy type
|
|
# strategy:
|
|
# type: Recreate
|
|
|
|
## Node tolerations for alertmanager scheduling to nodes with taints
|
|
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
|
|
##
|
|
tolerations: []
|
|
# - key: "key"
|
|
# operator: "Equal|Exists"
|
|
# value: "value"
|
|
# effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"
|
|
|
|
## Node labels for alertmanager pod assignment
|
|
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
|
|
##
|
|
nodeSelector: {}
|
|
|
|
## Pod affinity
|
|
##
|
|
affinity: {}
|
|
|
|
## PodDisruptionBudget settings
|
|
## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
|
|
##
|
|
podDisruptionBudget:
|
|
enabled: false
|
|
maxUnavailable: 1
|
|
|
|
## Use an alternate scheduler, e.g. "stork".
|
|
## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/
|
|
##
|
|
# schedulerName:
|
|
|
|
persistentVolume:
|
|
## If true, alertmanager will create/use a Persistent Volume Claim
|
|
## If false, use emptyDir
|
|
##
|
|
enabled: true
|
|
|
|
## alertmanager data Persistent Volume access modes
|
|
## Must match those of existing PV or dynamic provisioner
|
|
## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/
|
|
##
|
|
accessModes:
|
|
- ReadWriteOnce
|
|
|
|
## alertmanager data Persistent Volume Claim annotations
|
|
##
|
|
annotations: {}
|
|
|
|
## alertmanager data Persistent Volume existing claim name
|
|
## Requires alertmanager.persistentVolume.enabled: true
|
|
## If defined, PVC must be created manually before volume will be bound
|
|
existingClaim: ""
|
|
|
|
## alertmanager data Persistent Volume mount root path
|
|
##
|
|
mountPath: /data
|
|
|
|
## alertmanager data Persistent Volume size
|
|
##
|
|
size: 2Gi
|
|
|
|
## alertmanager data Persistent Volume Storage Class
|
|
## If defined, storageClassName: <storageClass>
|
|
## If set to "-", storageClassName: "", which disables dynamic provisioning
|
|
## If undefined (the default) or set to null, no storageClassName spec is
|
|
## set, choosing the default provisioner. (gp2 on AWS, standard on
|
|
## GKE, AWS & OpenStack)
|
|
##
|
|
# storageClass: "-"
|
|
|
|
## alertmanager data Persistent Volume Binding Mode
|
|
## If defined, volumeBindingMode: <volumeBindingMode>
|
|
## If undefined (the default) or set to null, no volumeBindingMode spec is
|
|
## set, choosing the default mode.
|
|
##
|
|
# volumeBindingMode: ""
|
|
|
|
## Subdirectory of alertmanager data Persistent Volume to mount
|
|
## Useful if the volume's root directory is not empty
|
|
##
|
|
subPath: ""
|
|
|
|
## Annotations to be added to alertmanager pods
|
|
##
|
|
podAnnotations: {}
|
|
## Tell prometheus to use a specific set of alertmanager pods
|
|
## instead of all alertmanager pods found in the same namespace
|
|
## Useful if you deploy multiple releases within the same namespace
|
|
##
|
|
## prometheus.io/probe: alertmanager-teamA
|
|
|
|
## Labels to be added to Prometheus AlertManager pods
|
|
##
|
|
podLabels: {}
|
|
|
|
## Specify whether a Pod Security Policy for alertmanager must be created
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
|
|
##
|
|
podSecurityPolicy:
|
|
annotations: {}
|
|
## Specify pod annotations
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
|
|
##
|
|
# seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
|
|
# seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
|
|
# apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
|
|
|
|
## Use a StatefulSet if replicaCount needs to be greater than 1 (see below)
|
|
##
|
|
replicaCount: 1
|
|
|
|
## Annotations to be added to deployment
|
|
##
|
|
deploymentAnnotations: {}
|
|
|
|
statefulSet:
|
|
## If true, use a statefulset instead of a deployment for pod management.
|
|
## This allows scaling replicas to more than 1 pod
|
|
##
|
|
enabled: false
|
|
|
|
annotations: {}
|
|
labels: {}
|
|
podManagementPolicy: OrderedReady
|
|
|
|
## Alertmanager headless service to use for the statefulset
|
|
##
|
|
headless:
|
|
annotations: {}
|
|
labels: {}
|
|
|
|
## Enable peer mesh service endpoints for the HA Alertmanager
|
|
## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md
|
|
enableMeshPeer: false
|
|
|
|
servicePort: 80
|
|
|
|
## alertmanager resource requests and limits
|
|
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
|
|
##
|
|
resources: {}
|
|
# limits:
|
|
# cpu: 10m
|
|
# memory: 32Mi
|
|
# requests:
|
|
# cpu: 10m
|
|
# memory: 32Mi
|
|
|
|
## Security context to be added to alertmanager pods
|
|
##
|
|
securityContext:
|
|
runAsUser: 65534
|
|
runAsNonRoot: true
|
|
runAsGroup: 65534
|
|
fsGroup: 65534
|
|
|
|
service:
|
|
annotations: {}
|
|
labels: {}
|
|
clusterIP: ""
|
|
|
|
## Enable peer mesh service endpoints for the HA Alertmanager
## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md
# enableMeshPeer: true
|
|
|
|
## List of IP addresses at which the alertmanager service is available
|
|
## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
|
|
##
|
|
externalIPs: []
|
|
|
|
loadBalancerIP: ""
|
|
loadBalancerSourceRanges: []
|
|
servicePort: 80
|
|
# nodePort: 30000
|
|
sessionAffinity: None
|
|
type: ClusterIP
|
|
|
|
## Monitors ConfigMap changes and POSTs to a URL
|
|
## Ref: https://github.com/jimmidyson/configmap-reload
|
|
##
|
|
configmapReload:
|
|
prometheus:
|
|
## If false, the configmap-reload container will not be deployed
|
|
##
|
|
enabled: true
|
|
|
|
## configmap-reload container name
|
|
##
|
|
name: configmap-reload
|
|
|
|
## configmap-reload container image
|
|
##
|
|
image:
|
|
repository: jimmidyson/configmap-reload
|
|
tag: v0.4.0
|
|
pullPolicy: IfNotPresent
|
|
|
|
## Additional configmap-reload container arguments
|
|
##
|
|
extraArgs: {}
|
|
## Additional configmap-reload volume directories
|
|
##
|
|
extraVolumeDirs: []
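## Example (illustrative sketch; assumes each entry becomes an additional
## --volume-dir flag and should match a mountPath such as the
## extraConfigmapMounts example below):
# - /etc/alerts.d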
|
|
|
|
|
|
## Additional configmap-reload mounts
|
|
##
|
|
extraConfigmapMounts: []
|
|
# - name: prometheus-alerts
|
|
# mountPath: /etc/alerts.d
|
|
# subPath: ""
|
|
# configMap: prometheus-alerts
|
|
# readOnly: true
|
|
|
|
|
|
## configmap-reload resource requests and limits
|
|
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
|
|
##
|
|
resources: {}
|
|
alertmanager:
|
|
## If false, the configmap-reload container will not be deployed
|
|
##
|
|
enabled: true
|
|
|
|
## configmap-reload container name
|
|
##
|
|
name: configmap-reload
|
|
|
|
## configmap-reload container image
|
|
##
|
|
image:
|
|
repository: jimmidyson/configmap-reload
|
|
tag: v0.4.0
|
|
pullPolicy: IfNotPresent
|
|
|
|
## Additional configmap-reload container arguments
|
|
##
|
|
extraArgs: {}
|
|
## Additional configmap-reload volume directories
|
|
##
|
|
extraVolumeDirs: []
|
|
|
|
|
|
## Additional configmap-reload mounts
|
|
##
|
|
extraConfigmapMounts: []
|
|
# - name: prometheus-alerts
|
|
# mountPath: /etc/alerts.d
|
|
# subPath: ""
|
|
# configMap: prometheus-alerts
|
|
# readOnly: true
|
|
|
|
|
|
## configmap-reload resource requests and limits
|
|
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
|
|
##
|
|
resources: {}
|
|
|
|
kubeStateMetrics:
|
|
## If false, kube-state-metrics will not be installed
|
|
##
|
|
enabled: true
|
|
|
|
## kube-state-metrics container name
|
|
##
|
|
name: kube-state-metrics
|
|
|
|
image:
|
|
repository: quay.io/coreos/kube-state-metrics
|
|
tag: v1.9.7
|
|
pullPolicy: IfNotPresent
|
|
|
|
podSecurityPolicy:
|
|
annotations: {}
|
|
additionalVolumes: []
|
|
|
|
imagePullSecrets: []
|
|
# - name: "image-pull-secret"
|
|
|
|
# If set to true, this will deploy kube-state-metrics as a StatefulSet and the data
|
|
# will be automatically sharded across <.Values.replicas> pods using the built-in
|
|
# autodiscovery feature: https://github.com/kubernetes/kube-state-metrics#automated-sharding
|
|
# This is an experimental feature and there are no stability guarantees.
|
|
autosharding:
|
|
enabled: false
|
|
|
|
replicas: 1
|
|
|
|
service:
|
|
port: 8080
|
|
# Default to clusterIP for backward compatibility
|
|
type: ClusterIP
|
|
nodePort: 0
|
|
loadBalancerIP: ""
|
|
annotations: {}
|
|
|
|
customLabels: {}
|
|
|
|
hostNetwork: false
|
|
|
|
securityContext:
|
|
enabled: true
|
|
runAsGroup: 65534
|
|
runAsUser: 65534
|
|
fsGroup: 65534
|
|
|
|
## Node labels for pod assignment
|
|
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
|
|
nodeSelector: {}
|
|
|
|
## Affinity settings for pod assignment
|
|
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
|
|
affinity: {}
|
|
|
|
## Tolerations for pod assignment
|
|
## Ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
|
|
tolerations: []
|
|
|
|
# Annotations to be added to the pod
|
|
podAnnotations: {}
|
|
|
|
## Assign a PriorityClassName to pods if set
|
|
# priorityClassName: ""
|
|
|
|
# Ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/
|
|
podDisruptionBudget: {}
|
|
|
|
# Available collectors for kube-state-metrics. By default all available
|
|
# collectors are enabled.
|
|
collectors:
|
|
certificatesigningrequests: true
|
|
configmaps: true
|
|
cronjobs: true
|
|
daemonsets: true
|
|
deployments: true
|
|
endpoints: true
|
|
horizontalpodautoscalers: true
|
|
ingresses: true
|
|
jobs: true
|
|
limitranges: true
|
|
mutatingwebhookconfigurations: true
|
|
namespaces: true
|
|
networkpolicies: true
|
|
nodes: true
|
|
persistentvolumeclaims: true
|
|
persistentvolumes: true
|
|
poddisruptionbudgets: true
|
|
pods: true
|
|
replicasets: true
|
|
replicationcontrollers: true
|
|
resourcequotas: true
|
|
secrets: true
|
|
services: true
|
|
statefulsets: true
|
|
storageclasses: true
|
|
validatingwebhookconfigurations: true
|
|
verticalpodautoscalers: false
|
|
volumeattachments: true
|
|
|
|
nodeExporter:
|
|
## If false, node-exporter will not be installed
|
|
##
|
|
enabled: true
|
|
|
|
## If true, node-exporter pods share the host network namespace
|
|
##
|
|
hostNetwork: false
|
|
|
|
## If true, node-exporter pods share the host PID namespace
|
|
##
|
|
hostPID: false
|
|
|
|
## node-exporter container name
|
|
##
|
|
name: node-exporter
|
|
|
|
## node-exporter container image
|
|
##
|
|
image:
|
|
repository: prom/node-exporter
|
|
tag: v1.0.1
|
|
pullPolicy: IfNotPresent
|
|
|
|
## Specify if a Pod Security Policy for node-exporter must be created
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
|
|
##
|
|
podSecurityPolicy:
|
|
annotations: {}
|
|
## Specify pod annotations
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
|
|
##
|
|
# seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
|
|
# seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
|
|
# apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
|
|
|
|
## node-exporter priorityClassName
|
|
##
|
|
priorityClassName: ""
|
|
|
|
## Custom Update Strategy
|
|
##
|
|
updateStrategy:
|
|
type: RollingUpdate
|
|
|
|
## Additional node-exporter container arguments
|
|
##
|
|
extraArgs: {}
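## Example (illustrative sketch; assumes keys map to node_exporter flags such
## as --collector.textfile.directory; the path matches the extraHostPathMounts
## example below):
# collector.textfile.directory: /srv/txt_collector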
|
|
|
|
## Additional InitContainers to initialize the pod
|
|
##
|
|
extraInitContainers: []
|
|
|
|
## Additional node-exporter hostPath mounts
|
|
##
|
|
extraHostPathMounts: []
|
|
# - name: textfile-dir
|
|
# mountPath: /srv/txt_collector
|
|
# hostPath: /var/lib/node-exporter
|
|
# readOnly: true
|
|
# mountPropagation: HostToContainer
|
|
|
|
extraConfigmapMounts: []
|
|
# - name: certs-configmap
|
|
# mountPath: /prometheus
|
|
# configMap: certs-configmap
|
|
# readOnly: true
|
|
|
|
## Node tolerations for node-exporter scheduling to nodes with taints
|
|
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
|
|
##
|
|
tolerations: []
|
|
# - key: "key"
|
|
# operator: "Equal|Exists"
|
|
# value: "value"
|
|
# effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"
|
|
|
|
## Node labels for node-exporter pod assignment
|
|
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
|
|
##
|
|
nodeSelector: {}
|
|
|
|
## Annotations to be added to node-exporter pods
|
|
##
|
|
podAnnotations: {}
|
|
|
|
## Labels to be added to node-exporter pods
|
|
##
|
|
pod:
|
|
labels: {}
|
|
|
|
## PodDisruptionBudget settings
|
|
## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
|
|
##
|
|
podDisruptionBudget:
|
|
enabled: false
|
|
maxUnavailable: 1
|
|
|
|
## node-exporter resource limits & requests
|
|
## Ref: https://kubernetes.io/docs/user-guide/compute-resources/
|
|
##
|
|
resources: {}
|
|
# limits:
|
|
# cpu: 200m
|
|
# memory: 50Mi
|
|
# requests:
|
|
# cpu: 100m
|
|
# memory: 30Mi
|
|
|
|
## Security context to be added to node-exporter pods
|
|
##
|
|
securityContext: {}
|
|
# runAsUser: 0
|
|
|
|
service:
|
|
annotations:
|
|
prometheus.io/scrape: "true"
|
|
labels: {}
|
|
|
|
# Exposed as a headless service:
|
|
# https://kubernetes.io/docs/concepts/services-networking/service/#headless-services
|
|
clusterIP: None
|
|
|
|
## List of IP addresses at which the node-exporter service is available
|
|
## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
|
|
##
|
|
externalIPs: []
|
|
|
|
hostPort: 9100
|
|
loadBalancerIP: ""
|
|
loadBalancerSourceRanges: []
|
|
servicePort: 9100
|
|
type: ClusterIP
|
|
|
|
server:
|
|
## Prometheus server container name
|
|
##
|
|
enabled: true
|
|
|
|
## Use a ClusterRole (and ClusterRoleBinding)
|
|
## - If set to false - we define a RoleBinding in the defined namespaces ONLY
|
|
##
|
|
## NB: because we need a Role with nonResourceURLs ("/metrics"), you must have someone with cluster-admin privileges define this role for you before running with this setting enabled.
|
|
## This makes prometheus work for users who do not have ClusterAdmin privileges but want prometheus to operate on their own namespaces instead of cluster-wide.
|
|
##
|
|
## You MUST also set namespaces to the ones you have access to and want monitored by Prometheus.
|
|
##
|
|
# useExistingClusterRoleName: nameofclusterrole
|
|
|
|
## namespaces to monitor (instead of monitoring all - clusterwide). Needed if you want to run without Cluster-admin privileges.
|
|
# namespaces:
|
|
# - yournamespace
|
|
|
|
name: server
|
|
sidecarContainers:
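## Example (illustrative sketch; assumes entries are rendered verbatim as
## additional containers in the server pod; image and command are placeholders):
# - name: logging-sidecar
#   image: busybox:1.32
#   command: ["sh", "-c", "tail -f /dev/null"]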
|
|
|
|
## Prometheus server container image
|
|
##
|
|
image:
|
|
repository: prom/prometheus
|
|
tag: v2.21.0
|
|
pullPolicy: IfNotPresent
|
|
|
|
## prometheus server priorityClassName
|
|
##
|
|
priorityClassName: ""
|
|
|
|
## EnableServiceLinks indicates whether information about services should be injected
|
|
## into pod's environment variables, matching the syntax of Docker links.
|
|
## WARNING: the field is unsupported and will be skipped in K8s prior to v1.13.0.
|
|
##
|
|
enableServiceLinks: true
|
|
|
|
## The URL prefix at which the container can be accessed. Useful when the '-web.external-url' includes a slug,
## so that the various internal URLs remain accessible as they are in the default case.
|
|
## (Optional)
|
|
prefixURL: ""
|
|
|
|
## External URL at which the Prometheus server can be accessed
## May be the same as the Ingress host name
|
|
baseURL: ""
|
|
|
|
## Additional server container environment variables
|
|
##
|
|
## You specify this manually like you would a raw deployment manifest.
|
|
## This means you can bind in environment variables from secrets.
|
|
##
|
|
## e.g. static environment variable:
|
|
## - name: DEMO_GREETING
|
|
## value: "Hello from the environment"
|
|
##
|
|
## e.g. secret environment variable:
|
|
## - name: USERNAME
|
|
## valueFrom:
|
|
## secretKeyRef:
|
|
## name: mysecret
|
|
## key: username
|
|
env: []
|
|
|
|
extraFlags:
|
|
- web.enable-lifecycle
|
|
## web.enable-admin-api flag controls access to the administrative HTTP API which includes functionality such as
|
|
## deleting time series. This is disabled by default.
|
|
# - web.enable-admin-api
|
|
##
|
|
## storage.tsdb.no-lockfile flag disables the lock file in the data directory
|
|
# - storage.tsdb.no-lockfile
|
|
##
|
|
## storage.tsdb.wal-compression flag enables compression of the write-ahead log (WAL)
|
|
# - storage.tsdb.wal-compression
|
|
|
|
## Path to a configuration file on prometheus server container FS
|
|
configPath: /etc/config/prometheus.yml
|
|
|
|
global:
|
|
## How frequently to scrape targets by default
|
|
##
|
|
scrape_interval: 1m
|
|
## How long until a scrape request times out
|
|
##
|
|
scrape_timeout: 10s
|
|
## How frequently to evaluate rules
|
|
##
|
|
evaluation_interval: 1m
|
|
## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write
|
|
##
|
|
remoteWrite: []
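## Example (illustrative sketch following the Prometheus remote_write schema;
## the endpoint URL and secret path are placeholders):
# - url: https://remote-storage.example.com/api/v1/write
#   basic_auth:
#     username: prometheus
#     password_file: /etc/secrets/remote-write-password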
|
|
## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_read
|
|
##
|
|
remoteRead: []
|
|
|
|
## Additional Prometheus server container arguments
|
|
##
|
|
extraArgs: {}
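## Example (illustrative sketch; assumes each key/value is passed to the
## container as a --key=value flag, mirroring Prometheus flags such as
## --log.level and --query.timeout):
# log.level: debug
# query.timeout: 2m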
|
|
|
|
## Additional InitContainers to initialize the pod
|
|
##
|
|
extraInitContainers: []
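## Example (illustrative sketch; entries are plain Kubernetes container specs;
## the image and command here are placeholders):
# - name: init-sleep
#   image: busybox:1.32
#   command: ["sh", "-c", "echo initializing && sleep 5"]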
|
|
|
|
## Additional Prometheus server Volume mounts
|
|
##
|
|
extraVolumeMounts: []
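## Example (illustrative sketch; pairs with the extraVolumes example below):
# - name: extra-rules
#   mountPath: /etc/extra-rules
#   readOnly: true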
|
|
|
|
## Additional Prometheus server Volumes
|
|
##
|
|
extraVolumes: []
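## Example (illustrative sketch; the ConfigMap name is a placeholder and pairs
## with the extraVolumeMounts example above):
# - name: extra-rules
#   configMap:
#     name: prometheus-extra-rules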
|
|
|
|
## Additional Prometheus server hostPath mounts
|
|
##
|
|
extraHostPathMounts: []
|
|
# - name: certs-dir
|
|
# mountPath: /etc/kubernetes/certs
|
|
# subPath: ""
|
|
# hostPath: /etc/kubernetes/certs
|
|
# readOnly: true
|
|
|
|
extraConfigmapMounts: []
|
|
# - name: certs-configmap
|
|
# mountPath: /prometheus
|
|
# subPath: ""
|
|
# configMap: certs-configmap
|
|
# readOnly: true
|
|
|
|
## Additional Prometheus server Secret mounts
|
|
# Defines additional mounts with secrets. Secrets must be manually created in the namespace.
|
|
extraSecretMounts: []
|
|
# - name: secret-files
|
|
# mountPath: /etc/secrets
|
|
# subPath: ""
|
|
# secretName: prom-secret-files
|
|
# readOnly: true
|
|
|
|
## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.server.configMapOverrideName}}
|
|
## Defining configMapOverrideName will cause templates/server-configmap.yaml
|
|
## to NOT generate a ConfigMap resource
|
|
##
|
|
configMapOverrideName: ""
|
|
|
|
ingress:
|
|
## If true, Prometheus server Ingress will be created
|
|
##
|
|
enabled: false
|
|
|
|
## Prometheus server Ingress annotations
|
|
##
|
|
annotations: {}
|
|
# kubernetes.io/ingress.class: nginx
|
|
# kubernetes.io/tls-acme: 'true'
|
|
|
|
## Prometheus server Ingress additional labels
|
|
##
|
|
extraLabels: {}
|
|
|
|
## Prometheus server Ingress hostnames with optional path
|
|
## Must be provided if Ingress is enabled
|
|
##
|
|
hosts: []
|
|
# - prometheus.domain.com
|
|
# - domain.com/prometheus
|
|
|
|
## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
|
|
extraPaths: []
|
|
# - path: /*
|
|
# backend:
|
|
# serviceName: ssl-redirect
|
|
# servicePort: use-annotation
|
|
|
|
## Prometheus server Ingress TLS configuration
|
|
## Secrets must be manually created in the namespace
|
|
##
|
|
tls: []
|
|
# - secretName: prometheus-server-tls
|
|
# hosts:
|
|
# - prometheus.domain.com
|
|
|
|
## Server Deployment Strategy type
|
|
# strategy:
|
|
# type: Recreate
|
|
|
|
## hostAliases allows adding entries to /etc/hosts inside the containers
|
|
hostAliases: []
|
|
# - ip: "127.0.0.1"
|
|
# hostnames:
|
|
# - "example.com"
|
|
|
|
## Node tolerations for server scheduling to nodes with taints
|
|
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
|
|
##
|
|
tolerations: []
|
|
# - key: "key"
|
|
# operator: "Equal|Exists"
|
|
# value: "value"
|
|
# effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"
|
|
|
|
## Node labels for Prometheus server pod assignment
|
|
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
|
|
##
|
|
nodeSelector: {}
|
|
|
|
## Pod affinity
|
|
##
|
|
affinity: {}
|
|
|
|
## PodDisruptionBudget settings
|
|
## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
|
|
##
|
|
podDisruptionBudget:
|
|
enabled: false
|
|
maxUnavailable: 1
|
|
|
|
## Use an alternate scheduler, e.g. "stork".
|
|
## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/
|
|
##
|
|
# schedulerName:
|
|
|
|
persistentVolume:
|
|
## If true, Prometheus server will create/use a Persistent Volume Claim
|
|
## If false, use emptyDir
|
|
##
|
|
enabled: true
|
|
|
|
## Prometheus server data Persistent Volume access modes
|
|
## Must match those of existing PV or dynamic provisioner
|
|
## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/
|
|
##
|
|
accessModes:
|
|
- ReadWriteOnce
|
|
|
|
## Prometheus server data Persistent Volume annotations
|
|
##
|
|
annotations: {}
|
|
|
|
## Prometheus server data Persistent Volume existing claim name
|
|
## Requires server.persistentVolume.enabled: true
|
|
## If defined, PVC must be created manually before volume will be bound
|
|
existingClaim: ""
|
|
|
|
## Prometheus server data Persistent Volume mount root path
|
|
##
|
|
mountPath: /data
|
|
|
|
## Prometheus server data Persistent Volume size
|
|
##
|
|
size: 8Gi
|
|
|
|
## Prometheus server data Persistent Volume Storage Class
|
|
## If defined, storageClassName: <storageClass>
|
|
## If set to "-", storageClassName: "", which disables dynamic provisioning
|
|
## If undefined (the default) or set to null, no storageClassName spec is
|
|
## set, choosing the default provisioner. (gp2 on AWS, standard on
|
|
## GKE, AWS & OpenStack)
|
|
##
|
|
# storageClass: "-"
|
|
|
|
## Prometheus server data Persistent Volume Binding Mode
|
|
## If defined, volumeBindingMode: <volumeBindingMode>
|
|
## If undefined (the default) or set to null, no volumeBindingMode spec is
|
|
## set, choosing the default mode.
|
|
##
|
|
# volumeBindingMode: ""
|
|
|
|
## Subdirectory of Prometheus server data Persistent Volume to mount
|
|
## Useful if the volume's root directory is not empty
|
|
##
|
|
subPath: ""
|
|
|
|
emptyDir:
|
|
sizeLimit: ""
|
|
|
|
## Annotations to be added to Prometheus server pods
|
|
##
|
|
podAnnotations: {}
|
|
# iam.amazonaws.com/role: prometheus
|
|
|
|
## Labels to be added to Prometheus server pods
|
|
##
|
|
podLabels: {}
|
|
|
|
## Prometheus AlertManager configuration
|
|
##
|
|
alertmanagers: []
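## Example (illustrative sketch; assumes the list is inserted as Prometheus
## alerting.alertmanagers configuration; the target below is a placeholder for
## your release's Alertmanager service):
# - static_configs:
#     - targets:
#         - my-release-prometheus-alertmanager:80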
|
|
|
|
## Specify whether a Pod Security Policy for the Prometheus server must be created
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
|
|
##
|
|
podSecurityPolicy:
|
|
annotations: {}
|
|
## Specify pod annotations
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
|
|
##
|
|
# seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
|
|
# seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
|
|
# apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
|
|
|
|
## Use a StatefulSet if replicaCount needs to be greater than 1 (see below)
|
|
##
|
|
replicaCount: 1
|
|
|
|
## Annotations to be added to deployment
|
|
##
|
|
deploymentAnnotations: {}
|
|
|
|
statefulSet:
|
|
## If true, use a statefulset instead of a deployment for pod management.
|
|
## This allows scaling replicas to more than 1 pod
|
|
##
|
|
enabled: false
|
|
|
|
annotations: {}
|
|
labels: {}
|
|
podManagementPolicy: OrderedReady
|
|
|
|
## Prometheus server headless service to use for the statefulset
|
|
##
|
|
headless:
|
|
annotations: {}
|
|
labels: {}
|
|
servicePort: 80
|
|
## Enable gRPC port on service to allow auto discovery with thanos-querier
|
|
gRPC:
|
|
enabled: false
|
|
servicePort: 10901
|
|
# nodePort: 10901
|
|
|
|
## Prometheus server readiness and liveness probe initial delay and timeout
|
|
## Ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/
|
|
##
|
|
readinessProbeInitialDelay: 30
|
|
readinessProbePeriodSeconds: 5
|
|
readinessProbeTimeout: 30
|
|
readinessProbeFailureThreshold: 3
|
|
readinessProbeSuccessThreshold: 1
|
|
livenessProbeInitialDelay: 30
|
|
livenessProbePeriodSeconds: 15
|
|
livenessProbeTimeout: 30
|
|
livenessProbeFailureThreshold: 3
|
|
livenessProbeSuccessThreshold: 1
|
|
|
|
## Prometheus server resource requests and limits
|
|
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
|
|
##
|
|
resources: {}
|
|
# limits:
|
|
# cpu: 500m
|
|
# memory: 512Mi
|
|
# requests:
|
|
# cpu: 500m
|
|
# memory: 512Mi
|
|
|
|
## Vertical Pod Autoscaler config
|
|
## Ref: https://github.com/kubernetes/autoscaler/tree/master/vertical-pod-autoscaler
|
|
verticalAutoscaler:
|
|
## If true, a VPA object will be created for the controller (either StatefulSet or Deployment, based on the above configs)
|
|
enabled: false
|
|
# updateMode: "Auto"
|
|
# containerPolicies:
|
|
# - containerName: 'prometheus-server'
|
|
|
|
## Security context to be added to server pods
|
|
##
|
|
securityContext:
|
|
runAsUser: 65534
|
|
runAsNonRoot: true
|
|
runAsGroup: 65534
|
|
fsGroup: 65534
|
|
|
|
service:
|
|
annotations: {}
|
|
labels: {}
|
|
clusterIP: ""
|
|
|
|
## List of IP addresses at which the Prometheus server service is available
|
|
## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
|
|
##
|
|
externalIPs: []
|
|
|
|
loadBalancerIP: ""
|
|
loadBalancerSourceRanges: []
|
|
servicePort: 9090
|
|
sessionAffinity: None
|
|
type: ClusterIP
|
|
|
|
## Enable gRPC port on service to allow auto discovery with thanos-querier
|
|
gRPC:
|
|
enabled: false
|
|
servicePort: 10901
|
|
# nodePort: 10901
|
|
|
|
## If using a statefulSet (statefulSet.enabled=true), configure the
|
|
## service to connect to a specific replica to have a consistent view
|
|
## of the data.
|
|
statefulsetReplica:
|
|
enabled: false
|
|
replica: 0
|
|
|
|
## Prometheus server pod termination grace period
|
|
##
|
|
terminationGracePeriodSeconds: 300
|
|
|
|
## Prometheus data retention period (default if not specified is 15 days)
|
|
##
|
|
retention: "15d"
|
|
|
|
pushgateway:
|
|
## If false, pushgateway will not be installed
|
|
##
|
|
enabled: true
|
|
|
|
## Use an alternate scheduler, e.g. "stork".
|
|
## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/
|
|
##
|
|
# schedulerName:
|
|
|
|
## pushgateway container name
|
|
##
|
|
name: pushgateway
|
|
|
|
## pushgateway container image
|
|
##
|
|
image:
|
|
repository: prom/pushgateway
|
|
tag: v1.2.0
|
|
pullPolicy: IfNotPresent
|
|
|
|
## pushgateway priorityClassName
|
|
##
|
|
priorityClassName: ""
|
|
|
|
## Additional pushgateway container arguments
|
|
##
|
|
## for example: persistence.file: /data/pushgateway.data
|
|
extraArgs: {}
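## Example (illustrative sketch; assumes each key/value is passed as a
## --key=value flag, enabling on-disk persistence together with
## pushgateway.persistentVolume.enabled):
# persistence.file: /data/pushgateway.data
# persistence.interval: 5m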
|
|
|
|
## Additional InitContainers to initialize the pod
|
|
##
|
|
extraInitContainers: []
|
|
|
|
ingress:
|
|
## If true, pushgateway Ingress will be created
|
|
##
|
|
enabled: false
|
|
|
|
## pushgateway Ingress annotations
|
|
##
|
|
annotations: {}
|
|
# kubernetes.io/ingress.class: nginx
|
|
# kubernetes.io/tls-acme: 'true'
|
|
|
|
## pushgateway Ingress hostnames with optional path
|
|
## Must be provided if Ingress is enabled
|
|
##
|
|
hosts: []
|
|
# - pushgateway.domain.com
|
|
# - domain.com/pushgateway
|
|
|
|
## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
|
|
extraPaths: []
|
|
# - path: /*
|
|
# backend:
|
|
# serviceName: ssl-redirect
|
|
# servicePort: use-annotation
|
|
|
|
## pushgateway Ingress TLS configuration
|
|
## Secrets must be manually created in the namespace
|
|
##
|
|
tls: []
|
|
# - secretName: prometheus-alerts-tls
|
|
# hosts:
|
|
# - pushgateway.domain.com
|
|
|
|
## Node tolerations for pushgateway scheduling to nodes with taints
|
|
## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
|
|
##
|
|
tolerations: []
|
|
# - key: "key"
|
|
# operator: "Equal|Exists"
|
|
# value: "value"
|
|
# effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"
|
|
|
|
## Node labels for pushgateway pod assignment
|
|
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
|
|
##
|
|
nodeSelector: {}
|
|
|
|
## Annotations to be added to pushgateway pods
|
|
##
|
|
podAnnotations: {}
|
|
|
|
## Labels to be added to pushgateway pods
|
|
##
|
|
podLabels: {}
|
|
|
|
## Specify whether a Pod Security Policy for pushgateway must be created
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
|
|
##
|
|
podSecurityPolicy:
|
|
annotations: {}
|
|
## Specify pod annotations
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
|
|
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
|
|
##
|
|
# seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
|
|
# seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
|
|
# apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
|
|
|
|
replicaCount: 1
|
|
|
|
## Annotations to be added to deployment
|
|
##
|
|
deploymentAnnotations: {}
|
|
|
|
## PodDisruptionBudget settings
|
|
## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
|
|
##
|
|
podDisruptionBudget:
|
|
enabled: false
|
|
maxUnavailable: 1
|
|
|
|
## pushgateway resource requests and limits
|
|
## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
|
|
##
|
|
resources: {}
|
|
# limits:
|
|
# cpu: 10m
|
|
# memory: 32Mi
|
|
# requests:
|
|
# cpu: 10m
|
|
# memory: 32Mi
|
|
|
|
## Security context to be added to push-gateway pods
|
|
##
|
|
securityContext:
|
|
runAsUser: 65534
|
|
runAsNonRoot: true
|
|
|
|
service:
|
|
annotations:
|
|
prometheus.io/probe: pushgateway
|
|
labels: {}
|
|
clusterIP: ""
|
|
|
|
## List of IP addresses at which the pushgateway service is available
|
|
## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
|
|
##
|
|
externalIPs: []
|
|
|
|
loadBalancerIP: ""
|
|
loadBalancerSourceRanges: []
|
|
servicePort: 9091
|
|
type: ClusterIP
|
|
|
|
## pushgateway Deployment Strategy type
|
|
# strategy:
|
|
# type: Recreate
|
|
|
|
persistentVolume:
|
|
## If true, pushgateway will create/use a Persistent Volume Claim
|
|
## If false, use emptyDir
|
|
##
|
|
enabled: false
|
|
|
|
## pushgateway data Persistent Volume access modes
|
|
## Must match those of existing PV or dynamic provisioner
|
|
## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/
|
|
##
|
|
accessModes:
|
|
- ReadWriteOnce
|
|
|
|
## pushgateway data Persistent Volume Claim annotations
|
|
##
|
|
annotations: {}
|
|
|
|
## pushgateway data Persistent Volume existing claim name
|
|
## Requires pushgateway.persistentVolume.enabled: true
|
|
## If defined, PVC must be created manually before volume will be bound
|
|
existingClaim: ""
|
|
|
|
## pushgateway data Persistent Volume mount root path
|
|
##
|
|
mountPath: /data
|
|
|
|
## pushgateway data Persistent Volume size
|
|
##
|
|
size: 2Gi
|
|
|
|
## pushgateway data Persistent Volume Storage Class
|
|
## If defined, storageClassName: <storageClass>
|
|
## If set to "-", storageClassName: "", which disables dynamic provisioning
|
|
## If undefined (the default) or set to null, no storageClassName spec is
|
|
## set, choosing the default provisioner. (gp2 on AWS, standard on
|
|
## GKE, AWS & OpenStack)
|
|
##
|
|
# storageClass: "-"
|
|
|
|
## pushgateway data Persistent Volume Binding Mode
|
|
## If defined, volumeBindingMode: <volumeBindingMode>
|
|
## If undefined (the default) or set to null, no volumeBindingMode spec is
|
|
## set, choosing the default mode.
|
|
##
|
|
# volumeBindingMode: ""
|
|
|
|
## Subdirectory of pushgateway data Persistent Volume to mount
|
|
## Useful if the volume's root directory is not empty
|
|
##
|
|
subPath: ""
|
|
|
|
|
|
## alertmanager ConfigMap entries
|
|
##
|
|
alertmanagerFiles:
|
|
alertmanager.yml:
|
|
global:
|
|
resolve_timeout: 30s
|
|
route:
|
|
group_by: ["alertname"]
|
|
group_wait: 5s
|
|
group_interval: 10s
|
|
repeat_interval: 999h
|
|
receiver: "default"
|
|
routes:
|
|
- receiver: "default"
|
|
group_by: []
|
|
match_re:
|
|
alertname: .*
|
|
continue: true
|
|
- receiver: "watchdog"
|
|
group_by: ["alertname", "instance"]
|
|
match_re:
|
|
alertname: Watchdog
|
|
continue: false
|
|
- receiver: "by-cluster-service"
|
|
group_by: ["alertname", "cluster", "service"]
|
|
match_re:
|
|
alertname: .*
|
|
continue: true
|
|
- receiver: "by-name"
|
|
group_by: [alertname]
|
|
match_re:
|
|
alertname: .*
|
|
continue: true
|
|
- receiver: "by-cluster"
|
|
group_by: [cluster]
|
|
match_re:
|
|
alertname: .*
|
|
continue: true
|
|
|
|
inhibit_rules:
|
|
- source_match:
|
|
severity: "critical"
|
|
target_match:
|
|
severity: "warning"
|
|
# Apply inhibition if the alertname and cluster are the same in both
|
|
equal: ["alertname", "cluster"]
|
|
|
|
receivers:
|
|
- name: "default"
|
|
- name: "watchdog"
|
|
- name: "by-cluster-service"
|
|
- name: "by-name"
|
|
- name: "by-cluster"
|
|
|
|
## Prometheus server ConfigMap entries
|
|
##
|
|
serverFiles:
|
|
|
|
## Alerts configuration
|
|
## Ref: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
|
|
|
|
#alerting_rules.yml: {}
|
|
alerting_rules.yml:
|
|
groups:
|
|
- name: prometheus
|
|
rules:
|
|
- alert: PrometheusJobMissing
|
|
expr: absent(up{job="prometheus"})
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus job missing (instance {{ $labels.instance }})"
|
|
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTargetMissing
|
|
expr: up == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus target missing (instance {{ $labels.instance }})"
|
|
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusAllTargetsMissing
|
|
expr: count by (job) (up) == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus all targets missing (instance {{ $labels.instance }})"
|
|
description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusConfigurationReloadFailure
|
|
expr: prometheus_config_last_reload_successful != 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})"
|
|
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTooManyRestarts
|
|
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
|
|
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusAlertmanagerConfigurationReloadFailure
|
|
expr: alertmanager_config_last_reload_successful != 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})"
|
|
description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusAlertmanagerConfigNotSynced
|
|
expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus AlertManager config not synced (instance {{ $labels.instance }})"
|
|
description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusNotConnectedToAlertmanager
|
|
expr: prometheus_notifications_alertmanagers_discovered < 1
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
|
|
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusRuleEvaluationFailures
|
|
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTemplateTextExpansionFailures
|
|
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusRuleEvaluationSlow
|
|
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus rule evaluation slow (instance {{ $labels.instance }})"
|
|
description: "Prometheus rule evaluation took more time than the scheduled interval. I indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusNotificationsBacklog
|
|
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
|
|
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusAlertmanagerNotificationFailing
|
|
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
|
|
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTargetEmpty
|
|
expr: prometheus_sd_discovered_targets == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus target empty (instance {{ $labels.instance }})"
|
|
description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTargetScrapingSlow
|
|
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
|
|
description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusLargeScrape
|
|
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus large scrape (instance {{ $labels.instance }})"
|
|
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTargetScrapeDuplicate
|
|
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus target scrape duplicate (instance {{ $labels.instance }})"
|
|
description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTsdbCheckpointCreationFailures
|
|
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTsdbCheckpointDeletionFailures
|
|
expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTsdbCompactionsFailed
|
|
expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTsdbHeadTruncationsFailed
|
|
expr: increase(prometheus_tsdb_head_truncations_failed_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTsdbReloadFailures
|
|
expr: increase(prometheus_tsdb_reloads_failures_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus TSDB reload failures (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTsdbWalCorruptions
|
|
expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: PrometheusTsdbWalTruncationsFailed
|
|
expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
|
|
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
|
|
- name: node-exporter
|
|
rules:
|
|
- alert: HostOutOfMemory
|
|
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host out of memory (instance {{ $labels.instance }})"
|
|
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostMemoryUnderMemoryPressure
|
|
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
|
|
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostUnusualNetworkThroughputIn
|
|
expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
|
|
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostUnusualNetworkThroughputOut
|
|
expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
|
|
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostUnusualDiskReadRate
|
|
expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host unusual disk read rate (instance {{ $labels.instance }})"
|
|
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostUnusualDiskWriteRate
|
|
expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host unusual disk write rate (instance {{ $labels.instance }})"
|
|
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
# Please add ignored mount points to the node_exporter parameters, e.g.:
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)"
|
|
- alert: HostOutOfDiskSpace
|
|
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host out of disk space (instance {{ $labels.instance }})"
|
|
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostDiskWillFillIn4Hours
|
|
expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
|
|
description: "Disk will fill in 4 hours at current write rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostOutOfInodes
|
|
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint ="/rootfs"} * 100 < 10
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host out of inodes (instance {{ $labels.instance }})"
|
|
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostUnusualDiskReadLatency
|
|
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host unusual disk read latency (instance {{ $labels.instance }})"
|
|
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostUnusualDiskWriteLatency
|
|
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host unusual disk write latency (instance {{ $labels.instance }})"
|
|
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostHighCpuLoad
|
|
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host high CPU load (instance {{ $labels.instance }})"
|
|
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
# 1000 context switches is an arbitrary number.
|
|
# Alert threshold depends on nature of application.
|
|
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
|
|
- alert: HostContextSwitching
|
|
expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 5000
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host context switching (instance {{ $labels.instance }})"
|
|
description: "Context switching is growing on node (> 5000 / s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostSwapIsFillingUp
|
|
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host swap is filling up (instance {{ $labels.instance }})"
|
|
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostSystemdServiceCrashed
|
|
expr: node_systemd_unit_state{state="failed"} == 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host SystemD service crashed (instance {{ $labels.instance }})"
|
|
description: "SystemD service crashed\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
|
|
|
|
- alert: HostKernelVersionDeviations
|
|
expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host kernel version deviations (instance {{ $labels.instance }})"
|
|
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: HostOomKillDetected
|
|
expr: increase(node_vmstat_oom_kill[5m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Host OOM kill detected (instance {{ $labels.instance }})"
|
|
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
|
|
- name: cadvisor
|
|
rules:
|
|
- alert: ContainerKilled
|
|
expr: time() - container_last_seen > 60
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container killed (instance {{ $labels.instance }})"
|
|
description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
# cAdvisor can sometimes consume a lot of CPU, so this alert will fire constantly.
|
|
# If you want to exclude it from this alert, just use: container_cpu_usage_seconds_total{name!=""}
|
|
- alert: ContainerCpuUsage
|
|
expr: (sum(rate(container_cpu_usage_seconds_total{image!=""}[3m])) BY (instance, name) * 100) > 80
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container CPU usage (instance {{ $labels.instance }})"
|
|
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
# See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
|
|
- alert: ContainerMemoryUsage
|
|
expr: (sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container Memory usage (instance {{ $labels.instance }})"
|
|
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: ContainerVolumeUsage
|
|
expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container Volume usage (instance {{ $labels.instance }})"
|
|
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: ContainerVolumeIoUsage
|
|
expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container Volume IO usage (instance {{ $labels.instance }})"
|
|
description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- alert: ContainerHighThrottleRate
|
|
expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container high throttle rate (instance {{ $labels.instance }})"
|
|
description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
|
|
- name: k8s.rules
|
|
rules:
|
|
- expr: |
|
|
sum(rate(container_cpu_usage_seconds_total{image!="", container!="POD",namespace!=""}[5m])) by (namespace)
|
|
record: namespace:container_cpu_usage_seconds_total:sum_rate
|
|
- expr: |
|
|
sum by (namespace, pod, container) (
|
|
rate(container_cpu_usage_seconds_total{image!="", container!="POD",namespace!=""}[5m])
|
|
)
|
|
record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
|
|
- expr: |
|
|
sum(container_memory_usage_bytes{image!="", container!="POD",namespace!=""}) by (namespace)
|
|
record: namespace:container_memory_usage_bytes:sum
|
|
- expr: |
|
|
sum by (namespace, label_name) (
|
|
sum(rate(container_cpu_usage_seconds_total{image!="", container!="POD"}[5m])) by (namespace, pod)
|
|
* on (namespace, pod) group_left(label_name)
|
|
label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
|
|
)
|
|
record: namespace_name:container_cpu_usage_seconds_total:sum_rate
|
|
- expr: |
|
|
sum by (namespace, label_name) (
|
|
sum(container_memory_usage_bytes{image!="", container!="POD"}) by (pod, namespace)
|
|
* on (namespace, pod) group_left(label_name)
|
|
label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
|
|
)
|
|
record: namespace_name:container_memory_usage_bytes:sum
|
|
- expr: |
|
|
sum by (namespace, label_name) (
|
|
sum(kube_pod_container_resource_requests_memory_bytes{component="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
|
|
* on (namespace, pod) group_left(label_name)
|
|
label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
|
|
)
|
|
record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
|
|
- expr: |
|
|
sum by (namespace, label_name) (
|
|
sum(kube_pod_container_resource_requests_cpu_cores{component="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
|
|
* on (namespace, pod) group_left(label_name)
|
|
label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
|
|
)
|
|
record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
|
|
- expr: |
|
|
sum(
|
|
label_replace(
|
|
label_replace(
|
|
kube_pod_owner{component="kube-state-metrics", owner_kind="ReplicaSet"},
|
|
"replicaset", "$1", "owner_name", "(.*)"
|
|
) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{component="kube-state-metrics"},
|
|
"workload", "$1", "owner_name", "(.*)"
|
|
)
|
|
) by (namespace, workload, pod)
|
|
labels:
|
|
workload_type: deployment
|
|
record: mixin_pod_workload
|
|
- expr: |
|
|
sum(
|
|
label_replace(
|
|
kube_pod_owner{component="kube-state-metrics", owner_kind="DaemonSet"},
|
|
"workload", "$1", "owner_name", "(.*)"
|
|
)
|
|
) by (namespace, workload, pod)
|
|
labels:
|
|
workload_type: daemonset
|
|
record: mixin_pod_workload
|
|
- expr: |
|
|
sum(
|
|
label_replace(
|
|
kube_pod_owner{component="kube-state-metrics", owner_kind="StatefulSet"},
|
|
"workload", "$1", "owner_name", "(.*)"
|
|
)
|
|
) by (namespace, workload, pod)
|
|
labels:
|
|
workload_type: statefulset
|
|
record: mixin_pod_workload
|
|
|
|
- name: kube-scheduler.rules
|
|
rules:
|
|
- expr: |
|
|
histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.99"
|
|
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.99"
|
|
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.99"
|
|
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.9"
|
|
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.9"
|
|
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.9"
|
|
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.5"
|
|
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.5"
|
|
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.5"
|
|
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
|
|
|
|
- name: kube-apiserver.rules
|
|
rules:
|
|
- expr: |
|
|
histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.99"
|
|
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.9"
|
|
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
|
|
- expr: |
|
|
histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06
|
|
labels:
|
|
quantile: "0.5"
|
|
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
|
|
|
|
- name: node.rules
|
|
rules:
|
|
- expr: sum(min(kube_pod_info) by (node))
|
|
record: ':kube_pod_info_node_count:'
|
|
- expr: |
|
|
max(label_replace(kube_pod_info{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
|
|
record: 'node_namespace_pod:kube_pod_info:'
|
|
- expr: |
|
|
count by (node) (sum by (node, cpu) (
|
|
node_cpu_seconds_total{component="node-exporter"}
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
))
|
|
record: node:node_num_cpu:sum
|
|
- expr: |
|
|
1 - avg(rate(node_cpu_seconds_total{component="node-exporter",mode="idle"}[1m]))
|
|
record: :node_cpu_utilization:avg1m
|
|
- expr: |
|
|
1 - avg by (node) (
|
|
rate(node_cpu_seconds_total{component="node-exporter",mode="idle"}[1m])
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:)
|
|
record: node:node_cpu_utilization:avg1m
|
|
- expr: |
|
|
node:node_cpu_utilization:avg1m
|
|
*
|
|
node:node_num_cpu:sum
|
|
/
|
|
scalar(sum(node:node_num_cpu:sum))
|
|
record: node:cluster_cpu_utilization:ratio
|
|
- expr: |
|
|
sum(node_load1{component="node-exporter"})
|
|
/
|
|
sum(node:node_num_cpu:sum)
|
|
record: ':node_cpu_saturation_load1:'
|
|
- expr: |
|
|
sum by (node) (
|
|
node_load1{component="node-exporter"}
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
/
|
|
node:node_num_cpu:sum
|
|
record: 'node:node_cpu_saturation_load1:'
|
|
- expr: |
|
|
1 -
|
|
sum(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"})
|
|
/
|
|
sum(node_memory_MemTotal_bytes{component="node-exporter"})
|
|
record: ':node_memory_utilization:'
|
|
- expr: |
|
|
sum(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"})
|
|
record: :node_memory_MemFreeCachedBuffers_bytes:sum
|
|
- expr: |
|
|
sum(node_memory_MemTotal_bytes{component="node-exporter"})
|
|
record: :node_memory_MemTotal_bytes:sum
|
|
- expr: |
|
|
sum by (node) (
|
|
(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"})
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: node:node_memory_bytes_available:sum
|
|
- expr: |
|
|
sum by (node) (
|
|
node_memory_MemTotal_bytes{component="node-exporter"}
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: node:node_memory_bytes_total:sum
|
|
- expr: |
|
|
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
|
|
/
|
|
node:node_memory_bytes_total:sum
|
|
record: node:node_memory_utilization:ratio
|
|
- expr: |
|
|
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
|
|
/
|
|
scalar(sum(node:node_memory_bytes_total:sum))
|
|
record: node:cluster_memory_utilization:ratio
|
|
- expr: |
|
|
1e3 * sum(
|
|
(rate(node_vmstat_pgpgin{component="node-exporter"}[1m])
|
|
+ rate(node_vmstat_pgpgout{component="node-exporter"}[1m]))
|
|
)
|
|
record: :node_memory_swap_io_bytes:sum_rate
|
|
- expr: |
|
|
1 -
|
|
sum by (node) (
|
|
(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"})
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
/
|
|
sum by (node) (
|
|
node_memory_MemTotal_bytes{component="node-exporter"}
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: 'node:node_memory_utilization:'
|
|
- expr: |
|
|
1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
|
|
record: 'node:node_memory_utilization_2:'
|
|
- expr: |
|
|
1e3 * sum by (node) (
|
|
(rate(node_vmstat_pgpgin{component="node-exporter"}[1m])
|
|
+ rate(node_vmstat_pgpgout{component="node-exporter"}[1m]))
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: node:node_memory_swap_io_bytes:sum_rate
|
|
- expr: |
|
|
avg(irate(node_disk_io_time_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
|
|
record: :node_disk_utilization:avg_irate
|
|
- expr: |
|
|
avg by (node) (
|
|
irate(node_disk_io_time_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: node:node_disk_utilization:avg_irate
|
|
- expr: |
|
|
avg(irate(node_disk_io_time_weighted_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
|
|
record: :node_disk_saturation:avg_irate
|
|
- expr: |
|
|
avg by (node) (
|
|
irate(node_disk_io_time_weighted_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: node:node_disk_saturation:avg_irate
|
|
- expr: |
|
|
max by (namespace, nodename, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
|
|
- node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
|
/ node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
|
record: 'node:node_filesystem_usage:'
|
|
- expr: |
|
|
max by (namespace, nodename, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
|
record: 'node:node_filesystem_avail:'
|
|
- expr: |
|
|
sum(irate(node_network_receive_bytes_total{component="node-exporter",device!~"veth.+"}[1m])) +
|
|
sum(irate(node_network_transmit_bytes_total{component="node-exporter",device!~"veth.+"}[1m]))
|
|
record: :node_net_utilization:sum_irate
|
|
- expr: |
|
|
sum by (node) (
|
|
(irate(node_network_receive_bytes_total{component="node-exporter",device!~"veth.+"}[1m]) +
|
|
irate(node_network_transmit_bytes_total{component="node-exporter",device!~"veth.+"}[1m]))
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: node:node_net_utilization:sum_irate
|
|
- expr: |
|
|
sum(irate(node_network_receive_drop_total{component="node-exporter",device!~"veth.+"}[1m])) +
|
|
sum(irate(node_network_transmit_drop_total{component="node-exporter",device!~"veth.+"}[1m]))
|
|
record: :node_net_saturation:sum_irate
|
|
- expr: |
|
|
sum by (node) (
|
|
(irate(node_network_receive_drop_total{component="node-exporter",device!~"veth.+"}[1m]) +
|
|
irate(node_network_transmit_drop_total{component="node-exporter",device!~"veth.+"}[1m]))
|
|
* on (namespace, pod) group_left(node)
|
|
node_namespace_pod:kube_pod_info:
|
|
)
|
|
record: node:node_net_saturation:sum_irate
|
|
- expr: |
|
|
max(
|
|
max(
|
|
kube_pod_info{component="kube-state-metrics", host_ip!=""}
|
|
) by (node, host_ip)
|
|
* on (host_ip) group_right (node)
|
|
label_replace(
|
|
(max(node_filesystem_files{component="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
|
|
)
|
|
) by (node)
|
|
record: 'node:node_inodes_total:'
|
|
- expr: |
|
|
max(
|
|
max(
|
|
kube_pod_info{component="kube-state-metrics", host_ip!=""}
|
|
) by (node, host_ip)
|
|
* on (host_ip) group_right (node)
|
|
label_replace(
|
|
(max(node_filesystem_files_free{component="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
|
|
)
|
|
) by (node)
|
|
record: 'node:node_inodes_free:'
|
|
- name: cluster.rules
|
|
rules:
|
|
# Total number of CPU cores in the cluster.
|
|
- expr: |
|
|
sum(node:node_num_cpu:sum)
|
|
record: cluster:cpu_total
|
|
# Cluster-wide CPU usage rate in percent.
|
|
- expr: |
|
|
sum(node:cluster_cpu_utilization:ratio * 100)
|
|
record: cluster:cpu_usage_rate
|
|
# Cluster-wide total RAM in bytes.
|
|
- expr: |
|
|
sum(node:node_memory_bytes_total:sum)
|
|
record: cluster:memory_total_bytes
|
|
# Cluster-wide RAM usage in bytes.
|
|
- expr: |
|
|
sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum)
|
|
record: cluster:memory_usage_bytes
|
|
# Cluster-wide RAM usage rate in percent.
|
|
- expr: |
|
|
(sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum)) / scalar(sum(node:node_memory_bytes_total:sum)) * 100
|
|
record: cluster:memory_usage_rate
|
|
- name: kube-prometheus-node-recording.rules
|
|
rules:
|
|
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
|
|
(instance)
|
|
record: instance:node_cpu:rate:sum
|
|
- expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
|
|
BY (instance)
|
|
record: instance:node_filesystem_usage:sum
|
|
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
|
|
record: instance:node_network_receive_bytes:rate:sum
|
|
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
|
|
record: instance:node_network_transmit_bytes:rate:sum
|
|
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT
|
|
(cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
|
|
BY (instance, cpu)) BY (instance)
|
|
record: instance:node_cpu:ratio
|
|
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
|
|
record: cluster:node_cpu:sum_rate5m
|
|
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total)
|
|
BY (instance, cpu))
|
|
record: cluster:node_cpu:ratio
|
|
- name: kubernetes-absent
|
|
rules:
|
|
- alert: KubeAPIDown
|
|
annotations:
|
|
message: KubeAPI has disappeared from Prometheus target discovery.
|
|
expr: |
|
|
absent(up{job="kubernetes-apiservers"} == 1)
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
|
|
- alert: NodeExporterDown
|
|
annotations:
|
|
message: NodeExporter has disappeared from Prometheus target discovery.
|
|
expr: |
|
|
absent(up{component="node-exporter"} == 1)
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: PrometheusDown
|
|
annotations:
|
|
message: Prometheus has disappeared from Prometheus target discovery.
|
|
expr: |
|
|
absent(up{job="prometheus"} == 1)
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: cAdvisorDown
|
|
annotations:
|
|
message: cAdvisor has disappeared from Prometheus target discovery.
|
|
expr: |
|
|
absent(up{job="kubernetes-nodes-cadvisor"} == 1)
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- name: kubernetes-apps
|
|
rules:
|
|
- alert: KubePodCrashLooping
|
|
annotations:
|
|
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
|
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
|
|
expr: |
|
|
rate(kube_pod_container_status_restarts_total{component="kube-state-metrics"}[15m]) * 60 * 5 > 0
|
|
for: 1h
|
|
labels:
|
|
severity: critical
|
|
- alert: KubePodNotReady
|
|
annotations:
|
|
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
|
state for longer than an hour.
|
|
expr: |
|
|
sum by (namespace, pod) (kube_pod_status_phase{component="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
|
|
for: 1h
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeDeploymentGenerationMismatch
|
|
annotations:
|
|
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
|
}} does not match; this indicates that the Deployment has failed but has
|
|
not been rolled back.
|
|
expr: |
|
|
kube_deployment_status_observed_generation{component="kube-state-metrics"}
|
|
!=
|
|
kube_deployment_metadata_generation{component="kube-state-metrics"}
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeDeploymentReplicasMismatch
|
|
annotations:
|
|
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
|
|
matched the expected number of replicas for longer than an hour.
|
|
expr: |
|
|
kube_deployment_spec_replicas{component="kube-state-metrics"}
|
|
!=
|
|
kube_deployment_status_replicas_available{component="kube-state-metrics"}
|
|
for: 1h
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeStatefulSetReplicasMismatch
|
|
annotations:
|
|
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
|
|
not matched the expected number of replicas for longer than 15 minutes.
|
|
expr: |
|
|
kube_statefulset_status_replicas_ready{component="kube-state-metrics"}
|
|
!=
|
|
kube_statefulset_status_replicas{component="kube-state-metrics"}
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeStatefulSetGenerationMismatch
|
|
annotations:
|
|
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
|
}} does not match; this indicates that the StatefulSet has failed but has
|
|
not been rolled back.
|
|
expr: |
|
|
kube_statefulset_status_observed_generation{component="kube-state-metrics"}
|
|
!=
|
|
kube_statefulset_metadata_generation{component="kube-state-metrics"}
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeStatefulSetUpdateNotRolledOut
|
|
annotations:
|
|
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
|
|
has not been rolled out.
|
|
expr: |
|
|
max without (revision) (
|
|
kube_statefulset_status_current_revision{component="kube-state-metrics"}
|
|
unless
|
|
kube_statefulset_status_update_revision{component="kube-state-metrics"}
|
|
)
|
|
*
|
|
(
|
|
kube_statefulset_replicas{component="kube-state-metrics"}
|
|
!=
|
|
kube_statefulset_status_replicas_updated{component="kube-state-metrics"}
|
|
)
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeDaemonSetRolloutStuck
|
|
annotations:
|
|
message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace
|
|
}}/{{ $labels.daemonset }} are scheduled and ready.
|
|
expr: |
|
|
kube_daemonset_status_number_ready{component="kube-state-metrics"}
|
|
/
|
|
kube_daemonset_status_desired_number_scheduled{component="kube-state-metrics"} * 100 < 100
|
|
for: 15m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeDaemonSetNotScheduled
|
|
annotations:
|
|
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
|
}} are not scheduled.'
|
|
expr: |
|
|
kube_daemonset_status_desired_number_scheduled{component="kube-state-metrics"}
|
|
-
|
|
kube_daemonset_status_current_number_scheduled{component="kube-state-metrics"} > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeDaemonSetMisScheduled
|
|
annotations:
|
|
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
|
}} are running where they are not supposed to run.'
|
|
expr: |
|
|
kube_daemonset_status_number_misscheduled{component="kube-state-metrics"} > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeCronJobRunning
|
|
annotations:
|
|
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more
|
|
than 1h to complete.
|
|
expr: |
|
|
time() - kube_cronjob_next_schedule_time{component="kube-state-metrics"} > 3600
|
|
for: 1h
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeJobCompletion
|
|
annotations:
|
|
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
|
|
than one hour to complete.
|
|
expr: |
|
|
kube_job_spec_completions{component="kube-state-metrics"} - kube_job_status_succeeded{component="kube-state-metrics"} > 0
|
|
for: 1h
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeJobFailed
|
|
annotations:
|
|
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
|
|
expr: |
|
|
kube_job_status_failed{component="kube-state-metrics"} > 0
|
|
for: 1h
|
|
labels:
|
|
severity: warning
|
|
|
|
- name: kubernetes-resources
|
|
rules:
|
|
- alert: KubeCPUOvercommit
|
|
annotations:
|
|
message: Cluster has overcommitted CPU resource requests for Pods and cannot
|
|
tolerate node failure.
|
|
expr: |
|
|
sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
|
|
/
|
|
sum(node:node_num_cpu:sum)
|
|
>
|
|
(count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeMemOvercommit
|
|
annotations:
|
|
message: Cluster has overcommitted memory resource requests for Pods and cannot
|
|
tolerate node failure.
|
|
expr: |
|
|
sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
|
|
/
|
|
sum(node_memory_MemTotal_bytes)
|
|
>
|
|
(count(node:node_num_cpu:sum)-1)
|
|
/
|
|
count(node:node_num_cpu:sum)
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeCPUOvercommit
|
|
annotations:
|
|
message: Cluster has overcommitted CPU resource requests for Namespaces.
|
|
expr: |
|
|
sum(kube_resourcequota{component="kube-state-metrics", type="hard", resource="cpu"})
|
|
/
|
|
sum(node:node_num_cpu:sum)
|
|
> 1.5
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeMemOvercommit
|
|
annotations:
|
|
message: Cluster has overcommitted memory resource requests for Namespaces.
|
|
expr: |
|
|
sum(kube_resourcequota{component="kube-state-metrics", type="hard", resource="memory"})
|
|
/
|
|
sum(node_memory_MemTotal_bytes{component="node-exporter"})
|
|
> 1.5
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeQuotaExceeded
|
|
annotations:
|
|
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
|
|
}}% of its {{ $labels.resource }} quota.
|
|
expr: |
|
|
100 * kube_resourcequota{component="kube-state-metrics", type="used"}
|
|
/ ignoring(instance, job, type)
|
|
(kube_resourcequota{component="kube-state-metrics", type="hard"} > 0)
|
|
> 90
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- name: kubernetes-storage
|
|
rules:
|
|
- alert: KubePersistentVolumeUsageCritical
|
|
annotations:
|
|
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
|
}} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value
|
|
}}% free.
|
|
expr: |
|
|
100 * kubelet_volume_stats_available_bytes{job="kubelet"}
|
|
/
|
|
kubelet_volume_stats_capacity_bytes{job="kubelet"}
|
|
< 3
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubePersistentVolumeFullInFourDays
|
|
annotations:
|
|
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
|
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
|
|
days. Currently {{ printf "%0.2f" $value }}% is available.
|
|
expr: |
|
|
100 * (
|
|
kubelet_volume_stats_available_bytes{job="kubelet"}
|
|
/
|
|
kubelet_volume_stats_capacity_bytes{job="kubelet"}
|
|
) < 15
|
|
and
|
|
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubePersistentVolumeErrors
|
|
annotations:
|
|
message: The persistent volume {{ $labels.persistentvolume }} has status {{
|
|
$labels.phase }}.
|
|
expr: |
|
|
kube_persistentvolume_status_phase{phase=~"Failed|Pending",component="kube-state-metrics"} > 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- name: kubernetes-system
|
|
rules:
|
|
- alert: KubeNodeNotReady
|
|
annotations:
|
|
message: '{{ $labels.node }} has been unready for more than an hour.'
|
|
expr: |
|
|
kube_node_status_condition{component="kube-state-metrics",condition="Ready",status="true"} == 0
|
|
for: 1h
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeVersionMismatch
|
|
annotations:
|
|
message: There are {{ $value }} different semantic versions of Kubernetes
|
|
components running.
|
|
expr: |
|
|
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!="kube-dns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
|
|
for: 1h
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeClientErrors
|
|
annotations:
|
|
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
|
|
}}' is experiencing {{ printf "%0.0f" $value }}% errors.
|
|
expr: |
|
|
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
|
|
/
|
|
sum(rate(rest_client_requests_total[5m])) by (instance, job))
|
|
* 100 > 1
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeClientErrors
|
|
annotations:
|
|
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
|
|
}}' is experiencing {{ printf "%0.0f" $value }} errors / second.
|
|
expr: |
|
|
sum(rate(ksm_scrape_error_total{component="kube-state-metrics"}[5m])) by (instance, job) > 0.1
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeletTooManyPods
|
|
annotations:
|
|
message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close
|
|
to the limit of 110.
|
|
expr: |
|
|
kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeAPILatencyHigh
|
|
annotations:
|
|
message: The API server has a 99th percentile latency of {{ $value }} seconds
|
|
for {{ $labels.verb }} {{ $labels.resource }}.
|
|
expr: |
|
|
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="kubernetes-apiservers",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeAPILatencyHigh
|
|
annotations:
|
|
message: The API server has a 99th percentile latency of {{ $value }} seconds
|
|
for {{ $labels.verb }} {{ $labels.resource }}.
|
|
expr: |
|
|
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="kubernetes-apiservers",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeAPIErrorsHigh
|
|
annotations:
|
|
message: API server is returning errors for {{ $value }}% of requests.
|
|
expr: |
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m]))
|
|
/
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) * 100 > 3
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeAPIErrorsHigh
|
|
annotations:
|
|
message: API server is returning errors for {{ $value }}% of requests.
|
|
expr: |
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m]))
|
|
/
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) * 100 > 1
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeAPIErrorsHigh
|
|
annotations:
|
|
message: API server is returning errors for {{ $value }}% of requests for
|
|
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
|
|
expr: |
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
|
|
/
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) by (resource,subresource,verb) * 100 > 10
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
- alert: KubeAPIErrorsHigh
|
|
annotations:
|
|
message: API server is returning errors for {{ $value }}% of requests for
|
|
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
|
|
expr: |
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
|
|
/
|
|
sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) by (resource,subresource,verb) * 100 > 5
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeClientCertificateExpiration
|
|
annotations:
|
|
message: A client certificate used to authenticate to the apiserver is expiring
|
|
in less than 7.0 days.
|
|
expr: |
|
|
apiserver_client_certificate_expiration_seconds_count{job="kubernetes-apiservers"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 604800
|
|
labels:
|
|
severity: warning
|
|
- alert: KubeClientCertificateExpiration
|
|
annotations:
|
|
message: A client certificate used to authenticate to the apiserver is expiring
|
|
in less than 24.0 hours.
|
|
expr: |
|
|
apiserver_client_certificate_expiration_seconds_count{job="kubernetes-apiservers"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 86400
|
|
labels:
|
|
severity: critical
|
|
- name: alertmanager.rules
|
|
rules:
|
|
- alert: AlertmanagerConfigInconsistent
|
|
annotations:
|
|
message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
|
|
is out of sync.
|
|
expr: |
|
|
count_values("config_hash", alertmanager_config_hash{job="prometheus-alertmanager"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- alert: AlertmanagerFailedReload
|
|
annotations:
|
|
message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
|
|
}}/{{ $labels.pod}}.
|
|
expr: |
|
|
alertmanager_config_last_reload_successful{job="prometheus-alertmanager"} == 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: AlertmanagerMembersInconsistent
|
|
annotations:
|
|
message: Alertmanager has not found all other members of the cluster.
|
|
expr: |
|
|
alertmanager_cluster_members{job="prometheus-alertmanager"}
|
|
!= on (service) GROUP_LEFT()
|
|
count by (service) (alertmanager_cluster_members{job="prometheus-alertmanager"})
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- name: etcd.rules
|
|
rules:
|
|
- alert: EtcdDown
|
|
annotations:
|
|
message: Etcd instance is down on node {{ $labels.node }}.
|
|
expr: satellite_etcd_up == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- alert: EtcdUnhealthy
|
|
annotations:
|
|
message: Etcd cluster is unhealthy.
|
|
expr: satellite_etcd_health == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
- name: sysctl.rules
|
|
rules:
|
|
- alert: BrNetfilterMissing
|
|
annotations:
|
|
message: Bridge netfilter is disabled on node {{ $labels.node }}
|
|
runbook_url: https://gravitational.com/gravity/docs/requirements/#br_netfilter-module
|
|
expr: max_over_time(satellite_sysctl_br_netfilter[1h]) unless satellite_sysctl_br_netfilter or satellite_sysctl_br_netfilter == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- alert: IPv4ForwardingMissing
|
|
annotations:
|
|
message: IPv4 forwarding is disabled on node {{ $labels.node }}
|
|
runbook_url: https://gravitational.com/gravity/docs/faq/#ipv4-forwarding
|
|
expr: max_over_time(satellite_sysctl_ipv4_forwarding[1h]) unless satellite_sysctl_ipv4_forwarding or satellite_sysctl_ipv4_forwarding == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- name: docker.rules
|
|
rules:
|
|
- alert: DockerDown
|
|
annotations:
|
|
message: Docker daemon is down on host {{ $labels.node }}
|
|
expr: satellite_docker_health == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- name: systemd.rules
|
|
rules:
|
|
- alert: SystemdDegraded
|
|
annotations:
|
|
message: Systemd is degraded on host {{ $labels.node }}
|
|
expr: satellite_systemd_health == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- alert: SystemdUnitDegraded
|
|
annotations:
|
|
message: Systemd unit {{ $labels.unit_name }} is degraded on host {{ $labels.node }}
|
|
expr: satellite_systemd_unit_health == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
- name: general.rules
|
|
rules:
|
|
- alert: TargetDown
|
|
annotations:
|
|
message: '{{ $value }}% of the {{ $labels.job }} targets are down.'
|
|
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: Watchdog
|
|
annotations:
|
|
message: |
|
|
This is an alert meant to ensure that the entire alerting pipeline is functional.
|
|
This alert is always firing, therefore it should always be firing in Alertmanager
|
|
and always fire against a receiver. There are integrations with various notification
|
|
mechanisms that send a notification when this alert is not firing. For example the
|
|
"DeadMansSnitch" integration in PagerDuty.
|
|
expr: vector(1)
|
|
labels:
|
|
severity: none
|
|
- name: kube-prometheus-node-alerting.rules
|
|
rules:
|
|
- alert: NodeDiskRunningFull
|
|
annotations:
|
|
message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
|
|
}}/{{ $labels.pod }} will be full within the next 24 hours.
|
|
expr: |
|
|
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
|
|
for: 30m
|
|
labels:
|
|
severity: warning
|
|
- alert: NodeDiskRunningFull
|
|
annotations:
|
|
message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
|
|
}}/{{ $labels.pod }} will be full within the next 2 hours.
|
|
expr: |
|
|
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
- name: node-network
|
|
rules:
|
|
- alert: NetworkReceiveErrors
|
|
annotations:
|
|
message: Network interface "{{ $labels.device }}" showing receive errors on
|
|
node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
|
|
expr: |
|
|
rate(node_network_receive_errs_total{component="node-exporter",device!~"veth.+"}[2m]) > 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
- alert: NetworkTransmitErrors
|
|
annotations:
|
|
message: Network interface "{{ $labels.device }}" showing transmit errors
|
|
on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
|
|
expr: |
|
|
rate(node_network_transmit_errs_total{component="node-exporter",device!~"veth.+"}[2m]) > 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
- alert: NodeNetworkInterfaceFlapping
|
|
annotations:
|
|
message: Network interface "{{ $labels.device }}" changing its up status
|
|
often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
|
|
expr: |
|
|
changes(node_network_up{component="node-exporter",device!~"veth.+"}[2m]) > 2
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
- name: prometheus.rules
|
|
rules:
|
|
- alert: PrometheusConfigReloadFailed
|
|
annotations:
|
|
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
|
|
summary: Reloading Prometheus' configuration failed
|
|
expr: |
|
|
prometheus_config_last_reload_successful{job="prometheus"} == 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusNotificationQueueRunningFull
|
|
annotations:
|
|
description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
|
|
$labels.pod}}
|
|
summary: Prometheus' alert notification queue is running full
|
|
expr: |
|
|
predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus"}
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusErrorSendingAlerts
|
|
annotations:
|
|
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
|
|
$labels.pod}} to Alertmanager {{$labels.alertmanager}}
|
|
summary: Errors while sending alerts from Prometheus
|
|
expr: |
|
|
rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.01
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusErrorSendingAlerts
|
|
annotations:
|
|
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
|
|
$labels.pod}} to Alertmanager {{$labels.alertmanager}}
|
|
summary: Errors while sending alerts from Prometheus
|
|
expr: |
|
|
rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.03
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
- alert: PrometheusNotConnectedToAlertmanagers
|
|
annotations:
|
|
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
|
|
to any Alertmanagers
|
|
summary: Prometheus is not connected to any Alertmanagers
|
|
expr: |
|
|
prometheus_notifications_alertmanagers_discovered{job="prometheus"} < 1
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusTSDBReloadsFailing
|
|
annotations:
|
|
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
|
|
reload failures over the last two hours.'
|
|
summary: Prometheus has issues reloading data blocks from disk
|
|
expr: |
|
|
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[2h]) > 0
|
|
for: 12h
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusTSDBCompactionsFailing
|
|
annotations:
|
|
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
|
|
compaction failures over the last two hours.'
|
|
summary: Prometheus has issues compacting sample blocks
|
|
expr: |
|
|
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[2h]) > 0
|
|
for: 12h
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusTSDBWALCorruptions
|
|
annotations:
|
|
description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
|
|
log (WAL).'
|
|
summary: Prometheus write-ahead log is corrupted
|
|
expr: |
|
|
prometheus_tsdb_wal_corruptions_total{job="prometheus"} > 0
|
|
for: 4h
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusNotIngestingSamples
|
|
annotations:
|
|
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting
|
|
samples.
|
|
summary: Prometheus isn't ingesting samples
|
|
expr: |
|
|
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
- alert: PrometheusTargetScrapesDuplicate
|
|
annotations:
|
|
description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected
|
|
due to duplicate timestamps but different values'
|
|
summary: Prometheus has many samples rejected
|
|
expr: |
|
|
increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
# groups:
|
|
# - name: Instances
|
|
# rules:
|
|
# - alert: InstanceDown
|
|
# expr: up == 0
|
|
# for: 5m
|
|
# labels:
|
|
# severity: page
|
|
# annotations:
|
|
# description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'
|
|
# summary: 'Instance {{ $labels.instance }} down'
|
|
## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use alerting_rules.yml
|
|
alerts: {}
|
|
|
|
## Records configuration
|
|
## Ref: https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/
|
|
recording_rules.yml: {}
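## A minimal sketch of how recording_rules.yml could be populated (hypothetical
## group, record and metric names; the empty map above is the chart default):
##
# recording_rules.yml:
#   groups:
#     - name: example.rules
#       rules:
#         - record: job:http_requests_total:rate5m
#           expr: sum(rate(http_requests_total[5m])) by (job)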
|
|
## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use recording_rules.yml
|
|
rules: {}
|
|
|
|
prometheus.yml:
|
|
global:
|
|
evaluation_interval: 30s
|
|
scrape_interval: 20s
|
|
scrape_timeout: 10s
|
|
rule_files:
|
|
- /etc/config/recording_rules.yml
|
|
- /etc/config/alerting_rules.yml
|
|
## The two files below are DEPRECATED and will be removed from this default values file
|
|
- /etc/config/rules
|
|
- /etc/config/alerts
|
|
|
|
scrape_configs:
|
|
- job_name: prometheus
|
|
static_configs:
|
|
- targets:
|
|
- localhost:9090
|
|
|
|
# A scrape configuration for running Prometheus on a Kubernetes cluster.
|
|
# This uses separate scrape configs for cluster components (i.e. API server, node)
|
|
# and services to allow each to use different authentication configs.
|
|
#
|
|
# Kubernetes labels will be added as Prometheus labels on metrics via the
|
|
# `labelmap` relabeling action.
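#
# For example (hypothetical label, not part of this chart), a Kubernetes label
# `app: my-service` on a scraped object would surface on its metrics as the
# Prometheus label `app="my-service"` after the `labelmap` rules below.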
|
|
|
|
# Scrape config for API servers.
|
|
#
|
|
# Kubernetes exposes API servers as endpoints to the default/kubernetes
|
|
# service so this uses `endpoints` role and uses relabelling to only keep
|
|
# the endpoints associated with the default/kubernetes service using the
|
|
# default named port `https`. This works for single API server deployments as
|
|
# well as HA API server deployments.
|
|
- job_name: 'kubernetes-apiservers'
|
|
|
|
kubernetes_sd_configs:
|
|
- role: endpoints
|
|
|
|
# Default to scraping over https. If required, just disable this or change to
|
|
# `http`.
|
|
scheme: https
|
|
|
|
# This TLS & bearer token file config is used to connect to the actual scrape
|
|
# endpoints for cluster components. This is separate to discovery auth
|
|
# configuration because discovery & scraping are two separate concerns in
|
|
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
|
|
# the cluster. Otherwise, more config options have to be provided within the
|
|
# <kubernetes_sd_config>.
|
|
tls_config:
|
|
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
|
# If your node certificates are self-signed or use a different CA to the
|
|
# master CA, then disable certificate verification below. Note that
|
|
# certificate verification is an integral part of a secure infrastructure
|
|
# so this should only be disabled in a controlled environment. You can
|
|
# disable certificate verification with the setting below.
|
|
#
|
|
insecure_skip_verify: true
|
|
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
|
|
|
# Keep only the default/kubernetes service endpoints for the https port. This
|
|
# will add targets for each API server which Kubernetes adds an endpoint to
|
|
# the default/kubernetes service.
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
|
|
action: keep
|
|
regex: default;kubernetes;https
|
|
|
|
- job_name: 'kubernetes-nodes'
|
|
|
|
# Default to scraping over https. If required, just disable this or change to
|
|
# `http`.
|
|
scheme: https
|
|
|
|
# This TLS & bearer token file config is used to connect to the actual scrape
|
|
# endpoints for cluster components. This is separate to discovery auth
|
|
# configuration because discovery & scraping are two separate concerns in
|
|
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
|
|
# the cluster. Otherwise, more config options have to be provided within the
|
|
# <kubernetes_sd_config>.
|
|
tls_config:
|
|
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
|
# If your node certificates are self-signed or use a different CA to the
|
|
# master CA, then disable certificate verification below. Note that
|
|
# certificate verification is an integral part of a secure infrastructure
|
|
# so this should only be disabled in a controlled environment. You can
|
|
# disable certificate verification with the setting below.
|
|
#
|
|
insecure_skip_verify: true
|
|
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
|
|
|
kubernetes_sd_configs:
|
|
- role: node
|
|
|
|
relabel_configs:
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_node_label_(.+)
|
|
- target_label: __address__
|
|
replacement: kubernetes.default.svc:443
|
|
- source_labels: [__meta_kubernetes_node_name]
|
|
regex: (.+)
|
|
target_label: __metrics_path__
|
|
replacement: /api/v1/nodes/$1/proxy/metrics
- job_name: 'kubernetes-nodes-cadvisor'
|
|
|
|
# Default to scraping over https. If required, just disable this or change to
|
|
# `http`.
|
|
scheme: https
|
|
|
|
# This TLS & bearer token file config is used to connect to the actual scrape
|
|
# endpoints for cluster components. This is separate to discovery auth
|
|
# configuration because discovery & scraping are two separate concerns in
|
|
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
|
|
# the cluster. Otherwise, more config options have to be provided within the
|
|
# <kubernetes_sd_config>.
|
|
tls_config:
|
|
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
|
# If your node certificates are self-signed or use a different CA to the
|
|
# master CA, then disable certificate verification below. Note that
|
|
# certificate verification is an integral part of a secure infrastructure
|
|
# so this should only be disabled in a controlled environment. You can
|
|
# disable certificate verification with the setting below.
|
|
#
|
|
insecure_skip_verify: true
|
|
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
|
|
|
kubernetes_sd_configs:
|
|
- role: node
|
|
|
|
# This configuration will work only on kubelet 1.7.3+
|
|
# As the scrape endpoints for cAdvisor have changed
|
|
# if you are using an older version you need to change the replacement to
|
|
# replacement: /api/v1/nodes/$1:4194/proxy/metrics
|
|
# more info here https://github.com/coreos/prometheus-operator/issues/633
|
|
relabel_configs:
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_node_label_(.+)
|
|
- target_label: __address__
|
|
replacement: kubernetes.default.svc:443
|
|
- source_labels: [__meta_kubernetes_node_name]
|
|
regex: (.+)
|
|
target_label: __metrics_path__
|
|
replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
|
|
|
|
# Scrape config for service endpoints.
|
|
#
|
|
# The relabeling allows the actual service scrape endpoint to be configured
|
|
# via the following annotations:
|
|
#
|
|
# * `prometheus.io/scrape`: Only scrape services that have a value of `true`
|
|
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
|
|
# to set this to `https` & most likely set the `tls_config` of the scrape config.
|
|
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
|
|
# * `prometheus.io/port`: If the metrics are exposed on a different port to the
|
|
# service then set this appropriately.
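#
# A minimal sketch of a Service annotated for this job (hypothetical name and
# port; adjust them to your own workload):
#
# apiVersion: v1
# kind: Service
# metadata:
#   name: my-app
#   annotations:
#     prometheus.io/scrape: "true"
#     prometheus.io/scheme: "http"
#     prometheus.io/path: "/metrics"
#     prometheus.io/port: "8080"
# spec:
#   ports:
#     - name: metrics
#       port: 8080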
|
|
- job_name: 'kubernetes-service-endpoints'
|
|
|
|
kubernetes_sd_configs:
|
|
- role: endpoints
|
|
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
|
|
action: keep
|
|
regex: true
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
|
|
action: replace
|
|
target_label: __scheme__
|
|
regex: (https?)
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
|
|
action: replace
|
|
target_label: __metrics_path__
|
|
regex: (.+)
|
|
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
|
|
action: replace
|
|
target_label: __address__
|
|
regex: ([^:]+)(?::\d+)?;(\d+)
|
|
replacement: $1:$2
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_service_label_(.+)
|
|
- source_labels: [__meta_kubernetes_namespace]
|
|
action: replace
|
|
target_label: kubernetes_namespace
|
|
- source_labels: [__meta_kubernetes_service_name]
|
|
action: replace
|
|
target_label: kubernetes_name
|
|
- source_labels: [__meta_kubernetes_pod_node_name]
|
|
action: replace
|
|
target_label: kubernetes_node
|
|
|
|
# Scrape config for slow service endpoints; same as above, but with a larger
|
|
# timeout and a larger interval
|
|
#
|
|
# The relabeling allows the actual service scrape endpoint to be configured
|
|
# via the following annotations:
|
|
#
|
|
# * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true`
|
|
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
|
|
# to set this to `https` & most likely set the `tls_config` of the scrape config.
|
|
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
|
|
# * `prometheus.io/port`: If the metrics are exposed on a different port to the
|
|
# service then set this appropriately.
|
|
- job_name: 'kubernetes-service-endpoints-slow'
|
|
|
|
scrape_interval: 5m
|
|
scrape_timeout: 30s
|
|
|
|
kubernetes_sd_configs:
|
|
- role: endpoints
|
|
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape_slow]
|
|
action: keep
|
|
regex: true
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
|
|
action: replace
|
|
target_label: __scheme__
|
|
regex: (https?)
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
|
|
action: replace
|
|
target_label: __metrics_path__
|
|
regex: (.+)
|
|
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
|
|
action: replace
|
|
target_label: __address__
|
|
regex: ([^:]+)(?::\d+)?;(\d+)
|
|
replacement: $1:$2
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_service_label_(.+)
|
|
- source_labels: [__meta_kubernetes_namespace]
|
|
action: replace
|
|
target_label: kubernetes_namespace
|
|
- source_labels: [__meta_kubernetes_service_name]
|
|
action: replace
|
|
target_label: kubernetes_name
|
|
- source_labels: [__meta_kubernetes_pod_node_name]
|
|
action: replace
|
|
target_label: kubernetes_node
|
|
|
|
- job_name: 'prometheus-pushgateway'
|
|
honor_labels: true
|
|
|
|
kubernetes_sd_configs:
|
|
- role: service
|
|
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
|
|
action: keep
|
|
regex: pushgateway
|
|
|
|
# Example scrape config for probing services via the Blackbox Exporter.
|
|
#
|
|
# The relabeling allows the actual service scrape endpoint to be configured
|
|
# via the following annotations:
|
|
#
|
|
# * `prometheus.io/probe`: Only probe services that have a value of `true`
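#
# A minimal sketch of a Service opted in to probing (hypothetical name; the
# `blackbox` replacement address below assumes a blackbox exporter service
# reachable under that name in the cluster):
#
# apiVersion: v1
# kind: Service
# metadata:
#   name: my-frontend
#   annotations:
#     prometheus.io/probe: "true"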
|
|
- job_name: 'kubernetes-services'
|
|
|
|
metrics_path: /probe
|
|
params:
|
|
module: [http_2xx]
|
|
|
|
kubernetes_sd_configs:
|
|
- role: service
|
|
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
|
|
action: keep
|
|
regex: true
|
|
- source_labels: [__address__]
|
|
target_label: __param_target
|
|
- target_label: __address__
|
|
replacement: blackbox
|
|
- source_labels: [__param_target]
|
|
target_label: instance
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_service_label_(.+)
|
|
- source_labels: [__meta_kubernetes_namespace]
|
|
target_label: kubernetes_namespace
|
|
- source_labels: [__meta_kubernetes_service_name]
|
|
target_label: kubernetes_name
|
|
|
|
# Example scrape config for pods
|
|
#
|
|
# The relabeling allows the actual pod scrape endpoint to be configured via the
|
|
# following annotations:
|
|
#
|
|
# * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
|
|
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
|
|
# * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
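#
# A minimal sketch of the pod template metadata that opts a pod in to this job
# (hypothetical path and port; point them at wherever the container serves metrics):
#
# template:
#   metadata:
#     annotations:
#       prometheus.io/scrape: "true"
#       prometheus.io/path: "/metrics"
#       prometheus.io/port: "9102"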
|
|
- job_name: 'kubernetes-pods'
|
|
|
|
kubernetes_sd_configs:
|
|
- role: pod
|
|
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
|
|
action: keep
|
|
regex: true
|
|
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
|
action: replace
|
|
target_label: __metrics_path__
|
|
regex: (.+)
|
|
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
|
action: replace
|
|
regex: ([^:]+)(?::\d+)?;(\d+)
|
|
replacement: $1:$2
|
|
target_label: __address__
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_pod_label_(.+)
|
|
- source_labels: [__meta_kubernetes_namespace]
|
|
action: replace
|
|
target_label: kubernetes_namespace
|
|
- source_labels: [__meta_kubernetes_pod_name]
|
|
action: replace
|
|
target_label: kubernetes_pod_name
|
|
- source_labels: [__meta_kubernetes_pod_phase]
|
|
regex: Pending|Succeeded|Failed
|
|
action: drop
|
|
|
|
# Example scrape config for pods which should be scraped slower. A useful example
|
|
# would be stackdriver-exporter, which queries an API on every scrape of the pod
|
|
#
|
|
# The relabeling allows the actual pod scrape endpoint to be configured via the
|
|
# following annotations:
|
|
#
|
|
# * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true`
|
|
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
|
|
# * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
|
|
- job_name: 'kubernetes-pods-slow'
|
|
|
|
scrape_interval: 5m
|
|
scrape_timeout: 30s
|
|
|
|
kubernetes_sd_configs:
|
|
- role: pod
|
|
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow]
|
|
action: keep
|
|
regex: true
|
|
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
|
action: replace
|
|
target_label: __metrics_path__
|
|
regex: (.+)
|
|
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
|
action: replace
|
|
regex: ([^:]+)(?::\d+)?;(\d+)
|
|
replacement: $1:$2
|
|
target_label: __address__
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_pod_label_(.+)
|
|
- source_labels: [__meta_kubernetes_namespace]
|
|
action: replace
|
|
target_label: kubernetes_namespace
|
|
- source_labels: [__meta_kubernetes_pod_name]
|
|
action: replace
|
|
target_label: kubernetes_pod_name
|
|
- source_labels: [__meta_kubernetes_pod_phase]
|
|
regex: Pending|Succeeded|Failed
|
|
action: drop
|
|
|
|
# Adds additional scrape configs to prometheus.yml.
|
|
# It must be a string, so you have to add a | after extraScrapeConfigs:
|
|
# example adds prometheus-blackbox-exporter scrape config
|
|
extraScrapeConfigs:
|
|
# - job_name: 'prometheus-blackbox-exporter'
|
|
# metrics_path: /probe
|
|
# params:
|
|
# module: [http_2xx]
|
|
# static_configs:
|
|
# - targets:
|
|
# - https://example.com
|
|
# relabel_configs:
|
|
# - source_labels: [__address__]
|
|
# target_label: __param_target
|
|
# - source_labels: [__param_target]
|
|
# target_label: instance
|
|
# - target_label: __address__
|
|
# replacement: prometheus-blackbox-exporter:9115
|
|
|
|
# Adds option to add alert_relabel_configs to avoid duplicate alerts in alertmanager
|
|
# useful in H/A prometheus with different external labels but the same alerts
|
|
alertRelabelConfigs:
|
|
# alert_relabel_configs:
|
|
# - source_labels: [dc]
|
|
# regex: (.+)\d+
|
|
# target_label: dc
|
|
|
|
networkPolicy:
|
|
## Enable creation of NetworkPolicy resources.
|
|
##
|
|
enabled: true
|
|
|
|
# Force namespace of namespaced resources
|
|
forceNamespace: null
|