rbac:
  create: true

podSecurityPolicy:
  enabled: false

imagePullSecrets:
# - name: "image-pull-secret"

## Define serviceAccount names for components. Defaults to component's fully qualified name.
##
serviceAccounts:
  alertmanager:
    create: true
    name:
    annotations: {}
  nodeExporter:
    create: true
    name:
    annotations: {}
  pushgateway:
    create: true
    name:
    annotations: {}
  kubeStateMetrics:
    create: true
    name:
    annotations: {}
  server:
    create: true
    name:
    annotations: {}

alertmanager:
  ## If false, alertmanager will not be installed
  ##
  enabled: true

  ## Use a ClusterRole (and ClusterRoleBinding)
  ## - If set to false, a Role and RoleBinding are defined in the named namespaces ONLY.
  ## This lets users without cluster-admin privileges run alertmanager against their own
  ## namespaces instead of cluster-wide.
  useClusterRole: true

  ## Set to a role name to use an existing role, skipping role creation, while still
  ## creating the service account and a RoleBinding to the role name set here.
  useExistingRole: false

  ## alertmanager container name
  ##
  name: alertmanager

  ## alertmanager container image
  ##
  image:
    repository: prom/alertmanager
    tag: v0.21.0
    pullPolicy: IfNotPresent

  ## alertmanager priorityClassName
  ##
  priorityClassName: ""

  ## Additional alertmanager container arguments
  ##
  extraArgs: {}

  ## Additional InitContainers to initialize the pod
  ##
  extraInitContainers: []

  ## The URL prefix at which the container can be accessed. Useful when the
  ## '-web.external-url' includes a slug, so that the internal URLs still resolve
  ## as they do in the default case. (Optional)
  prefixURL: ""

  ## External URL at which alertmanager can be accessed
  baseURL: "http://localhost:9093"

  ## Additional alertmanager container environment variables,
  ## for instance to add an http_proxy
  ##
  extraEnv: {}

  ## Additional alertmanager Secret mounts
  # Defines additional mounts with secrets. Secrets must be manually created in the namespace.
  extraSecretMounts: []
    # - name: secret-files
    #   mountPath: /etc/secrets
    #   subPath: ""
    #   secretName: alertmanager-secret-files
    #   readOnly: true

  ## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.alertmanager.configMapOverrideName}}
  ## Defining configMapOverrideName will cause templates/alertmanager-configmap.yaml
  ## to NOT generate a ConfigMap resource
  ##
  configMapOverrideName: ""

  ## The name of a secret in the same kubernetes namespace which contains the Alertmanager config
  ## Defining configFromSecret will cause templates/alertmanager-configmap.yaml
  ## to NOT generate a ConfigMap resource
  ##
  configFromSecret: ""

  ## The configuration file name to be loaded to alertmanager
  ## Must match the key within configuration loaded from ConfigMap/Secret
  ##
  configFileName: alertmanager.yml

  ingress:
    ## If true, alertmanager Ingress will be created
    ##
    enabled: false

    ## alertmanager Ingress annotations
    ##
    annotations: {}
    #   kubernetes.io/ingress.class: nginx
    #   kubernetes.io/tls-acme: 'true'

    ## alertmanager Ingress additional labels
    ##
    extraLabels: {}

    ## alertmanager Ingress hostnames with optional path
    ## Must be provided if Ingress is enabled
    ##
    hosts: []
    #   - alertmanager.domain.com
    #   - domain.com/alertmanager

    ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
    extraPaths: []
    # - path: /*
    #   backend:
    #     serviceName: ssl-redirect
    #     servicePort: use-annotation

    ## alertmanager Ingress TLS configuration
    ## Secrets must be manually created in the namespace
    ##
    tls: []
    #   - secretName: prometheus-alerts-tls
    #     hosts:
    #       - alertmanager.domain.com

  ## Alertmanager Deployment Strategy type
  # strategy:
  #   type: Recreate

  ## Node tolerations for alertmanager scheduling to nodes with taints
  ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
  ##
  tolerations: []
    # - key: "key"
    #   operator: "Equal|Exists"
    #   value: "value"
    #   effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"

  ## Node labels for alertmanager pod assignment
  ## Ref: https://kubernetes.io/docs/user-guide/node-selection/
  ##
  nodeSelector: {}

  ## Pod affinity
  ##
  affinity: {}

  ## PodDisruptionBudget settings
  ## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
  ##
  podDisruptionBudget:
    enabled: false
    maxUnavailable: 1

  ## Use an alternate scheduler, e.g. "stork".
  ## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/
  ##
  # schedulerName:

  persistentVolume:
    ## If true, alertmanager will create/use a Persistent Volume Claim
    ## If false, use emptyDir
    ##
    enabled: true

    ## alertmanager data Persistent Volume access modes
    ## Must match those of existing PV or dynamic provisioner
    ## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/
    ##
    accessModes:
      - ReadWriteOnce

    ## alertmanager data Persistent Volume Claim annotations
    ##
    annotations: {}

    ## alertmanager data Persistent Volume existing claim name
    ## Requires alertmanager.persistentVolume.enabled: true
    ## If defined, PVC must be created manually before volume will be bound
    existingClaim: ""

    ## alertmanager data Persistent Volume mount root path
    ##
    mountPath: /data

    ## alertmanager data Persistent Volume size
    ##
    size: 2Gi

    ## alertmanager data Persistent Volume Storage Class
    ## If defined, storageClassName: <storageClass>
    ## If set to "-", storageClassName: "", which disables dynamic provisioning
    ## If undefined (the default) or set to null, no storageClassName spec is
    ## set, choosing the default provisioner.  (gp2 on AWS, standard on
    ## GKE, AWS & OpenStack)
    ##
    # storageClass: "-"

    ## alertmanager data Persistent Volume Binding Mode
    ## If defined, volumeBindingMode: <volumeBindingMode>
    ## If undefined (the default) or set to null, no volumeBindingMode spec is
    ## set, choosing the default mode.
    ##
    # volumeBindingMode: ""

    ## Subdirectory of alertmanager data Persistent Volume to mount
    ## Useful if the volume's root directory is not empty
    ##
    subPath: ""
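    ## Example (illustrative): pin the claim to a pre-existing StorageClass;
    ## "fast-ssd" is an assumed class name, not something this chart ships:
    # storageClass: "fast-ssd"
    ## or hand alertmanager a manually created claim instead:
    # existingClaim: "alertmanager-data"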
  ## Annotations to be added to alertmanager pods
  ##
  podAnnotations: {}
    ## Tell prometheus to use a specific set of alertmanager pods
    ## instead of all alertmanager pods found in the same namespace.
    ## Useful if you deploy multiple releases within the same namespace.
    ##
    # prometheus.io/probe: alertmanager-teamA

  ## Labels to be added to Prometheus AlertManager pods
  ##
  podLabels: {}

  ## Specify if a Pod Security Policy for alertmanager must be created
  ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
  ##
  podSecurityPolicy:
    annotations: {}
      ## Specify pod annotations
      ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
      ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
      ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
      ##
      # seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
      # seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
      # apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'

  ## Use a StatefulSet if replicaCount needs to be greater than 1 (see below)
  ##
  replicaCount: 1

  ## Annotations to be added to deployment
  ##
  deploymentAnnotations: {}

  statefulSet:
    ## If true, use a statefulset instead of a deployment for pod management.
    ## This allows scaling replicas beyond a single pod.
    ##
    enabled: false

    annotations: {}
    labels: {}
    podManagementPolicy: OrderedReady

    ## Alertmanager headless service to use for the statefulset
    ##
    headless:
      annotations: {}
      labels: {}

      ## Enabling peer mesh service endpoints for enabling the HA alert manager
      ## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md
      enableMeshPeer: false

      servicePort: 80

  ## alertmanager resource requests and limits
  ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
  ##
  resources: {}
    # limits:
    #   cpu: 10m
    #   memory: 32Mi
    # requests:
    #   cpu: 10m
    #   memory: 32Mi

  ## Security context to be added to alertmanager pods
  ##
  securityContext:
    runAsUser: 65534
    runAsNonRoot: true
    runAsGroup: 65534
    fsGroup: 65534

  service:
    annotations: {}
    labels: {}
    clusterIP: ""

    ## Enabling peer mesh service endpoints for enabling the HA alert manager
    ## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md
    # enableMeshPeer: true

    ## List of IP addresses at which the alertmanager service is available
    ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
    ##
    externalIPs: []

    loadBalancerIP: ""
    loadBalancerSourceRanges: []
    servicePort: 80
    # nodePort: 30000
    sessionAffinity: None
    type: ClusterIP

## Monitors ConfigMap changes and POSTs to a URL
## Ref: https://github.com/jimmidyson/configmap-reload
##
configmapReload:
  prometheus:
    ## If false, the configmap-reload container will not be deployed
    ##
    enabled: true

    ## configmap-reload container name
    ##
    name: configmap-reload

    ## configmap-reload container image
    ##
    image:
      repository: jimmidyson/configmap-reload
      tag: v0.4.0
      pullPolicy: IfNotPresent

    ## Additional configmap-reload container arguments
    ##
    extraArgs: {}

    ## Additional configmap-reload volume directories
    ##
    extraVolumeDirs: []

    ## Additional configmap-reload mounts
    ##
    extraConfigmapMounts: []
      # - name: prometheus-alerts
      #   mountPath: /etc/alerts.d
      #   subPath: ""
      #   configMap: prometheus-alerts
      #   readOnly: true

    ## configmap-reload resource requests and limits
    ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
    ##
    resources: {}

  alertmanager:
    ## If false, the configmap-reload container will not be deployed
    ##
    enabled: true

    ## configmap-reload container name
    ##
    name: configmap-reload

    ## configmap-reload container image
    ##
    image:
      repository: jimmidyson/configmap-reload
      tag: v0.4.0
      pullPolicy: IfNotPresent

    ## Additional configmap-reload container arguments
    ##
    extraArgs: {}

    ## Additional configmap-reload volume directories
    ##
    extraVolumeDirs: []

    ## Additional configmap-reload mounts
    ##
    extraConfigmapMounts: []
      # - name: prometheus-alerts
      #   mountPath: /etc/alerts.d
      #   subPath: ""
      #   configMap: prometheus-alerts
      #   readOnly: true

    ## configmap-reload resource requests and limits
    ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
    ##
    resources: {}
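## Example (illustrative): have the Prometheus reload sidecar also watch an
## extra rules ConfigMap. "extra-alerts" is an assumed, manually created name:
# configmapReload:
#   prometheus:
#     extraConfigmapMounts:
#       - name: extra-alerts
#         mountPath: /etc/alerts.d
#         configMap: extra-alerts
#         readOnly: true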
kubeStateMetrics:
  ## If false, kube-state-metrics will not be installed
  ##
  enabled: true

  ## kube-state-metrics container name
  ##
  name: kube-state-metrics

  image:
    repository: quay.io/coreos/kube-state-metrics
    tag: v1.9.7
    pullPolicy: IfNotPresent

  podSecurityPolicy:
    annotations: {}
    additionalVolumes: []

  imagePullSecrets: []
  # - name: "image-pull-secret"

  # If set to true, this will deploy kube-state-metrics as a StatefulSet and the data
  # will be automatically sharded across <.Values.replicas> pods using the built-in
  # autodiscovery feature: https://github.com/kubernetes/kube-state-metrics#automated-sharding
  # This is an experimental feature and there are no stability guarantees.
  autosharding:
    enabled: false

  replicas: 1

  service:
    port: 8080
    # Default to clusterIP for backward compatibility
    type: ClusterIP
    nodePort: 0
    loadBalancerIP: ""
    annotations: {}

  customLabels: {}

  hostNetwork: false

  securityContext:
    enabled: true
    runAsGroup: 65534
    runAsUser: 65534
    fsGroup: 65534

  ## Node labels for pod assignment
  ## Ref: https://kubernetes.io/docs/user-guide/node-selection/
  nodeSelector: {}

  ## Affinity settings for pod assignment
  ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
  affinity: {}

  ## Tolerations for pod assignment
  ## Ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
  tolerations: []

  # Annotations to be added to the pod
  podAnnotations: {}

  ## Assign a PriorityClassName to pods if set
  # priorityClassName: ""

  # Ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/
  podDisruptionBudget: {}

  # Available collectors for kube-state-metrics. By default all available
  # collectors are enabled.
  collectors:
    certificatesigningrequests: true
    configmaps: true
    cronjobs: true
    daemonsets: true
    deployments: true
    endpoints: true
    horizontalpodautoscalers: true
    ingresses: true
    jobs: true
    limitranges: true
    mutatingwebhookconfigurations: true
    namespaces: true
    networkpolicies: true
    nodes: true
    persistentvolumeclaims: true
    persistentvolumes: true
    poddisruptionbudgets: true
    pods: true
    replicasets: true
    replicationcontrollers: true
    resourcequotas: true
    secrets: true
    services: true
    statefulsets: true
    storageclasses: true
    validatingwebhookconfigurations: true
    verticalpodautoscalers: false
    volumeattachments: true

nodeExporter:
  ## If false, node-exporter will not be installed
  ##
  enabled: true

  ## If true, node-exporter pods share the host network namespace
  ##
  hostNetwork: false

  ## If true, node-exporter pods share the host PID namespace
  ##
  hostPID: false

  ## node-exporter container name
  ##
  name: node-exporter

  ## node-exporter container image
  ##
  image:
    repository: prom/node-exporter
    tag: v1.0.1
    pullPolicy: IfNotPresent

  ## Specify if a Pod Security Policy for node-exporter must be created
  ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
  ##
  podSecurityPolicy:
    annotations: {}
      ## Specify pod annotations
      ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
      ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
      ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
      ##
      # seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
      # seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
      # apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'

  ## node-exporter priorityClassName
  ##
  priorityClassName: ""

  ## Custom Update Strategy
  ##
  updateStrategy:
    type: RollingUpdate

  ## Additional node-exporter container arguments
  ##
  extraArgs: {}

  ## Additional InitContainers to initialize the pod
  ##
  extraInitContainers: []

  ## Additional node-exporter hostPath mounts
  ##
  extraHostPathMounts: []
    # - name: textfile-dir
    #   mountPath: /srv/txt_collector
    #   hostPath: /var/lib/node-exporter
    #   readOnly: true
    #   mountPropagation: HostToContainer

  extraConfigmapMounts: []
    # - name: certs-configmap
    #   mountPath: /prometheus
    #   configMap: certs-configmap
    #   readOnly: true

  ## Node tolerations for node-exporter scheduling to nodes with taints
  ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
  ##
  tolerations: []
    # - key: "key"
    #   operator: "Equal|Exists"
    #   value: "value"
    #   effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"

  ## Node labels for node-exporter pod assignment
  ## Ref: https://kubernetes.io/docs/user-guide/node-selection/
  ##
  nodeSelector: {}

  ## Annotations to be added to node-exporter pods
  ##
  podAnnotations: {}

  ## Labels to be added to node-exporter pods
  ##
  pod:
    labels: {}

  ## PodDisruptionBudget settings
  ## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
  ##
  podDisruptionBudget:
    enabled: false
    maxUnavailable: 1

  ## node-exporter resource limits & requests
  ## Ref: https://kubernetes.io/docs/user-guide/compute-resources/
  ##
  resources: {}
    # limits:
    #   cpu: 200m
    #   memory: 50Mi
    # requests:
    #   cpu: 100m
    #   memory: 30Mi

  ## Security context to be added to node-exporter pods
  ##
  securityContext: {}
    # runAsUser: 0

  service:
    annotations:
      prometheus.io/scrape: "true"
    labels: {}

    # Exposed as a headless service:
    # https://kubernetes.io/docs/concepts/services-networking/service/#headless-services
    clusterIP: None

    ## List of IP addresses at which the node-exporter service is available
    ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
    ##
    externalIPs: []

    hostPort: 9100
    loadBalancerIP: ""
    loadBalancerSourceRanges: []
    servicePort: 9100
    type: ClusterIP
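  ## Example (illustrative): enabling the textfile collector. The directory
  ## below is an assumed host path where your jobs drop *.prom files:
  # extraArgs:
  #   collector.textfile.directory: /srv/txt_collector
  # extraHostPathMounts:
  #   - name: textfile-dir
  #     mountPath: /srv/txt_collector
  #     hostPath: /var/lib/node-exporter
  #     readOnly: true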
server:
  ## If false, the Prometheus server will not be installed
  ##
  enabled: true

  ## Use a ClusterRole (and ClusterRoleBinding)
  ## - If set to false, a RoleBinding is defined in the named namespaces ONLY.
  ##
  ## NB: because we need a role with nonResourceURLs ("/metrics"), someone with
  ## cluster-admin privileges must define this role for you before running with this
  ## setting enabled. This lets users without cluster-admin privileges run Prometheus
  ## against their own namespaces instead of cluster-wide.
  ##
  ## You MUST also set namespaces to the ones you have access to and want monitored by Prometheus.
  ##
  # useExistingClusterRoleName: nameofclusterrole

  ## namespaces to monitor (instead of monitoring all, cluster-wide). Needed if you want to run without cluster-admin privileges.
  # namespaces:
  #   - yournamespace

  ## Prometheus server container name
  ##
  name: server

  sidecarContainers:

  ## Prometheus server container image
  ##
  image:
    repository: prom/prometheus
    tag: v2.21.0
    pullPolicy: IfNotPresent

  ## prometheus server priorityClassName
  ##
  priorityClassName: ""

  ## EnableServiceLinks indicates whether information about services should be injected
  ## into pod's environment variables, matching the syntax of Docker links.
  ## WARNING: the field is unsupported and will be skipped in K8s prior to v1.13.0.
  ##
  enableServiceLinks: true

  ## The URL prefix at which the container can be accessed. Useful when the
  ## '-web.external-url' includes a slug, so that the internal URLs still resolve
  ## as they do in the default case. (Optional)
  prefixURL: ""

  ## External URL at which the Prometheus server can be accessed.
  ## May be the same as the Ingress host name.
  baseURL: ""

  ## Additional server container environment variables
  ##
  ## You specify this manually like you would a raw deployment manifest.
  ## This means you can bind in environment variables from secrets.
  ##
  ## e.g. static environment variable:
  ##  - name: DEMO_GREETING
  ##    value: "Hello from the environment"
  ##
  ## e.g. secret environment variable:
  ##  - name: USERNAME
  ##    valueFrom:
  ##      secretKeyRef:
  ##        name: mysecret
  ##        key: username
  env: []

  extraFlags:
    - web.enable-lifecycle
    ## web.enable-admin-api flag controls access to the administrative HTTP API which includes
    ## functionality such as deleting time series. This is disabled by default.
    # - web.enable-admin-api
    ##
    ## storage.tsdb.no-lockfile flag controls DB locking
    # - storage.tsdb.no-lockfile
    ##
    ## storage.tsdb.wal-compression flag enables compression of the write-ahead log (WAL)
    # - storage.tsdb.wal-compression

  ## Path to a configuration file on prometheus server container FS
  configPath: /etc/config/prometheus.yml

  global:
    ## How frequently to scrape targets by default
    ##
    scrape_interval: 20s
    ## How long until a scrape request times out
    ##
    scrape_timeout: 10s
    ## How frequently to evaluate rules
    ##
    evaluation_interval: 30s

  ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write
  ##
  remoteWrite: []

  ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_read
  ##
  remoteRead: []
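  ## Example (illustrative): forwarding samples to a long-term store via
  ## remote_write. The endpoint URL and credentials are placeholder assumptions:
  # remoteWrite:
  #   - url: https://metrics-store.example.com/api/v1/write
  #     basic_auth:
  #       username: prometheus
  #       password: replace-me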
  ## Additional Prometheus server container arguments
  ##
  extraArgs: {}

  ## Additional InitContainers to initialize the pod
  ##
  extraInitContainers: []

  ## Additional Prometheus server Volume mounts
  ##
  extraVolumeMounts: []

  ## Additional Prometheus server Volumes
  ##
  extraVolumes: []

  ## Additional Prometheus server hostPath mounts
  ##
  extraHostPathMounts: []
    # - name: certs-dir
    #   mountPath: /etc/kubernetes/certs
    #   subPath: ""
    #   hostPath: /etc/kubernetes/certs
    #   readOnly: true

  extraConfigmapMounts: []
    # - name: certs-configmap
    #   mountPath: /prometheus
    #   subPath: ""
    #   configMap: certs-configmap
    #   readOnly: true

  ## Additional Prometheus server Secret mounts
  # Defines additional mounts with secrets. Secrets must be manually created in the namespace.
  extraSecretMounts: []
    # - name: secret-files
    #   mountPath: /etc/secrets
    #   subPath: ""
    #   secretName: prom-secret-files
    #   readOnly: true

  ## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.server.configMapOverrideName}}
  ## Defining configMapOverrideName will cause templates/server-configmap.yaml
  ## to NOT generate a ConfigMap resource
  ##
  configMapOverrideName: ""

  ingress:
    ## If true, Prometheus server Ingress will be created
    ##
    enabled: false

    ## Prometheus server Ingress annotations
    ##
    annotations: {}
    #   kubernetes.io/ingress.class: nginx
    #   kubernetes.io/tls-acme: 'true'

    ## Prometheus server Ingress additional labels
    ##
    extraLabels: {}

    ## Prometheus server Ingress hostnames with optional path
    ## Must be provided if Ingress is enabled
    ##
    hosts: []
    #   - prometheus.domain.com
    #   - domain.com/prometheus

    ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
    extraPaths: []
    # - path: /*
    #   backend:
    #     serviceName: ssl-redirect
    #     servicePort: use-annotation

    ## Prometheus server Ingress TLS configuration
    ## Secrets must be manually created in the namespace
    ##
    tls: []
    #   - secretName: prometheus-server-tls
    #     hosts:
    #       - prometheus.domain.com

  ## Server Deployment Strategy type
  # strategy:
  #   type: Recreate

  ## hostAliases allows adding entries to /etc/hosts inside the containers
  hostAliases: []
  #   - ip: "127.0.0.1"
  #     hostnames:
  #       - "example.com"

  ## Node tolerations for server scheduling to nodes with taints
  ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
  ##
  tolerations: []
    # - key: "key"
    #   operator: "Equal|Exists"
    #   value: "value"
    #   effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"

  ## Node labels for Prometheus server pod assignment
  ## Ref: https://kubernetes.io/docs/user-guide/node-selection/
  ##
  nodeSelector: {}

  ## Pod affinity
  ##
  affinity: {}

  ## PodDisruptionBudget settings
  ## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
  ##
  podDisruptionBudget:
    enabled: false
    maxUnavailable: 1

  ## Use an alternate scheduler, e.g. "stork".
  ## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/
  ##
  # schedulerName:

  persistentVolume:
    ## If true, Prometheus server will create/use a Persistent Volume Claim
    ## If false, use emptyDir
    ##
    enabled: true

    ## Prometheus server data Persistent Volume access modes
    ## Must match those of existing PV or dynamic provisioner
    ## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/
    ##
    accessModes:
      - ReadWriteOnce

    ## Prometheus server data Persistent Volume annotations
    ##
    annotations: {}

    ## Prometheus server data Persistent Volume existing claim name
    ## Requires server.persistentVolume.enabled: true
    ## If defined, PVC must be created manually before volume will be bound
    existingClaim: ""

    ## Prometheus server data Persistent Volume mount root path
    ##
    mountPath: /data

    ## Prometheus server data Persistent Volume size
    ##
    size: 8Gi

    ## Prometheus server data Persistent Volume Storage Class
    ## If defined, storageClassName: <storageClass>
    ## If set to "-", storageClassName: "", which disables dynamic provisioning
    ## If undefined (the default) or set to null, no storageClassName spec is
    ## set, choosing the default provisioner.  (gp2 on AWS, standard on
    ## GKE, AWS & OpenStack)
    ##
    # storageClass: "-"

    ## Prometheus server data Persistent Volume Binding Mode
    ## If defined, volumeBindingMode: <volumeBindingMode>
    ## If undefined (the default) or set to null, no volumeBindingMode spec is
    ## set, choosing the default mode.
    ##
    # volumeBindingMode: ""

    ## Subdirectory of Prometheus server data Persistent Volume to mount
    ## Useful if the volume's root directory is not empty
    ##
    subPath: ""
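    ## Example (illustrative): reuse a manually created claim instead of having
    ## the chart provision one ("prometheus-data" is an assumed PVC name):
    # enabled: true
    # existingClaim: prometheus-data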
  emptyDir:
    sizeLimit: ""

  ## Annotations to be added to Prometheus server pods
  ##
  podAnnotations: {}
    # iam.amazonaws.com/role: prometheus

  ## Labels to be added to Prometheus server pods
  ##
  podLabels: {}

  ## Prometheus AlertManager configuration
  ##
  alertmanagers: []

  ## Specify if a Pod Security Policy for the Prometheus server must be created
  ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
  ##
  podSecurityPolicy:
    annotations: {}
      ## Specify pod annotations
      ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
      ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
      ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
      ##
      # seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
      # seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
      # apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'

  ## Use a StatefulSet if replicaCount needs to be greater than 1 (see below)
  ##
  replicaCount: 1

  ## Annotations to be added to deployment
  ##
  deploymentAnnotations: {}

  statefulSet:
    ## If true, use a statefulset instead of a deployment for pod management.
    ## This allows scaling replicas beyond a single pod.
    ##
    enabled: false

    annotations: {}
    labels: {}
    podManagementPolicy: OrderedReady

    ## Prometheus server headless service to use for the statefulset
    ##
    headless:
      annotations: {}
      labels: {}
      servicePort: 80

      ## Enable gRPC port on service to allow auto discovery with thanos-querier
      gRPC:
        enabled: false
        servicePort: 10901
        # nodePort: 10901

  ## Prometheus server readiness and liveness probe initial delay and timeout
  ## Ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/
  ##
  readinessProbeInitialDelay: 30
  readinessProbePeriodSeconds: 5
  readinessProbeTimeout: 30
  readinessProbeFailureThreshold: 3
  readinessProbeSuccessThreshold: 1
  livenessProbeInitialDelay: 30
  livenessProbePeriodSeconds: 15
  livenessProbeTimeout: 30
  livenessProbeFailureThreshold: 3
  livenessProbeSuccessThreshold: 1

  ## Prometheus server resource requests and limits
  ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
  ##
  resources: {}
    # limits:
    #   cpu: 500m
    #   memory: 512Mi
    # requests:
    #   cpu: 500m
    #   memory: 512Mi

  ## Vertical Pod Autoscaler config
  ## Ref: https://github.com/kubernetes/autoscaler/tree/master/vertical-pod-autoscaler
  verticalAutoscaler:
    ## If true, a VPA object will be created for the controller (either StatefulSet or Deployment, based on the config above)
    enabled: false
    # updateMode: "Auto"
    # containerPolicies:
    #   - containerName: 'prometheus-server'

  ## Security context to be added to server pods
  ##
  securityContext:
    runAsUser: 65534
    runAsNonRoot: true
    runAsGroup: 65534
    fsGroup: 65534

  service:
    annotations: {}
    labels: {}
    clusterIP: ""

    ## List of IP addresses at which the Prometheus server service is available
    ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
    ##
    externalIPs: []

    loadBalancerIP: ""
    loadBalancerSourceRanges: []
    servicePort: 9090
    sessionAffinity: None
    type: ClusterIP

    ## Enable gRPC port on service to allow auto discovery with thanos-querier
    gRPC:
      enabled: false
      servicePort: 10901
      # nodePort: 10901
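    ## Example (illustrative): expose the gRPC port so an external Thanos
    ## Querier (assumed to run in-cluster) can use this service for store
    ## discovery:
    # gRPC:
    #   enabled: true
    #   servicePort: 10901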
    ## If using a statefulSet (statefulSet.enabled=true), configure the
    ## service to connect to a specific replica to have a consistent view
    ## of the data.
    statefulsetReplica:
      enabled: false
      replica: 0

  ## Prometheus server pod termination grace period
  ##
  terminationGracePeriodSeconds: 300

  ## Prometheus data retention period (default if not specified is 15 days)
  ##
  retention: "15d"
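  ## Example (illustrative): pair time-based retention with a size cap. The
  ## size flag is passed through extraArgs and assumes a Prometheus release
  ## that supports storage.tsdb.retention.size:
  # retention: "30d"
  # extraArgs:
  #   storage.tsdb.retention.size: 7GB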
pushgateway:
  ## If false, pushgateway will not be installed
  ##
  enabled: true

  ## Use an alternate scheduler, e.g. "stork".
  ## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/
  ##
  # schedulerName:

  ## pushgateway container name
  ##
  name: pushgateway

  ## pushgateway container image
  ##
  image:
    repository: prom/pushgateway
    tag: v1.2.0
    pullPolicy: IfNotPresent

  ## pushgateway priorityClassName
  ##
  priorityClassName: ""

  ## Additional pushgateway container arguments
  ##
  ## for example: persistence.file: /data/pushgateway.data
  extraArgs: {}

  ## Additional InitContainers to initialize the pod
  ##
  extraInitContainers: []

  ingress:
    ## If true, pushgateway Ingress will be created
    ##
    enabled: false

    ## pushgateway Ingress annotations
    ##
    annotations: {}
    #   kubernetes.io/ingress.class: nginx
    #   kubernetes.io/tls-acme: 'true'

    ## pushgateway Ingress hostnames with optional path
    ## Must be provided if Ingress is enabled
    ##
    hosts: []
    #   - pushgateway.domain.com
    #   - domain.com/pushgateway

    ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
    extraPaths: []
    # - path: /*
    #   backend:
    #     serviceName: ssl-redirect
    #     servicePort: use-annotation

    ## pushgateway Ingress TLS configuration
    ## Secrets must be manually created in the namespace
    ##
    tls: []
    #   - secretName: prometheus-alerts-tls
    #     hosts:
    #       - pushgateway.domain.com

  ## Node tolerations for pushgateway scheduling to nodes with taints
  ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
  ##
  tolerations: []
    # - key: "key"
    #   operator: "Equal|Exists"
    #   value: "value"
    #   effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"

  ## Node labels for pushgateway pod assignment
  ## Ref: https://kubernetes.io/docs/user-guide/node-selection/
  ##
  nodeSelector: {}

  ## Annotations to be added to pushgateway pods
  ##
  podAnnotations: {}

  ## Labels to be added to pushgateway pods
  ##
  podLabels: {}

  ## Specify if a Pod Security Policy for pushgateway must be created
  ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
  ##
  podSecurityPolicy:
    annotations: {}
      ## Specify pod annotations
      ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
      ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
      ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
      ##
      # seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
      # seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
      # apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'

  replicaCount: 1

  ## Annotations to be added to deployment
  ##
  deploymentAnnotations: {}

  ## PodDisruptionBudget settings
  ## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
  ##
  podDisruptionBudget:
    enabled: false
    maxUnavailable: 1

  ## pushgateway resource requests and limits
  ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
  ##
  resources: {}
    # limits:
    #   cpu: 10m
    #   memory: 32Mi
    # requests:
    #   cpu: 10m
    #   memory: 32Mi

  ## Security context to be added to pushgateway pods
  ##
  securityContext:
    runAsUser: 65534
    runAsNonRoot: true

  service:
    annotations:
      prometheus.io/probe: pushgateway
    labels: {}
    clusterIP: ""

    ## List of IP addresses at which the pushgateway service is available
    ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
    ##
    externalIPs: []

    loadBalancerIP: ""
    loadBalancerSourceRanges: []
    servicePort: 9091
    type: ClusterIP

  ## pushgateway Deployment Strategy type
  # strategy:
  #   type: Recreate

  persistentVolume:
    ## If true, pushgateway will create/use a Persistent Volume Claim
    ## If false, use emptyDir
    ##
    enabled: false

    ## pushgateway data Persistent Volume access modes
    ## Must match those of existing PV or dynamic provisioner
    ## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/
    ##
    accessModes:
      - ReadWriteOnce

    ## pushgateway data Persistent Volume Claim annotations
    ##
    annotations: {}

    ## pushgateway data Persistent Volume existing claim name
    ## Requires pushgateway.persistentVolume.enabled: true
    ## If defined, PVC must be created manually before volume will be bound
    existingClaim: ""

    ## pushgateway data Persistent Volume mount root path
    ##
    mountPath: /data

    ## pushgateway data Persistent Volume size
    ##
    size: 2Gi

    ## pushgateway data Persistent Volume Storage Class
    ## If defined, storageClassName: <storageClass>
    ## If set to "-", storageClassName: "", which disables dynamic provisioning
    ## If undefined (the default) or set to null, no storageClassName spec is
    ## set, choosing the default provisioner.  (gp2 on AWS, standard on
    ## GKE, AWS & OpenStack)
    ##
    # storageClass: "-"

    ## pushgateway data Persistent Volume Binding Mode
    ## If defined, volumeBindingMode: <volumeBindingMode>
    ## If undefined (the default) or set to null, no volumeBindingMode spec is
    ## set, choosing the default mode.
    ##
    # volumeBindingMode: ""

    ## Subdirectory of pushgateway data Persistent Volume to mount
    ## Useful if the volume's root directory is not empty
    ##
    subPath: ""

## alertmanager ConfigMap entries
##
alertmanagerFiles:
  alertmanager.yml:
    global:
      resolve_timeout: 30s

    route:
      group_by: ["alertname"]
      group_wait: 5s
      group_interval: 10s
      repeat_interval: 999h
      receiver: "default"
      routes:
        - receiver: "default"
          group_by: []
          match_re:
            alertname: .*
          continue: true
        - receiver: "watchdog"
          group_by: ["alertname", "instance"]
          match_re:
            alertname: Watchdog
          continue: false
        - receiver: "by-cluster-service"
          group_by: ["alertname", "cluster", "service"]
          match_re:
            alertname: .*
          continue: true
        - receiver: "by-name"
          group_by: [alertname]
          match_re:
            alertname: .*
          continue: true
        - receiver: "by-cluster"
          group_by: [cluster]
          match_re:
            alertname: .*
          continue: true

    inhibit_rules:
      - source_match:
          severity: "critical"
        target_match:
          severity: "warning"
        # Apply inhibition if the alertname and cluster are the same in both alerts
        equal: ["alertname", "cluster"]

    receivers:
      - name: "default"
      - name: "watchdog"
      - name: "by-cluster-service"
      - name: "by-name"
      - name: "by-cluster"
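      ## Example (illustrative): point a receiver at a real notification
      ## target. The webhook URL and channel are placeholder assumptions:
      # - name: "default"
      #   slack_configs:
      #     - api_url: https://hooks.slack.com/services/T000/B000/XXXX
      #       channel: "#alerts"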
## Prometheus server ConfigMap entries
##
serverFiles:
  ## Alerts configuration
  ## Ref: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
  # alerting_rules.yml: {}
  alerting_rules.yml:
    groups:
      - name: prometheus
        rules:
          - alert: PrometheusJobMissing
            expr: absent(up{job="prometheus"})
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus job missing (instance {{ $labels.instance }})"
              description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusTargetMissing
            expr: up == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus target missing (instance {{ $labels.instance }})"
              description: "A Prometheus target has disappeared. An exporter might have crashed.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusAllTargetsMissing
            expr: count by (job) (up) == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus all targets missing (instance {{ $labels.instance }})"
              description: "A Prometheus job no longer has any living targets.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusConfigurationReloadFailure
            expr: prometheus_config_last_reload_successful != 1
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})"
              description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusTooManyRestarts
            expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
              description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusAlertmanagerConfigurationReloadFailure
            expr: alertmanager_config_last_reload_successful != 1
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})"
              description: "AlertManager configuration reload error\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusAlertmanagerConfigNotSynced
            expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus AlertManager config not synced (instance {{ $labels.instance }})"
              description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusNotConnectedToAlertmanager
            expr: prometheus_notifications_alertmanagers_discovered < 1
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
              description: "Prometheus cannot connect to the alertmanager\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusRuleEvaluationFailures
            expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
              description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusTemplateTextExpansionFailures
            expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
              description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusRuleEvaluationSlow
            expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus rule evaluation slow (instance {{ $labels.instance }})"
              description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates slower storage backend access or too complex a query.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusNotificationsBacklog
            expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
              description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusAlertmanagerNotificationFailing
            expr: rate(alertmanager_notifications_failed_total[1m]) > 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
              description: "Alertmanager is failing to send notifications\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusTargetEmpty
            expr: prometheus_sd_discovered_targets == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus target empty (instance {{ $labels.instance }})"
              description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusTargetScrapingSlow
            expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
              description: "Prometheus is scraping exporters slowly\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusLargeScrape
            expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus large scrape (instance {{ $labels.instance }})"
              description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusTargetScrapeDuplicate
            expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus target scrape duplicate (instance {{ $labels.instance }})"
              description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusTsdbCheckpointCreationFailures
            expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
              description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusTsdbCheckpointDeletionFailures
            expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[3m]) > 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})"
              description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusTsdbCompactionsFailed
            expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
              description: "Prometheus encountered {{ $value }} TSDB compaction failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusTsdbHeadTruncationsFailed
            expr: increase(prometheus_tsdb_head_truncations_failed_total[3m]) > 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})"
              description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusTsdbReloadFailures
            expr: increase(prometheus_tsdb_reloads_failures_total[3m]) > 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus TSDB reload failures (instance {{ $labels.instance }})"
              description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusTsdbWalCorruptions
            expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
              description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: PrometheusTsdbWalTruncationsFailed
            expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
              description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - name: node-exporter
        rules:
          - alert: HostOutOfMemory
            expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host out of memory (instance {{ $labels.instance }})"
              description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: HostMemoryUnderMemoryPressure
            expr: rate(node_vmstat_pgmajfault[1m]) > 1000
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
              description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: HostUnusualNetworkThroughputIn
            expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
              description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: HostUnusualNetworkThroughputOut
            expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
              description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: HostUnusualDiskReadRate
            expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host unusual disk read rate (instance {{ $labels.instance }})"
              description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: HostUnusualDiskWriteRate
            expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host unusual disk write rate (instance {{ $labels.instance }})"
              description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          # please add ignored mountpoints in node_exporter parameters like
          # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)"
          - alert: HostOutOfDiskSpace
            expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host out of disk space (instance {{ $labels.instance }})"
              description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: HostDiskWillFillIn4Hours
            expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
              description: "Disk will fill in 4 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: HostOutOfInodes
            expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host out of inodes (instance {{ $labels.instance }})"
              description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: HostUnusualDiskReadLatency
            expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host unusual disk read latency (instance {{ $labels.instance }})"
              description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: HostUnusualDiskWriteLatency
            expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host unusual disk write latency (instance {{ $labels.instance }})"
              description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: HostHighCpuLoad
            expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host high CPU load (instance {{ $labels.instance }})"
              description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          # 1000 context switches is an arbitrary number.
          # The alert threshold depends on the nature of the application.
          # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
          - alert: HostContextSwitching
            expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 5000
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host context switching (instance {{ $labels.instance }})"
              description: "Context switching is growing on node (> 5000 / s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: HostSwapIsFillingUp
            expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host swap is filling up (instance {{ $labels.instance }})"
              description: "Swap is filling up (> 80%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: HostSystemdServiceCrashed
            expr: node_systemd_unit_state{state="failed"} == 1
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host systemd service crashed (instance {{ $labels.instance }})"
              description: "A systemd service crashed\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: HostKernelVersionDeviations
            expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host kernel version deviations (instance {{ $labels.instance }})"
              description: "Different kernel versions are running\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
          - alert: HostOomKillDetected
            expr: increase(node_vmstat_oom_kill[5m]) > 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Host OOM kill detected (instance {{ $labels.instance }})"
              description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - name: cadvisor
        rules:
          - alert: ContainerKilled
            expr: time() - container_last_seen > 60
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Container killed (instance {{ $labels.instance }})"
              description: "A container has disappeared\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
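          # Tip (illustrative): the groups above follow the standard Prometheus
          # rules-file format, so a local copy of this section can be validated
          # with "promtool check rules <file>" before upgrading the release.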
# If you want to exclude it from this alert, just use: container_cpu_usage_seconds_total{name!=""} - alert: ContainerCpuUsage expr: (sum(rate(container_cpu_usage_seconds_total{image!=""}[3m])) BY (instance, name) * 100) > 80 for: 5m labels: severity: warning annotations: summary: "Container CPU usage (instance {{ $labels.instance }})" description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" # See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d - alert: ContainerMemoryUsage expr: (sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80 for: 5m labels: severity: warning annotations: summary: "Container Memory usage (instance {{ $labels.instance }})" description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - alert: ContainerVolumeUsage expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80 for: 5m labels: severity: warning annotations: summary: "Container Volume usage (instance {{ $labels.instance }})" description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - alert: ContainerVolumeIoUsage expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80 for: 5m labels: severity: warning annotations: summary: "Container Volume IO usage (instance {{ $labels.instance }})" description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - alert: ContainerHighThrottleRate expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1 for: 5m labels: severity: warning annotations: summary: "Container high throttle rate (instance {{ $labels.instance }})" description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - name: k8s.rules rules: - expr: | sum(rate(container_cpu_usage_seconds_total{image!="", container!="POD",namespace!=""}[5m])) by (namespace) record: namespace:container_cpu_usage_seconds_total:sum_rate - expr: | sum by (namespace, pod, container) ( rate(container_cpu_usage_seconds_total{image!="", container!="POD",namespace!=""}[5m]) ) record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate - expr: | sum(container_memory_usage_bytes{image!="", container!="POD",namespace!=""}) by (namespace) record: namespace:container_memory_usage_bytes:sum - expr: | sum by (namespace, label_name) ( sum(rate(container_cpu_usage_seconds_total{image!="", container!="POD"}[5m])) by (namespace, pod) * on (namespace, pod) group_left(label_name) label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)") ) record: namespace_name:container_cpu_usage_seconds_total:sum_rate - expr: | sum by (namespace, label_name) ( sum(container_memory_usage_bytes{image!="", container!="POD"}) by (pod, namespace) * on (namespace, pod) group_left(label_name) label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)") ) record: namespace_name:container_memory_usage_bytes:sum - expr: | sum by (namespace, label_name) ( sum(kube_pod_container_resource_requests_memory_bytes{component="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod) * on (namespace, pod) group_left(label_name) label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", 
"pod", "(.*)") ) record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum - expr: | sum by (namespace, label_name) ( sum(kube_pod_container_resource_requests_cpu_cores{component="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod) * on (namespace, pod) group_left(label_name) label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)") ) record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum - expr: | sum( label_replace( label_replace( kube_pod_owner{component="kube-state-metrics", owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)" ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{component="kube-state-metrics"}, "workload", "$1", "owner_name", "(.*)" ) ) by (namespace, workload, pod) labels: workload_type: deployment record: mixin_pod_workload - expr: | sum( label_replace( kube_pod_owner{component="kube-state-metrics", owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)" ) ) by (namespace, workload, pod) labels: workload_type: daemonset record: mixin_pod_workload - expr: | sum( label_replace( kube_pod_owner{component="kube-state-metrics", owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)" ) ) by (namespace, workload, pod) labels: workload_type: statefulset record: mixin_pod_workload - name: kube-scheduler.rules rules: - expr: | histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 labels: quantile: "0.99" record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile - expr: | histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 labels: quantile: "0.99" record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile - expr: | histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 labels: quantile: "0.99" record: cluster_quantile:scheduler_binding_latency:histogram_quantile - expr: | histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 labels: quantile: "0.9" record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile - expr: | histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 labels: quantile: "0.9" record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile - expr: | histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 labels: quantile: "0.9" record: cluster_quantile:scheduler_binding_latency:histogram_quantile - expr: | histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 labels: quantile: "0.5" record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile - expr: | histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 labels: quantile: "0.5" record: 
cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile - expr: | histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 labels: quantile: "0.5" record: cluster_quantile:scheduler_binding_latency:histogram_quantile - name: kube-apiserver.rules rules: - expr: | histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06 labels: quantile: "0.99" record: cluster_quantile:apiserver_request_latencies:histogram_quantile - expr: | histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06 labels: quantile: "0.9" record: cluster_quantile:apiserver_request_latencies:histogram_quantile - expr: | histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06 labels: quantile: "0.5" record: cluster_quantile:apiserver_request_latencies:histogram_quantile - name: node.rules rules: - expr: sum(min(kube_pod_info) by (node)) record: ':kube_pod_info_node_count:' - expr: | max(label_replace(kube_pod_info{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod) record: 'node_namespace_pod:kube_pod_info:' - expr: | count by (node) (sum by (node, cpu) ( node_cpu_seconds_total{component="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: )) record: node:node_num_cpu:sum - expr: | 1 - avg(rate(node_cpu_seconds_total{component="node-exporter",mode="idle"}[1m])) record: :node_cpu_utilization:avg1m - expr: | 1 - avg by (node) ( rate(node_cpu_seconds_total{component="node-exporter",mode="idle"}[1m]) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) record: node:node_cpu_utilization:avg1m - expr: | node:node_cpu_utilization:avg1m * node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum)) record: node:cluster_cpu_utilization:ratio - expr: | sum(node_load1{component="node-exporter"}) / sum(node:node_num_cpu:sum) record: ':node_cpu_saturation_load1:' - expr: | sum by (node) ( node_load1{component="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) / node:node_num_cpu:sum record: 'node:node_cpu_saturation_load1:' - expr: | 1 - sum(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"}) / sum(node_memory_MemTotal_bytes{component="node-exporter"}) record: ':node_memory_utilization:' - expr: | sum(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"}) record: :node_memory_MemFreeCachedBuffers_bytes:sum - expr: | sum(node_memory_MemTotal_bytes{component="node-exporter"}) record: :node_memory_MemTotal_bytes:sum - expr: | sum by (node) ( (node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"}) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_memory_bytes_available:sum - expr: | sum by (node) ( node_memory_MemTotal_bytes{component="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_memory_bytes_total:sum - expr: | (node:node_memory_bytes_total:sum - 
node:node_memory_bytes_available:sum) / node:node_memory_bytes_total:sum record: node:node_memory_utilization:ratio - expr: | (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) / scalar(sum(node:node_memory_bytes_total:sum)) record: node:cluster_memory_utilization:ratio - expr: | 1e3 * sum( (rate(node_vmstat_pgpgin{component="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{component="node-exporter"}[1m])) ) record: :node_memory_swap_io_bytes:sum_rate - expr: | 1 - sum by (node) ( (node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"}) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) / sum by (node) ( node_memory_MemTotal_bytes{component="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: 'node:node_memory_utilization:' - expr: | 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) record: 'node:node_memory_utilization_2:' - expr: | 1e3 * sum by (node) ( (rate(node_vmstat_pgpgin{component="node-exporter"}[1m]) + rate(node_vmstat_pgpgout{component="node-exporter"}[1m])) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_memory_swap_io_bytes:sum_rate - expr: | avg(irate(node_disk_io_time_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) record: :node_disk_utilization:avg_irate - expr: | avg by (node) ( irate(node_disk_io_time_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_disk_utilization:avg_irate - expr: | avg(irate(node_disk_io_time_weighted_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) record: :node_disk_saturation:avg_irate - expr: | avg by (node) ( irate(node_disk_io_time_weighted_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_disk_saturation:avg_irate - expr: | max by (namespace, nodename, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) record: 'node:node_filesystem_usage:' - expr: | max by (namespace, nodename, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) record: 'node:node_filesystem_avail:' - expr: | sum(irate(node_network_receive_bytes_total{component="node-exporter",device!~"veth.+"}[1m])) + sum(irate(node_network_transmit_bytes_total{component="node-exporter",device!~"veth.+"}[1m])) record: :node_net_utilization:sum_irate - expr: | sum by (node) ( (irate(node_network_receive_bytes_total{component="node-exporter",device!~"veth.+"}[1m]) + irate(node_network_transmit_bytes_total{component="node-exporter",device!~"veth.+"}[1m])) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_net_utilization:sum_irate - expr: | sum(irate(node_network_receive_drop_total{component="node-exporter",device!~"veth.+"}[1m])) + sum(irate(node_network_transmit_drop_total{component="node-exporter",device!~"veth.+"}[1m])) record: :node_net_saturation:sum_irate - expr: | sum by (node) ( 
(irate(node_network_receive_drop_total{component="node-exporter",device!~"veth.+"}[1m]) + irate(node_network_transmit_drop_total{component="node-exporter",device!~"veth.+"}[1m])) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_net_saturation:sum_irate - expr: | max( max( kube_pod_info{component="kube-state-metrics", host_ip!=""} ) by (node, host_ip) * on (host_ip) group_right (node) label_replace( (max(node_filesystem_files{component="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" ) ) by (node) record: 'node:node_inodes_total:' - expr: | max( max( kube_pod_info{component="kube-state-metrics", host_ip!=""} ) by (node, host_ip) * on (host_ip) group_right (node) label_replace( (max(node_filesystem_files_free{component="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" ) ) by (node) record: 'node:node_inodes_free:' - name: cluster.rules rules: # Total number of CPU cores in the cluster. - expr: | sum(node:node_num_cpu:sum) record: cluster:cpu_total # Cluster-wide CPU usage rate in percent. - expr: | sum(node:cluster_cpu_utilization:ratio * 100) record: cluster:cpu_usage_rate # Cluster-wide total RAM in bytes. - expr: | sum(node:node_memory_bytes_total:sum) record: cluster:memory_total_bytes # Cluster-wide RAM usage in bytes. - expr: | sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum) record: cluster:memory_usage_bytes # Cluster-wide RAM usage rate in percent. - expr: | (sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum)) / scalar(sum(node:node_memory_bytes_total:sum)) * 100 record: cluster:memory_usage_rate - name: kube-prometheus-node-recording.rules rules: - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance) record: instance:node_cpu:rate:sum - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance) record: instance:node_filesystem_usage:sum - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) record: instance:node_network_receive_bytes:rate:sum - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) record: instance:node_network_transmit_bytes:rate:sum - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) record: instance:node_cpu:ratio - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) record: cluster:node_cpu:sum_rate5m - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) record: cluster:node_cpu:ratio - name: kubernetes-absent rules: - alert: KubeAPIDown annotations: message: KubeAPI has disappeared from Prometheus target discovery. expr: | absent(up{job="kubernetes-apiservers"} == 1) for: 15m labels: severity: critical - alert: NodeExporterDown annotations: message: NodeExporter has disappeared from Prometheus target discovery. expr: | absent(up{component="node-exporter"} == 1) for: 15m labels: severity: critical - alert: PrometheusDown annotations: message: Prometheus has disappeared from Prometheus target discovery. expr: | absent(up{job="prometheus"} == 1) for: 15m labels: severity: critical - alert: cAdvisorDown annotations: message: cAdvisor has disappeared from Prometheus target discovery.
expr: | absent(up{job="kubernetes-nodes-cadvisor"} == 1) for: 15m labels: severity: critical - name: kubernetes-apps rules: - alert: KubePodCrashLooping annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. expr: | rate(kube_pod_container_status_restarts_total{component="kube-state-metrics"}[15m]) * 60 * 5 > 0 for: 1h labels: severity: critical - alert: KubePodNotReady annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour. expr: | sum by (namespace, pod) (kube_pod_status_phase{component="kube-state-metrics", phase=~"Pending|Unknown"}) > 0 for: 1h labels: severity: critical - alert: KubeDeploymentGenerationMismatch annotations: message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. expr: | kube_deployment_status_observed_generation{component="kube-state-metrics"} != kube_deployment_metadata_generation{component="kube-state-metrics"} for: 15m labels: severity: critical - alert: KubeDeploymentReplicasMismatch annotations: message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than an hour. expr: | kube_deployment_spec_replicas{component="kube-state-metrics"} != kube_deployment_status_replicas_available{component="kube-state-metrics"} for: 1h labels: severity: critical - alert: KubeStatefulSetReplicasMismatch annotations: message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. expr: | kube_statefulset_status_replicas_ready{component="kube-state-metrics"} != kube_statefulset_status_replicas{component="kube-state-metrics"} for: 15m labels: severity: critical - alert: KubeStatefulSetGenerationMismatch annotations: message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. expr: | kube_statefulset_status_observed_generation{component="kube-state-metrics"} != kube_statefulset_metadata_generation{component="kube-state-metrics"} for: 15m labels: severity: critical - alert: KubeStatefulSetUpdateNotRolledOut annotations: message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. expr: | max without (revision) ( kube_statefulset_status_current_revision{component="kube-state-metrics"} unless kube_statefulset_status_update_revision{component="kube-state-metrics"} ) * ( kube_statefulset_replicas{component="kube-state-metrics"} != kube_statefulset_status_replicas_updated{component="kube-state-metrics"} ) for: 15m labels: severity: critical - alert: KubeDaemonSetRolloutStuck annotations: message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready. expr: | kube_daemonset_status_number_ready{component="kube-state-metrics"} / kube_daemonset_status_desired_number_scheduled{component="kube-state-metrics"} * 100 < 100 for: 15m labels: severity: critical - alert: KubeDaemonSetNotScheduled annotations: message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' 
expr: | kube_daemonset_status_desired_number_scheduled{component="kube-state-metrics"} - kube_daemonset_status_current_number_scheduled{component="kube-state-metrics"} > 0 for: 10m labels: severity: warning - alert: KubeDaemonSetMisScheduled annotations: message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' expr: | kube_daemonset_status_number_misscheduled{component="kube-state-metrics"} > 0 for: 10m labels: severity: warning - alert: KubeCronJobRunning annotations: message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete. expr: | time() - kube_cronjob_next_schedule_time{component="kube-state-metrics"} > 3600 for: 1h labels: severity: warning - alert: KubeJobCompletion annotations: message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete. expr: | kube_job_spec_completions{component="kube-state-metrics"} - kube_job_status_succeeded{component="kube-state-metrics"} > 0 for: 1h labels: severity: warning - alert: KubeJobFailed annotations: message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. expr: | kube_job_status_failed{component="kube-state-metrics"} > 0 for: 1h labels: severity: warning - name: kubernetes-resources rules: - alert: KubeCPUOvercommit annotations: message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. expr: | sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) / sum(node:node_num_cpu:sum) > (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum) for: 5m labels: severity: warning - alert: KubeMemOvercommit annotations: message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. expr: | sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) / sum(node_memory_MemTotal_bytes) > (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum) for: 5m labels: severity: warning - alert: KubeCPUOvercommit annotations: message: Cluster has overcommitted CPU resource requests for Namespaces. expr: | sum(kube_resourcequota{component="kube-state-metrics", type="hard", resource="cpu"}) / sum(node:node_num_cpu:sum) > 1.5 for: 5m labels: severity: warning - alert: KubeMemOvercommit annotations: message: Cluster has overcommitted memory resource requests for Namespaces. expr: | sum(kube_resourcequota{component="kube-state-metrics", type="hard", resource="memory"}) / sum(node_memory_MemTotal_bytes{component="node-exporter"}) > 1.5 for: 5m labels: severity: warning - alert: KubeQuotaExceeded annotations: message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value }}% of its {{ $labels.resource }} quota. expr: | 100 * kube_resourcequota{component="kube-state-metrics", type="used"} / ignoring(instance, job, type) (kube_resourcequota{component="kube-state-metrics", type="hard"} > 0) > 90 for: 15m labels: severity: warning - name: kubernetes-storage rules: - alert: KubePersistentVolumeUsageCritical annotations: message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value }}% free. 
expr: | 100 * kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} < 3 for: 1m labels: severity: critical - alert: KubePersistentVolumeFullInFourDays annotations: message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ printf "%0.2f" $value }}% is available. expr: | 100 * ( kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} ) < 15 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 for: 5m labels: severity: critical - alert: KubePersistentVolumeErrors annotations: message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. expr: | kube_persistentvolume_status_phase{phase=~"Failed|Pending",component="kube-state-metrics"} > 0 for: 5m labels: severity: critical - name: kubernetes-system rules: - alert: KubeNodeNotReady annotations: message: '{{ $labels.node }} has been unready for more than an hour.' expr: | kube_node_status_condition{component="kube-state-metrics",condition="Ready",status="true"} == 0 for: 1h labels: severity: warning - alert: KubeVersionMismatch annotations: message: There are {{ $value }} different semantic versions of Kubernetes components running. expr: | count(count by (gitVersion) (label_replace(kubernetes_build_info{job!="kube-dns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1 for: 1h labels: severity: warning - alert: KubeClientErrors annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors. expr: | (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) / sum(rate(rest_client_requests_total[5m])) by (instance, job)) * 100 > 1 for: 15m labels: severity: warning - alert: KubeClientErrors annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }} errors / second. expr: | sum(rate(ksm_scrape_error_total{component="kube-state-metrics"}[5m])) by (instance, job) > 0.1 for: 15m labels: severity: warning - alert: KubeletTooManyPods annotations: message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close to the limit of 110. expr: | kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 for: 15m labels: severity: warning - alert: KubeAPILatencyHigh annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}. expr: | cluster_quantile:apiserver_request_latencies:histogram_quantile{job="kubernetes-apiservers",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 for: 10m labels: severity: warning - alert: KubeAPILatencyHigh annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}. expr: | cluster_quantile:apiserver_request_latencies:histogram_quantile{job="kubernetes-apiservers",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 for: 10m labels: severity: critical - alert: KubeAPIErrorsHigh annotations: message: API server is returning errors for {{ $value }}% of requests.
expr: | sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) / sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) * 100 > 3 for: 10m labels: severity: critical - alert: KubeAPIErrorsHigh annotations: message: API server is returning errors for {{ $value }}% of requests. expr: | sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) / sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) * 100 > 1 for: 10m labels: severity: warning - alert: KubeAPIErrorsHigh annotations: message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. expr: | sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) / sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) by (resource,subresource,verb) * 100 > 10 for: 10m labels: severity: critical - alert: KubeAPIErrorsHigh annotations: message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. expr: | sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) / sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) by (resource,subresource,verb) * 100 > 5 for: 10m labels: severity: warning - alert: KubeClientCertificateExpiration annotations: message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. expr: | apiserver_client_certificate_expiration_seconds_count{job="kubernetes-apiservers"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 604800 labels: severity: warning - alert: KubeClientCertificateExpiration annotations: message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. expr: | apiserver_client_certificate_expiration_seconds_count{job="kubernetes-apiservers"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 86400 labels: severity: critical - name: alertmanager.rules rules: - alert: AlertmanagerConfigInconsistent annotations: message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` is out of sync. expr: | count_values("config_hash", alertmanager_config_hash{job="prometheus-alertmanager"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1 for: 5m labels: severity: critical - alert: AlertmanagerFailedReload annotations: message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. expr: | alertmanager_config_last_reload_successful{job="prometheus-alertmanager"} == 0 for: 10m labels: severity: warning - alert: AlertmanagerMembersInconsistent annotations: message: Alertmanager has not found all other members of the cluster.
expr: | alertmanager_cluster_members{job="prometheus-alertmanager"} != on (service) GROUP_LEFT() count by (service) (alertmanager_cluster_members{job="prometheus-alertmanager"}) for: 5m labels: severity: critical - name: etcd.rules rules: - alert: EtcdDown annotations: message: Etcd instance is down on node {{ $labels.node }}. expr: satellite_etcd_up == 0 for: 5m labels: severity: critical - alert: EtcdUnhealthy annotations: message: Etcd cluster is unhealthy. expr: satellite_etcd_health == 0 for: 1m labels: severity: critical - name: sysctl.rules rules: - alert: BrNetfilterMissing annotations: message: Bridge netfilter is disabled on node {{ $labels.node }} runbook_url: https://gravitational.com/gravity/docs/requirements/#br_netfilter-module expr: max_over_time(satellite_sysctl_br_netfilter[1h]) unless satellite_sysctl_br_netfilter or satellite_sysctl_br_netfilter == 0 for: 5m labels: severity: critical - alert: IPv4ForwardingMissing annotations: message: IPv4 forwarding is disabled on node {{ $labels.node }} runbook_url: https://gravitational.com/gravity/docs/faq/#ipv4-forwarding expr: max_over_time(satellite_sysctl_ipv4_forwarding[1h]) unless satellite_sysctl_ipv4_forwarding or satellite_sysctl_ipv4_forwarding == 0 for: 5m labels: severity: critical - name: docker.rules rules: - alert: DockerDown annotations: message: Docker daemon is down on host {{ $labels.node }} expr: satellite_docker_health == 0 for: 5m labels: severity: critical - name: systemd.rules rules: - alert: SystemdDegraded annotations: message: Systemd is degraded on host {{ $labels.node }} expr: satellite_systemd_health == 0 for: 5m labels: severity: critical - alert: SystemdUnitDegraded annotations: message: Systemd unit {{ $labels.unit_name }} is degraded on host {{ $labels.node }} expr: satellite_systemd_unit_health == 0 for: 5m labels: severity: critical - name: general.rules rules: - alert: TargetDown annotations: message: '{{ $value }}% of the {{ $labels.job }} targets are down.' expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 for: 10m labels: severity: warning - alert: Watchdog annotations: message: | This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the "DeadMansSnitch" integration in PagerDuty. expr: vector(1) labels: severity: none - name: kube-prometheus-node-alerting.rules rules: - alert: NodeDiskRunningFull annotations: message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} will be full within the next 24 hours. expr: | (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0) for: 30m labels: severity: warning - alert: NodeDiskRunningFull annotations: message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} will be full within the next 2 hours.
expr: | (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0) for: 10m labels: severity: critical - name: node-network rules: - alert: NetworkReceiveErrors annotations: message: Network interface "{{ $labels.device }}" showing receive errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} expr: | rate(node_network_receive_errs_total{component="node-exporter",device!~"veth.+"}[2m]) > 0 for: 2m labels: severity: warning - alert: NetworkTransmitErrors annotations: message: Network interface "{{ $labels.device }}" showing transmit errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} expr: | rate(node_network_transmit_errs_total{component="node-exporter",device!~"veth.+"}[2m]) > 0 for: 2m labels: severity: warning - alert: NodeNetworkInterfaceFlapping annotations: message: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} expr: | changes(node_network_up{component="node-exporter",device!~"veth.+"}[2m]) > 2 for: 2m labels: severity: warning - name: prometheus.rules rules: - alert: PrometheusConfigReloadFailed annotations: description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} summary: Reloading Prometheus' configuration failed expr: | prometheus_config_last_reload_successful{job="prometheus"} == 0 for: 10m labels: severity: warning - alert: PrometheusNotificationQueueRunningFull annotations: description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}} summary: Prometheus' alert notification queue is running full expr: | predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus"} for: 10m labels: severity: warning - alert: PrometheusErrorSendingAlerts annotations: description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}} summary: Errors while sending alerts from Prometheus expr: | rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.01 for: 10m labels: severity: warning - alert: PrometheusErrorSendingAlerts annotations: description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}} summary: Errors while sending alerts from Prometheus expr: | rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.03 for: 10m labels: severity: critical - alert: PrometheusNotConnectedToAlertmanagers annotations: description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers summary: Prometheus is not connected to any Alertmanagers expr: | prometheus_notifications_alertmanagers_discovered{job="prometheus"} < 1 for: 10m labels: severity: warning - alert: PrometheusTSDBReloadsFailing annotations: description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.'
summary: Prometheus has issues reloading data blocks from disk expr: | increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[2h]) > 0 for: 12h labels: severity: warning - alert: PrometheusTSDBCompactionsFailing annotations: description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.' summary: Prometheus has issues compacting sample blocks expr: | increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[2h]) > 0 for: 12h labels: severity: warning - alert: PrometheusTSDBWALCorruptions annotations: description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).' summary: Prometheus write-ahead log is corrupted expr: | prometheus_tsdb_wal_corruptions_total{job="prometheus"} > 0 for: 4h labels: severity: warning - alert: PrometheusNotIngestingSamples annotations: description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples. summary: Prometheus isn't ingesting samples expr: | rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0 for: 10m labels: severity: warning - alert: PrometheusTargetScrapesDuplicate annotations: description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values' summary: Prometheus has many samples rejected expr: | increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0 for: 10m labels: severity: warning # groups: # - name: Instances # rules: # - alert: InstanceDown # expr: up == 0 # for: 5m # labels: # severity: page # annotations: # description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.' # summary: 'Instance {{ $labels.instance }} down' ## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use alerting_rules.yml alerts: {} ## Records configuration ## Ref: https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/ recording_rules.yml: {}
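## Example (commented out): a minimal recording rule group that could be
## placed under recording_rules.yml above. The group and rule names below are
## illustrative only and not part of this chart's defaults.
# recording_rules.yml:
#   groups:
#     - name: example.rules
#       rules:
#         # Precompute the number of healthy targets per job.
#         - record: job:up:sum
#           expr: sum(up) by (job)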
## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use recording_rules.yml rules: {} prometheus.yml: rule_files: - /etc/config/recording_rules.yml - /etc/config/alerting_rules.yml ## Below two files are DEPRECATED and will be removed from this default values file - /etc/config/rules - /etc/config/alerts scrape_configs: - job_name: prometheus static_configs: - targets: - localhost:9090 # A scrape configuration for running Prometheus on a Kubernetes cluster. # This uses separate scrape configs for cluster components (i.e. API server, node) # and services to allow each to use different authentication configs. # # Kubernetes labels will be added as Prometheus labels on metrics via the # `labelmap` relabeling action. # Scrape config for API servers. # # Kubernetes exposes API servers as endpoints to the default/kubernetes # service so this uses `endpoints` role and uses relabelling to only keep # the endpoints associated with the default/kubernetes service using the # default named port `https`. This works for single API server deployments as # well as HA API server deployments. - job_name: 'kubernetes-apiservers' kubernetes_sd_configs: - role: endpoints # Default to scraping over https. If required, just disable this or change to # `http`. scheme: https # This TLS & bearer token file config is used to connect to the actual scrape # endpoints for cluster components. This is separate from discovery auth # configuration because discovery & scraping are two separate concerns in # Prometheus. The discovery auth config is automatic if Prometheus runs inside # the cluster. Otherwise, more config options have to be provided within the # <kubernetes_sd_config>. tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt # If your node certificates are self-signed or use a different CA to the # master CA, then disable certificate verification below. Note that # certificate verification is an integral part of a secure infrastructure # so this should only be disabled in a controlled environment. You can # disable certificate verification by uncommenting the line below. # insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token # Keep only the default/kubernetes service endpoints for the https port. This # will add targets for each API server which Kubernetes adds an endpoint to # the default/kubernetes service. relabel_configs: - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: default;kubernetes;https - job_name: 'kubernetes-nodes' # Default to scraping over https. If required, just disable this or change to # `http`. scheme: https # This TLS & bearer token file config is used to connect to the actual scrape # endpoints for cluster components. This is separate from discovery auth # configuration because discovery & scraping are two separate concerns in # Prometheus. The discovery auth config is automatic if Prometheus runs inside # the cluster. Otherwise, more config options have to be provided within the # <kubernetes_sd_config>. tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt # If your node certificates are self-signed or use a different CA to the # master CA, then disable certificate verification below. Note that # certificate verification is an integral part of a secure infrastructure # so this should only be disabled in a controlled environment. You can # disable certificate verification by uncommenting the line below. # insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token kubernetes_sd_configs: - role: node relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - target_label: __address__ replacement: kubernetes.default.svc:443 - source_labels: [__meta_kubernetes_node_name] regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/$1/proxy/metrics
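# For illustration (not part of the chart defaults): with the relabeling
# above, a node named e.g. "node-1" is scraped through the API server proxy at
#   https://kubernetes.default.svc:443/api/v1/nodes/node-1/proxy/metrics
# i.e. __address__ is rewritten to the in-cluster API server address and
# __metrics_path__ to the per-node proxy path.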
- job_name: 'kubernetes-nodes-cadvisor' # Default to scraping over https. If required, just disable this or change to # `http`. scheme: https # This TLS & bearer token file config is used to connect to the actual scrape # endpoints for cluster components. This is separate from discovery auth # configuration because discovery & scraping are two separate concerns in # Prometheus. The discovery auth config is automatic if Prometheus runs inside # the cluster. Otherwise, more config options have to be provided within the # <kubernetes_sd_config>. tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt # If your node certificates are self-signed or use a different CA to the # master CA, then disable certificate verification below. Note that # certificate verification is an integral part of a secure infrastructure # so this should only be disabled in a controlled environment. You can # disable certificate verification by uncommenting the line below. # insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token kubernetes_sd_configs: - role: node # This configuration works only on kubelet 1.7.3+, as the scrape endpoints # for cAdvisor have changed. If you are using an older version, change the replacement to # replacement: /api/v1/nodes/$1:4194/proxy/metrics # More info here: https://github.com/coreos/prometheus-operator/issues/633 relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - target_label: __address__ replacement: kubernetes.default.svc:443 - source_labels: [__meta_kubernetes_node_name] regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor # Scrape config for service endpoints. # # The relabeling allows the actual service scrape endpoint to be configured # via the following annotations: # # * `prometheus.io/scrape`: Only scrape services that have a value of `true` # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need # to set this to `https` & most likely set the `tls_config` of the scrape config. # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. # * `prometheus.io/port`: If the metrics are exposed on a different port to the # service then set this appropriately. # A commented example Service follows the job below. - job_name: 'kubernetes-service-endpoints' kubernetes_sd_configs: - role: endpoints relabel_configs: - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] action: keep regex: true - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] action: replace target_label: __scheme__ regex: (https?) - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.+) - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] action: replace target_label: __address__ regex: ([^:]+)(?::\d+)?;(\d+) replacement: $1:$2 - action: labelmap regex: __meta_kubernetes_service_label_(.+) - source_labels: [__meta_kubernetes_namespace] action: replace target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_service_name] action: replace target_label: kubernetes_name - source_labels: [__meta_kubernetes_pod_node_name] action: replace target_label: kubernetes_node
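# Example (commented out; illustrative only, not a chart value): a Service
# exposing metrics on port 9100 at /metrics would be picked up by the
# 'kubernetes-service-endpoints' job above with annotations like:
#
# apiVersion: v1
# kind: Service
# metadata:
#   name: my-app                      # hypothetical service name
#   annotations:
#     prometheus.io/scrape: "true"    # opt in to scraping
#     prometheus.io/port: "9100"      # metrics port, if not the service port
#     prometheus.io/path: "/metrics"  # the default; shown for completeness
# spec:
#   ports:
#     - port: 9100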
# Scrape config for slow service endpoints; same as above, but with a larger # timeout and a larger interval # # The relabeling allows the actual service scrape endpoint to be configured # via the following annotations: # # * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true` # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need # to set this to `https` & most likely set the `tls_config` of the scrape config. # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. # * `prometheus.io/port`: If the metrics are exposed on a different port to the # service then set this appropriately. - job_name: 'kubernetes-service-endpoints-slow' scrape_interval: 5m scrape_timeout: 30s kubernetes_sd_configs: - role: endpoints relabel_configs: - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape_slow] action: keep regex: true - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] action: replace target_label: __scheme__ regex: (https?) - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.+) - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] action: replace target_label: __address__ regex: ([^:]+)(?::\d+)?;(\d+) replacement: $1:$2 - action: labelmap regex: __meta_kubernetes_service_label_(.+) - source_labels: [__meta_kubernetes_namespace] action: replace target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_service_name] action: replace target_label: kubernetes_name - source_labels: [__meta_kubernetes_pod_node_name] action: replace target_label: kubernetes_node - job_name: 'prometheus-pushgateway' honor_labels: true kubernetes_sd_configs: - role: service relabel_configs: - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] action: keep regex: pushgateway # Example scrape config for probing services via the Blackbox Exporter. # # The relabeling allows the actual service scrape endpoint to be configured # via the following annotations: # # * `prometheus.io/probe`: Only probe services that have a value of `true` - job_name: 'kubernetes-services' metrics_path: /probe params: module: [http_2xx] kubernetes_sd_configs: - role: service relabel_configs: - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] action: keep regex: true - source_labels: [__address__] target_label: __param_target - target_label: __address__ replacement: blackbox - source_labels: [__param_target] target_label: instance - action: labelmap regex: __meta_kubernetes_service_label_(.+) - source_labels: [__meta_kubernetes_namespace] target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_service_name] target_label: kubernetes_name # Example scrape config for pods # # The relabeling allows the actual pod scrape endpoint to be configured via the # following annotations: # # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. # A commented example of these annotations follows the job below. - job_name: 'kubernetes-pods' kubernetes_sd_configs: - role: pod relabel_configs: - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] action: keep regex: true - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.+) - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] action: replace regex: ([^:]+)(?::\d+)?;(\d+) replacement: $1:$2 target_label: __address__ - action: labelmap regex: __meta_kubernetes_pod_label_(.+) - source_labels: [__meta_kubernetes_namespace] action: replace target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_pod_name] action: replace target_label: kubernetes_pod_name - source_labels: [__meta_kubernetes_pod_phase] regex: Pending|Succeeded|Failed action: drop
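# Example (commented out; illustrative only): pod metadata that the
# 'kubernetes-pods' job above would scrape, assuming the application serves
# metrics on port 8080:
#
# metadata:
#   annotations:
#     prometheus.io/scrape: "true"
#     prometheus.io/path: "/metrics"
#     prometheus.io/port: "8080"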
# Example scrape config for pods which should be scraped slower. A useful # example would be stackdriver-exporter, which queries an API on every scrape # of the pod. # # The relabeling allows the actual pod scrape endpoint to be configured via the # following annotations: # # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true` # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. - job_name: 'kubernetes-pods-slow' scrape_interval: 5m scrape_timeout: 30s kubernetes_sd_configs: - role: pod relabel_configs: - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow] action: keep regex: true - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.+) - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] action: replace regex: ([^:]+)(?::\d+)?;(\d+) replacement: $1:$2 target_label: __address__ - action: labelmap regex: __meta_kubernetes_pod_label_(.+) - source_labels: [__meta_kubernetes_namespace] action: replace target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_pod_name] action: replace target_label: kubernetes_pod_name - source_labels: [__meta_kubernetes_pod_phase] regex: Pending|Succeeded|Failed action: drop # Adds additional scrape configs to prometheus.yml. # Must be a string, so remember to add a | after extraScrapeConfigs: # The example below adds a prometheus-blackbox-exporter scrape config. extraScrapeConfigs: # - job_name: 'prometheus-blackbox-exporter' # metrics_path: /probe # params: # module: [http_2xx] # static_configs: # - targets: # - https://example.com # relabel_configs: # - source_labels: [__address__] # target_label: __param_target # - source_labels: [__param_target] # target_label: instance # - target_label: __address__ # replacement: prometheus-blackbox-exporter:9115 # Adds the option to set alert_relabel_configs to avoid duplicate alerts in Alertmanager; # useful in HA Prometheus setups with different external labels but the same alerts. alertRelabelConfigs: # alert_relabel_configs: # - source_labels: [dc] # regex: (.+)\d+ # target_label: dc networkPolicy: ## Enable creation of NetworkPolicy resources. ## enabled: true # Force namespace of namespaced resources forceNamespace: null
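# Example (commented out; the namespace name is illustrative): render all
# namespaced resources into a fixed namespace regardless of the release:
# forceNamespace: monitoring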