From fa3e2b6934d1ff1556435182732685090dc37fad Mon Sep 17 00:00:00 2001 From: Marius Ensrud Date: Wed, 28 Oct 2020 13:01:25 +0100 Subject: [PATCH] update rules --- charts/prometheus/v6.3.0/values.yaml | 1546 +++++++++++++++++++++++++- 1 file changed, 1532 insertions(+), 14 deletions(-) diff --git a/charts/prometheus/v6.3.0/values.yaml b/charts/prometheus/v6.3.0/values.yaml index f76e630..5d7cb83 100644 --- a/charts/prometheus/v6.3.0/values.yaml +++ b/charts/prometheus/v6.3.0/values.yaml @@ -1029,7 +1029,7 @@ server: loadBalancerIP: "" loadBalancerSourceRanges: [] - servicePort: 80 + servicePort: 9090 sessionAffinity: None type: ClusterIP @@ -1263,20 +1263,55 @@ pushgateway: ## alertmanagerFiles: alertmanager.yml: - global: {} - # slack_api_url: '' + global: + resolve_timeout: 30s + route: + group_by: ["alertname"] + group_wait: 5s + group_interval: 10s + repeat_interval: 999h + receiver: "default" + routes: + - receiver: "default" + group_by: [] + match_re: + alertname: .* + continue: true + - receiver: "watchdog" + group_by: ["alertname", "instance"] + match_re: + alertname: Watchdog + continue: false + - receiver: "by-cluster-service" + group_by: ["alertname", "cluster", "service"] + match_re: + alertname: .* + continue: true + - receiver: "by-name" + group_by: [alertname] + match_re: + alertname: .* + continue: true + - receiver: "by-cluster" + group_by: [cluster] + match_re: + alertname: .* + continue: true + + inhibit_rules: + - source_match: + severity: "critical" + target_match: + severity: "warning" + # Apply inhibition if the alertname and cluster are the same in both + equal: ["alertname", "cluster"] receivers: - - name: default-receiver - # slack_configs: - # - channel: '@you' - # send_resolved: true - - route: - group_wait: 10s - group_interval: 5m - receiver: default-receiver - repeat_interval: 3h + - name: "default" + - name: "watchdog" + - name: "by-cluster-service" + - name: "by-name" + - name: "by-cluster" ## Prometheus server ConfigMap entries ## @@ -1284,7 +1319,1486 @@ serverFiles: ## Alerts configuration ## Ref: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ - alerting_rules.yml: {} + + #alerting_rules.yml: {} + alerting_rules.yml: + groups: + - name: prometheus + rules: + - alert: PrometheusJobMissing + expr: absent(up{job="prometheus"}) + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus job missing (instance {{ $labels.instance }})" + description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusTargetMissing + expr: up == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus target missing (instance {{ $labels.instance }})" + description: "A Prometheus target has disappeared.
An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusAllTargetsMissing + expr: count by (job) (up) == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus all targets missing (instance {{ $labels.instance }})" + description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusConfigurationReloadFailure + expr: prometheus_config_last_reload_successful != 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})" + description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusTooManyRestarts + expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus too many restarts (instance {{ $labels.instance }})" + description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusAlertmanagerConfigurationReloadFailure + expr: alertmanager_config_last_reload_successful != 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})" + description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusAlertmanagerConfigNotSynced + expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus AlertManager config not synced (instance {{ $labels.instance }})" + description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusNotConnectedToAlertmanager + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})" + description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusRuleEvaluationFailures + expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})" + description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusTemplateTextExpansionFailures + expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})" + description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusRuleEvaluationSlow + expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus rule evaluation slow (instance {{ $labels.instance }})" + description: "Prometheus rule evaluation took more time than the scheduled interval. 
I indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusNotificationsBacklog + expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus notifications backlog (instance {{ $labels.instance }})" + description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusAlertmanagerNotificationFailing + expr: rate(alertmanager_notifications_failed_total[1m]) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})" + description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusTargetEmpty + expr: prometheus_sd_discovered_targets == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus target empty (instance {{ $labels.instance }})" + description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusTargetScrapingSlow + expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60 + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus target scraping slow (instance {{ $labels.instance }})" + description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusLargeScrape + expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus large scrape (instance {{ $labels.instance }})" + description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusTargetScrapeDuplicate + expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus target scrape duplicate (instance {{ $labels.instance }})" + description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusTsdbCheckpointCreationFailures + expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})" + description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusTsdbCheckpointDeletionFailures + expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[3m]) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})" + description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusTsdbCompactionsFailed + expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})" + description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: 
PrometheusTsdbHeadTruncationsFailed + expr: increase(prometheus_tsdb_head_truncations_failed_total[3m]) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})" + description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusTsdbReloadFailures + expr: increase(prometheus_tsdb_reloads_failures_total[3m]) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus TSDB reload failures (instance {{ $labels.instance }})" + description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusTsdbWalCorruptions + expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})" + description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: PrometheusTsdbWalTruncationsFailed + expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})" + description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + + - name: node-exporter + rules: + - alert: HostOutOfMemory + expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Host out of memory (instance {{ $labels.instance }})" + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostMemoryUnderMemoryPressure + expr: rate(node_vmstat_pgmajfault[1m]) > 1000 + for: 5m + labels: + severity: warning + annotations: + summary: "Host memory under memory pressure (instance {{ $labels.instance }})" + description: "The node is under heavy memory pressure. 
High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostUnusualNetworkThroughputIn + expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100 + for: 5m + labels: + severity: warning + annotations: + summary: "Host unusual network throughput in (instance {{ $labels.instance }})" + description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostUnusualNetworkThroughputOut + expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100 + for: 5m + labels: + severity: warning + annotations: + summary: "Host unusual network throughput out (instance {{ $labels.instance }})" + description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostUnusualDiskReadRate + expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50 + for: 5m + labels: + severity: warning + annotations: + summary: "Host unusual disk read rate (instance {{ $labels.instance }})" + description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostUnusualDiskWriteRate + expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50 + for: 5m + labels: + severity: warning + annotations: + summary: "Host unusual disk write rate (instance {{ $labels.instance }})" + description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + # please add ignored mountpoints in node_exporter parameters like + # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)" + - alert: HostOutOfDiskSpace + expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Host out of disk space (instance {{ $labels.instance }})" + description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostDiskWillFillIn4Hours + expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})" + description: "Disk will fill in 4 hours at current write rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostOutOfInodes + expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint ="/rootfs"} * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Host out of inodes (instance {{ $labels.instance }})" + description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostUnusualDiskReadLatency + expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Host unusual disk read latency (instance {{ $labels.instance }})" + description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostUnusualDiskWriteLatency + expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0 + for: 5m + labels: + severity: warning + 
annotations: + summary: "Host unusual disk write latency (instance {{ $labels.instance }})" + description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostHighCpuLoad + expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "Host high CPU load (instance {{ $labels.instance }})" + description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + # 1000 context switches is an arbitrary number. + # Alert threshold depends on nature of application. + # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 + - alert: HostContextSwitching + expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 5000 + for: 5m + labels: + severity: warning + annotations: + summary: "Host context switching (instance {{ $labels.instance }})" + description: "Context switching is growing on node (> 5000 / s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostSwapIsFillingUp + expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "Host swap is filling up (instance {{ $labels.instance }})" + description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostSystemdServiceCrashed + expr: node_systemd_unit_state{state="failed"} == 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Host SystemD service crashed (instance {{ $labels.instance }})" + description: "SystemD service crashed\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + + + - alert: HostKernelVersionDeviations + expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Host kernel version deviations (instance {{ $labels.instance }})" + description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostOomKillDetected + expr: increase(node_vmstat_oom_kill[5m]) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Host OOM kill detected (instance {{ $labels.instance }})" + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + + - name: cadvisor + rules: + - alert: ContainerKilled + expr: time() - container_last_seen > 60 + for: 5m + labels: + severity: warning + annotations: + summary: "Container killed (instance {{ $labels.instance }})" + description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + # cAdvisor can sometimes consume a lot of CPU, so this alert will fire constantly. 
+ # If you want to exclude it from this alert, just use: container_cpu_usage_seconds_total{name!=""} + - alert: ContainerCpuUsage + expr: (sum(rate(container_cpu_usage_seconds_total{image!=""}[3m])) BY (instance, name) * 100) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "Container CPU usage (instance {{ $labels.instance }})" + description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + # See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d + - alert: ContainerMemoryUsage + expr: (sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "Container Memory usage (instance {{ $labels.instance }})" + description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: ContainerVolumeUsage + expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "Container Volume usage (instance {{ $labels.instance }})" + description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: ContainerVolumeIoUsage + expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "Container Volume IO usage (instance {{ $labels.instance }})" + description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: ContainerHighThrottleRate + expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Container high throttle rate (instance {{ $labels.instance }})" + description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - name: k8s.rules + rules: + - expr: | + sum(rate(container_cpu_usage_seconds_total{image!="", container!="POD",namespace!=""}[5m])) by (namespace) + record: namespace:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum by (namespace, pod, container) ( + rate(container_cpu_usage_seconds_total{image!="", container!="POD",namespace!=""}[5m]) + ) + record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum(container_memory_usage_bytes{image!="", container!="POD",namespace!=""}) by (namespace) + record: namespace:container_memory_usage_bytes:sum + - expr: | + sum by (namespace, label_name) ( + sum(rate(container_cpu_usage_seconds_total{image!="", container!="POD"}[5m])) by (namespace, pod) + * on (namespace, pod) group_left(label_name) + label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)") + ) + record: namespace_name:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum by (namespace, label_name) ( + sum(container_memory_usage_bytes{image!="", container!="POD"}) by (pod, namespace) + * on (namespace, pod) group_left(label_name) + label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)") + ) + record: namespace_name:container_memory_usage_bytes:sum + - expr: | + sum by (namespace, label_name) ( + sum(kube_pod_container_resource_requests_memory_bytes{component="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) 
(kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod) + * on (namespace, pod) group_left(label_name) + label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)") + ) + record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum + - expr: | + sum by (namespace, label_name) ( + sum(kube_pod_container_resource_requests_cpu_cores{component="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod) + * on (namespace, pod) group_left(label_name) + label_replace(kube_pod_labels{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)") + ) + record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum + - expr: | + sum( + label_replace( + label_replace( + kube_pod_owner{component="kube-state-metrics", owner_kind="ReplicaSet"}, + "replicaset", "$1", "owner_name", "(.*)" + ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{component="kube-state-metrics"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) by (namespace, workload, pod) + labels: + workload_type: deployment + record: mixin_pod_workload + - expr: | + sum( + label_replace( + kube_pod_owner{component="kube-state-metrics", owner_kind="DaemonSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) by (namespace, workload, pod) + labels: + workload_type: daemonset + record: mixin_pod_workload + - expr: | + sum( + label_replace( + kube_pod_owner{component="kube-state-metrics", owner_kind="StatefulSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) by (namespace, workload, pod) + labels: + workload_type: statefulset + record: mixin_pod_workload + + - name: kube-scheduler.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 
1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + + - name: kube-apiserver.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="kubernetes-apiservers"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + + - name: node.rules + rules: + - expr: sum(min(kube_pod_info) by (node)) + record: ':kube_pod_info_node_count:' + - expr: | + max(label_replace(kube_pod_info{component="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod) + record: 'node_namespace_pod:kube_pod_info:' + - expr: | + count by (node) (sum by (node, cpu) ( + node_cpu_seconds_total{component="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + )) + record: node:node_num_cpu:sum + - expr: | + 1 - avg(rate(node_cpu_seconds_total{component="node-exporter",mode="idle"}[1m])) + record: :node_cpu_utilization:avg1m + - expr: | + 1 - avg by (node) ( + rate(node_cpu_seconds_total{component="node-exporter",mode="idle"}[1m]) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info:) + record: node:node_cpu_utilization:avg1m + - expr: | + node:node_cpu_utilization:avg1m + * + node:node_num_cpu:sum + / + scalar(sum(node:node_num_cpu:sum)) + record: node:cluster_cpu_utilization:ratio + - expr: | + sum(node_load1{component="node-exporter"}) + / + sum(node:node_num_cpu:sum) + record: ':node_cpu_saturation_load1:' + - expr: | + sum by (node) ( + node_load1{component="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + / + node:node_num_cpu:sum + record: 'node:node_cpu_saturation_load1:' + - expr: | + 1 - + sum(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"}) + / + sum(node_memory_MemTotal_bytes{component="node-exporter"}) + record: ':node_memory_utilization:' + - expr: | + sum(node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"}) + record: :node_memory_MemFreeCachedBuffers_bytes:sum + - expr: | + sum(node_memory_MemTotal_bytes{component="node-exporter"}) + record: :node_memory_MemTotal_bytes:sum + - expr: | + sum by (node) ( + (node_memory_MemFree_bytes{component="node-exporter"} 
+ node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"}) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_bytes_available:sum + - expr: | + sum by (node) ( + node_memory_MemTotal_bytes{component="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_bytes_total:sum + - expr: | + (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) + / + node:node_memory_bytes_total:sum + record: node:node_memory_utilization:ratio + - expr: | + (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) + / + scalar(sum(node:node_memory_bytes_total:sum)) + record: node:cluster_memory_utilization:ratio + - expr: | + 1e3 * sum( + (rate(node_vmstat_pgpgin{component="node-exporter"}[1m]) + + rate(node_vmstat_pgpgout{component="node-exporter"}[1m])) + ) + record: :node_memory_swap_io_bytes:sum_rate + - expr: | + 1 - + sum by (node) ( + (node_memory_MemFree_bytes{component="node-exporter"} + node_memory_Cached_bytes{component="node-exporter"} + node_memory_Buffers_bytes{component="node-exporter"}) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + / + sum by (node) ( + node_memory_MemTotal_bytes{component="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: 'node:node_memory_utilization:' + - expr: | + 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) + record: 'node:node_memory_utilization_2:' + - expr: | + 1e3 * sum by (node) ( + (rate(node_vmstat_pgpgin{component="node-exporter"}[1m]) + + rate(node_vmstat_pgpgout{component="node-exporter"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_swap_io_bytes:sum_rate + - expr: | + avg(irate(node_disk_io_time_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) + record: :node_disk_utilization:avg_irate + - expr: | + avg by (node) ( + irate(node_disk_io_time_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_disk_utilization:avg_irate + - expr: | + avg(irate(node_disk_io_time_weighted_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) + record: :node_disk_saturation:avg_irate + - expr: | + avg by (node) ( + irate(node_disk_io_time_weighted_seconds_total{component="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_disk_saturation:avg_irate + - expr: | + max by (namespace, nodename, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} + - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) + / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) + record: 'node:node_filesystem_usage:' + - expr: | + max by (namespace, nodename, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) + record: 'node:node_filesystem_avail:' + - expr: | + sum(irate(node_network_receive_bytes_total{component="node-exporter",device!~"veth.+"}[1m])) + + sum(irate(node_network_transmit_bytes_total{component="node-exporter",device!~"veth.+"}[1m])) + record: 
:node_net_utilization:sum_irate + - expr: | + sum by (node) ( + (irate(node_network_receive_bytes_total{component="node-exporter",device!~"veth.+"}[1m]) + + irate(node_network_transmit_bytes_total{component="node-exporter",device!~"veth.+"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_net_utilization:sum_irate + - expr: | + sum(irate(node_network_receive_drop_total{component="node-exporter",device!~"veth.+"}[1m])) + + sum(irate(node_network_transmit_drop_total{component="node-exporter",device!~"veth.+"}[1m])) + record: :node_net_saturation:sum_irate + - expr: | + sum by (node) ( + (irate(node_network_receive_drop_total{component="node-exporter",device!~"veth.+"}[1m]) + + irate(node_network_transmit_drop_total{component="node-exporter",device!~"veth.+"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_net_saturation:sum_irate + - expr: | + max( + max( + kube_pod_info{component="kube-state-metrics", host_ip!=""} + ) by (node, host_ip) + * on (host_ip) group_right (node) + label_replace( + (max(node_filesystem_files{component="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" + ) + ) by (node) + record: 'node:node_inodes_total:' + - expr: | + max( + max( + kube_pod_info{component="kube-state-metrics", host_ip!=""} + ) by (node, host_ip) + * on (host_ip) group_right (node) + label_replace( + (max(node_filesystem_files_free{component="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" + ) + ) by (node) + record: 'node:node_inodes_free:' + - name: cluster.rules + rules: + # Total number of CPU cores in the cluster. + - expr: | + sum(node:node_num_cpu:sum) + record: cluster:cpu_total + # Cluster-wide CPU usage rate in percent. + - expr: | + sum(node:cluster_cpu_utilization:ratio * 100) + record: cluster:cpu_usage_rate + # Cluster-wide total RAM in bytes. + - expr: | + sum(node:node_memory_bytes_total:sum) + record: cluster:memory_total_bytes + # Cluster-wide RAM usage in bytes. + - expr: | + sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum) + record: cluster:memory_usage_bytes + # Cluster-wide RAM usage rate in percent. 
+ - expr: | + (sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum)) / scalar(sum(node:node_memory_bytes_total:sum)) * 100 + record: cluster:memory_usage_rate + - name: kube-prometheus-node-recording.rules + rules: + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY + (instance) + record: instance:node_cpu:rate:sum + - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) + BY (instance) + record: instance:node_filesystem_usage:sum + - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT + (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) + BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) + record: cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) + BY (instance, cpu)) + record: cluster:node_cpu:ratio + - name: kubernetes-absent + rules: + - alert: KubeAPIDown + annotations: + message: KubeAPI has disappeared from Prometheus target discovery. + expr: | + absent(up{job="kubernetes-apiservers"} == 1) + for: 15m + labels: + severity: critical + + - alert: NodeExporterDown + annotations: + message: NodeExporter has disappeared from Prometheus target discovery. + expr: | + absent(up{component="node-exporter"} == 1) + for: 15m + labels: + severity: critical + - alert: PrometheusDown + annotations: + message: Prometheus has disappeared from Prometheus target discovery. + expr: | + absent(up{job="prometheus"} == 1) + for: 15m + labels: + severity: critical + - alert: cAdvisorDown + annotations: + message: cAdvisor has disappeared from Prometheus target discovery. + expr: | + absent(up{job="kubernetes-nodes-cadvisor"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container + }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. + expr: | + rate(kube_pod_container_status_restarts_total{component="kube-state-metrics"}[15m]) * 60 * 5 > 0 + for: 1h + labels: + severity: critical + - alert: KubePodNotReady + annotations: + message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready + state for longer than an hour. + expr: | + sum by (namespace, pod) (kube_pod_status_phase{component="kube-state-metrics", phase=~"Pending|Unknown"}) > 0 + for: 1h + labels: + severity: critical + - alert: KubeDeploymentGenerationMismatch + annotations: + message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment + }} does not match, this indicates that the Deployment has failed but has + not been rolled back. + expr: | + kube_deployment_status_observed_generation{component="kube-state-metrics"} + != + kube_deployment_metadata_generation{component="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeDeploymentReplicasMismatch + annotations: + message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not + matched the expected number of replicas for longer than an hour.
+ expr: | + kube_deployment_spec_replicas{component="kube-state-metrics"} + != + kube_deployment_status_replicas_available{component="kube-state-metrics"} + for: 1h + labels: + severity: critical + - alert: KubeStatefulSetReplicasMismatch + annotations: + message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has + not matched the expected number of replicas for longer than 15 minutes. + expr: | + kube_statefulset_status_replicas_ready{component="kube-state-metrics"} + != + kube_statefulset_status_replicas{component="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetGenerationMismatch + annotations: + message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset + }} does not match, this indicates that the StatefulSet has failed but has + not been rolled back. + expr: | + kube_statefulset_status_observed_generation{component="kube-state-metrics"} + != + kube_statefulset_metadata_generation{component="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update + has not been rolled out. + expr: | + max without (revision) ( + kube_statefulset_status_current_revision{component="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{component="kube-state-metrics"} + ) + * + ( + kube_statefulset_replicas{component="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{component="kube-state-metrics"} + ) + for: 15m + labels: + severity: critical + - alert: KubeDaemonSetRolloutStuck + annotations: + message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace + }}/{{ $labels.daemonset }} are scheduled and ready. + expr: | + kube_daemonset_status_number_ready{component="kube-state-metrics"} + / + kube_daemonset_status_desired_number_scheduled{component="kube-state-metrics"} * 100 < 100 + for: 15m + labels: + severity: critical + - alert: KubeDaemonSetNotScheduled + annotations: + message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset + }} are not scheduled.' + expr: | + kube_daemonset_status_desired_number_scheduled{component="kube-state-metrics"} + - + kube_daemonset_status_current_number_scheduled{component="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset + }} are running where they are not supposed to run.' + expr: | + kube_daemonset_status_number_misscheduled{component="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeCronJobRunning + annotations: + message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more + than 1h to complete. + expr: | + time() - kube_cronjob_next_schedule_time{component="kube-state-metrics"} > 3600 + for: 1h + labels: + severity: warning + - alert: KubeJobCompletion + annotations: + message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more + than one hour to complete. + expr: | + kube_job_spec_completions{component="kube-state-metrics"} - kube_job_status_succeeded{component="kube-state-metrics"} > 0 + for: 1h + labels: + severity: warning + - alert: KubeJobFailed + annotations: + message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. 
+ expr: | + kube_job_status_failed{component="kube-state-metrics"} > 0 + for: 1h + labels: + severity: warning + + - name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + message: Cluster has overcommitted CPU resource requests for Pods and cannot + tolerate node failure. + expr: | + sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) + / + sum(node:node_num_cpu:sum) + > + (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum) + for: 5m + labels: + severity: warning + - alert: KubeMemOvercommit + annotations: + message: Cluster has overcommitted memory resource requests for Pods and cannot + tolerate node failure. + expr: | + sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) + / + sum(node_memory_MemTotal_bytes) + > + (count(node:node_num_cpu:sum)-1) + / + count(node:node_num_cpu:sum) + for: 5m + labels: + severity: warning + - alert: KubeCPUOvercommit + annotations: + message: Cluster has overcommitted CPU resource requests for Namespaces. + expr: | + sum(kube_resourcequota{component="kube-state-metrics", type="hard", resource="cpu"}) + / + sum(node:node_num_cpu:sum) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemOvercommit + annotations: + message: Cluster has overcommitted memory resource requests for Namespaces. + expr: | + sum(kube_resourcequota{component="kube-state-metrics", type="hard", resource="memory"}) + / + sum(node_memory_MemTotal_bytes{component="node-exporter"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaExceeded + annotations: + message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value + }}% of its {{ $labels.resource }} quota. + expr: | + 100 * kube_resourcequota{component="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{component="kube-state-metrics", type="hard"} > 0) + > 90 + for: 15m + labels: + severity: warning + - name: kubernetes-storage + rules: + - alert: KubePersistentVolumeUsageCritical + annotations: + message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value + }}% free. + expr: | + 100 * kubelet_volume_stats_available_bytes{job="kubelet"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet"} + < 3 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFullInFourDays + annotations: + message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} is expected to fill up within four + days. Currently {{ printf "%0.2f" $value }}% is available. + expr: | + 100 * ( + kubelet_volume_stats_available_bytes{job="kubelet"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet"} + ) < 15 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 + for: 5m + labels: + severity: critical + - alert: KubePersistentVolumeErrors + annotations: + message: The persistent volume {{ $labels.persistentvolume }} has status {{ + $labels.phase }}. + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",component="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical + - name: kubernetes-system + rules: + - alert: KubeNodeNotReady + annotations: + message: '{{ $labels.node }} has been unready for more than an hour.' 
+ expr: | + kube_node_status_condition{component="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 1h + labels: + severity: warning + - alert: KubeVersionMismatch + annotations: + message: There are {{ $value }} different semantic versions of Kubernetes + components running. + expr: | + count(count by (gitVersion) (label_replace(kubernetes_build_info{job!="kube-dns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1 + for: 1h + labels: + severity: warning + - alert: KubeClientErrors + annotations: + message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + }}' is experiencing {{ printf "%0.0f" $value }}% errors.' + expr: | + (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) + / + sum(rate(rest_client_requests_total[5m])) by (instance, job)) + * 100 > 1 + for: 15m + labels: + severity: warning + - alert: KubeClientErrors + annotations: + message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + }}' is experiencing {{ printf "%0.0f" $value }} errors / second. + expr: | + sum(rate(ksm_scrape_error_total{component="kube-state-metrics"}[5m])) by (instance, job) > 0.1 + for: 15m + labels: + severity: warning + - alert: KubeletTooManyPods + annotations: + message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close + to the limit of 110. + expr: | + kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 + for: 15m + labels: + severity: warning + - alert: KubeAPILatencyHigh + annotations: + message: The API server has a 99th percentile latency of {{ $value }} seconds + for {{ $labels.verb }} {{ $labels.resource }}. + expr: | + cluster_quantile:apiserver_request_latencies:histogram_quantile{job="kubernetes-apiservers",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 + for: 10m + labels: + severity: warning + - alert: KubeAPILatencyHigh + annotations: + message: The API server has a 99th percentile latency of {{ $value }} seconds + for {{ $labels.verb }} {{ $labels.resource }}. + expr: | + cluster_quantile:apiserver_request_latencies:histogram_quantile{job="kubernetes-apiservers",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 + for: 10m + labels: + severity: critical + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{ $value }}% of requests. + expr: | + sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) + / + sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) * 100 > 3 + for: 10m + labels: + severity: critical + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{ $value }}% of requests. + expr: | + sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) + / + sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) * 100 > 1 + for: 10m + labels: + severity: warning + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{ $value }}% of requests for + {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. 
+ expr: | + sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) + / + sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) by (resource,subresource,verb) * 100 > 10 + for: 10m + labels: + severity: critical + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{ $value }}% of requests for + {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. + expr: | + sum(rate(apiserver_request_count{job="kubernetes-apiservers",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) + / + sum(rate(apiserver_request_count{job="kubernetes-apiservers"}[5m])) by (resource,subresource,verb) * 100 > 5 + for: 10m + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + message: A client certificate used to authenticate to the apiserver is expiring + in less than 7.0 days. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="kubernetes-apiservers"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 604800 + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + message: A client certificate used to authenticate to the apiserver is expiring + in less than 24.0 hours. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="kubernetes-apiservers"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 86400 + labels: + severity: critical + - name: alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + annotations: + message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` + are out of sync. + expr: | + count_values("config_hash", alertmanager_config_hash{job="prometheus-alertmanager"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1 + for: 5m + labels: + severity: critical + - alert: AlertmanagerFailedReload + annotations: + message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. + expr: | + alertmanager_config_last_reload_successful{job="prometheus-alertmanager"} == 0 + for: 10m + labels: + severity: warning + - alert: AlertmanagerMembersInconsistent + annotations: + message: Alertmanager has not found all other members of the cluster. + expr: | + alertmanager_cluster_members{job="prometheus-alertmanager"} + != on (service) GROUP_LEFT() + count by (service) (alertmanager_cluster_members{job="prometheus-alertmanager"}) + for: 5m + labels: + severity: critical + - name: etcd.rules + rules: + - alert: EtcdDown + annotations: + message: Etcd instance is down on node {{ $labels.node }}. + expr: satellite_etcd_up == 0 + for: 5m + labels: + severity: critical + - alert: EtcdUnhealthy + annotations: + message: Etcd cluster is unhealthy. 
+ expr: satellite_etcd_health == 0 + for: 1m + labels: + severity: critical + - name: sysctl.rules + rules: + - alert: BrNetfilterMissing + annotations: + message: Bridge netfilter is disabled on node {{ $labels.node }} + runbook_url: https://gravitational.com/gravity/docs/requirements/#br_netfilter-module + expr: max_over_time(satellite_sysctl_br_netfilter[1h]) unless satellite_sysctl_br_netfilter or satellite_sysctl_br_netfilter == 0 + for: 5m + labels: + severity: critical + - alert: IPv4ForwardingMissing + annotations: + message: IPv4 forwarding is disabled on node {{ $labels.node }} + runbook_url: https://gravitational.com/gravity/docs/faq/#ipv4-forwarding + expr: max_over_time(satellite_sysctl_ipv4_forwarding[1h]) unless satellite_sysctl_ipv4_forwarding or satellite_sysctl_ipv4_forwarding == 0 + for: 5m + labels: + severity: critical + - name: docker.rules + rules: + - alert: DockerDown + annotations: + message: Docker daemon is down on host {{ $labels.node }} + expr: satellite_docker_health == 0 + for: 5m + labels: + severity: critical + - name: systemd.rules + rules: + - alert: SystemdDegraded + annotations: + message: Systemd is degraded on host {{ $labels.node }} + expr: satellite_systemd_health == 0 + for: 5m + labels: + severity: critical + - alert: SystemdUnitDegraded + annotations: + message: Systemd unit {{ $labels.unit_name }} is degraded on host {{ $labels.node }} + expr: satellite_systemd_unit_health == 0 + for: 5m + labels: + severity: critical + - name: general.rules + rules: + - alert: TargetDown + annotations: + message: '{{ $value }}% of the {{ $labels.job }} targets are down.' + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: + severity: warning + - alert: Watchdog + annotations: + message: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. + expr: vector(1) + labels: + severity: none + - name: kube-prometheus-node-alerting.rules + rules: + - alert: NodeDiskRunningFull + annotations: + message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace + }}/{{ $labels.pod }} will be full within the next 24 hours. + expr: | + (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0) + for: 30m + labels: + severity: warning + - alert: NodeDiskRunningFull + annotations: + message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace + }}/{{ $labels.pod }} will be full within the next 2 hours.
+ expr: | + (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0) + for: 10m + labels: + severity: critical + - name: node-network + rules: + - alert: NetworkReceiveErrors + annotations: + message: Network interface "{{ $labels.device }}" showing receive errors on + node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" + expr: | + rate(node_network_receive_errs_total{component="node-exporter",device!~"veth.+"}[2m]) > 0 + for: 2m + labels: + severity: warning + - alert: NetworkTransmitErrors + annotations: + message: Network interface "{{ $labels.device }}" showing transmit errors + on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" + expr: | + rate(node_network_transmit_errs_total{component="node-exporter",device!~"veth.+"}[2m]) > 0 + for: 2m + labels: + severity: warning + - alert: NodeNetworkInterfaceFlapping + annotations: + message: Network interface "{{ $labels.device }}" changing it's up status + often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" + expr: | + changes(node_network_up{component="node-exporter",device!~"veth.+"}[2m]) > 2 + for: 2m + labels: + severity: warning + - name: prometheus.rules + rules: + - alert: PrometheusConfigReloadFailed + annotations: + description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} + summary: Reloading Prometheus' configuration failed + expr: | + prometheus_config_last_reload_successful{job="prometheus"} == 0 + for: 10m + labels: + severity: warning + - alert: PrometheusNotificationQueueRunningFull + annotations: + description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ + $labels.pod}} + summary: Prometheus' alert notification queue is running full + expr: | + predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus"} + for: 10m + labels: + severity: warning + - alert: PrometheusErrorSendingAlerts + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + summary: Errors while sending alert from Prometheus + expr: | + rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.01 + for: 10m + labels: + severity: warning + - alert: PrometheusErrorSendingAlerts + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + summary: Errors while sending alerts from Prometheus + expr: | + rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.03 + for: 10m + labels: + severity: critical + - alert: PrometheusNotConnectedToAlertmanagers + annotations: + description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected + to any Alertmanagers + summary: Prometheus is not connected to any Alertmanagers + expr: | + prometheus_notifications_alertmanagers_discovered{job="prometheus"} < 1 + for: 10m + labels: + severity: warning + - alert: PrometheusTSDBReloadsFailing + annotations: + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + reload failures over the last four hours.' 
+ summary: Prometheus has issues reloading data blocks from disk + expr: | + increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[2h]) > 0 + for: 12h + labels: + severity: warning + - alert: PrometheusTSDBCompactionsFailing + annotations: + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + compaction failures over the last four hours.' + summary: Prometheus has issues compacting sample blocks + expr: | + increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[2h]) > 0 + for: 12h + labels: + severity: warning + - alert: PrometheusTSDBWALCorruptions + annotations: + description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead + log (WAL).' + summary: Prometheus write-ahead log is corrupted + expr: | + prometheus_tsdb_wal_corruptions_total{job="prometheus"} > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusNotIngestingSamples + annotations: + description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting + samples. + summary: Prometheus isn't ingesting samples + expr: | + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0 + for: 10m + labels: + severity: warning + - alert: PrometheusTargetScrapesDuplicate + annotations: + description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected + due to duplicate timestamps but different values' + summary: Prometheus has many samples rejected + expr: | + increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0 + for: 10m + labels: + severity: warning # groups: # - name: Instances # rules: @@ -1306,6 +2820,10 @@ serverFiles: rules: {} prometheus.yml: + global: + evaluation_interval: 30s + scrape_interval: 20s + scrape_timeout: 10s rule_files: - /etc/config/recording_rules.yml - /etc/config/alerting_rules.yml
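Note: the Alertmanager receivers defined above ("default", "watchdog", "by-cluster-service", "by-name", "by-cluster") have no notification integrations attached, so alerts are routed and grouped but not delivered to any external system. Below is a minimal sketch of how a Slack integration could be wired into the "default" receiver, following the commented-out example that this patch removes; the api_url and channel values are placeholders and not part of the patch, and the same pattern applies to the other receivers.

  receivers:
    - name: "default"
      slack_configs:
        - api_url: "https://hooks.slack.com/services/XXX/YYY/ZZZ"  # placeholder webhook URL
          channel: "#alerts"                                       # placeholder channel
          send_resolved: true

Before rollout, the rendered rule and Alertmanager files can be validated with "promtool check rules" and "amtool check-config" respectively.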