---
# yaml-language-server: $schema=https://ks.hsn.dev/monitoring.coreos.com/prometheusrule_v1.json
# PrometheusRule defining miscellaneous alerting rules, grouped by topic:
# dockerhub (registry rate-limit risk), oom (OOM-killed containers) and
# zfs (pool health).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: miscellaneous-rules
  labels:
    prometheus: k8s
    role: alert-rules
spec:
  groups:
    - name: dockerhub
      rules:
        # Fires when, for 15 minutes, more than 100 containers whose image
        # matches the docker.io pattern were last seen within the past
        # 30 seconds.
        # NOTE(review): the "." in the image regex is unescaped, so it matches
        # any character - confirm whether a literal dot was intended.
        - alert: BootstrapRateLimitRisk
          annotations:
            summary: Kubernetes cluster at risk of being rate limited by dockerhub on bootstrap
          expr: count(time() - container_last_seen{image=~"(docker.io).*",container!=""} < 30) > 100
          for: 15m
          labels:
            severity: critical
    - name: oom
      rules:
        # Fires when a container's restart counter grew by at least 1 over
        # the last 10 minutes AND its last-terminated reason "OOMKilled"
        # series held the value 1 for that whole window. $value is taken from
        # the left-hand side of `and`, i.e. the restart-count increase.
        # No `for:` is set, so the alert is raised on the first matching
        # evaluation.
        - alert: OOMKilled
          annotations:
            # Folded (>-) only to keep lines short; the resulting string is a
            # single line identical to the unfolded form.
            summary: >-
              Container {{ $labels.container }} in pod
              {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled
              {{ $value }} times in the last 10 minutes.
          expr: >-
            (kube_pod_container_status_restarts_total -
            kube_pod_container_status_restarts_total offset 10m >= 1)
            and ignoring (reason)
            min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
          labels:
            severity: critical
    - name: zfs
      rules:
        # Fires when any zpool state series other than "online" has been
        # above 0 for 15 minutes.
        - alert: ZfsUnexpectedPoolState
          annotations:
            summary: ZFS pool {{$labels.zpool}} on {{$labels.instance}} is in a unexpected state {{$labels.state}}
          expr: node_zfs_zpool_state{state!="online"} > 0
          for: 15m
          labels:
            severity: critical