theshire/kubernetes/apps/observability/loki/app/rules/loki-alerting-rules.yml

15 lines
522 B
YAML
Raw Normal View History

2024-02-29 11:11:13 -06:00
---
# Loki ruler alerting rules evaluated against ingested log streams.
groups:
  - name: smart
    rules:
      # Raises one critical alert per hostname when log lines attributed to
      # the smartmontools systemd unit mention an error or failure.
      - alert: SMARTFailure
        # Pipeline notes:
        #   * {hostname=~".+"} selects every stream carrying a non-empty
        #     hostname label.
        #   * The two `!~` filters drop lines that would otherwise trip the
        #     `(error|fail)` match: "previous self-test completed without
        #     error" contains "error" and "Prefailure" contains "fail".
        #   * Line filters run before `| json` so that only matching lines
        #     are parsed; the selected lines are the same because the json
        #     stage does not rewrite the log line.
        #   * `_SYSTEMD_UNIT` is a field extracted by `| json`; presumably
        #     the lines are journald entries shipped as JSON - confirm
        #     against the log shipper configuration.
        expr: |
          sum by (hostname) (
            count_over_time(
              {hostname=~".+"}
                !~ "(?i)previous self-test completed without error"
                !~ "(?i)Prefailure"
                |~ "(?i)(error|fail)"
                | json
                | _SYSTEMD_UNIT = "smartmontools.service"
              [2m]
            )
          ) > 0
        # The expression must stay true for 2 minutes before the alert fires.
        for: 2m
        labels:
          severity: critical
          category: logs
        annotations:
          hostname: "{{ $labels.hostname }}"
          summary: "{{ $labels.hostname }} has reported SMART failures"