theshire/kubernetes/apps/observability/smartctl-exporter/app/prometheusrule.yaml

65 lines
2.4 KiB
YAML
Raw Normal View History

2024-10-27 19:10:17 -05:00
---
# yaml-language-server: $schema=https://ks.hsn.dev/monitoring.coreos.com/prometheusrule_v1.json
#
# Alerting rules for disk health metrics exposed by smartctl-exporter.
# Every rule must hold for 15 minutes before firing and is labelled
# severity=critical.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: smartctl-exporter-rules
spec:
  groups:
    - name: smartctl-exporter.rules
      rules:
        # Fires when a drive's current temperature stays above 65 °C.
        # The exporter publishes one series per temperature_type (current,
        # drive_trip, lifetime_max, ...); without the matcher, static limits
        # or historical maxima above 65 would keep this alert firing forever.
        - alert: SmartDeviceHighTemperature
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              has a temperature higher than 65°C.
          expr: smartctl_device_temperature{temperature_type="current"} > 65
          for: 15m
          labels:
            severity: critical

        # Fires when either the SMART status or the device status metric
        # reports anything other than 1.
        - alert: SmartDeviceTestFailed
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              did not pass its SMART test.
          expr: |
            (
              smartctl_device_smart_status != 1
            or
              smartctl_device_status != 1
            )
          for: 15m
          labels:
            severity: critical

        # Fires when the critical-warning metric is non-zero.
        - alert: SmartDeviceCriticalWarning
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              is in a critical state.
          expr: smartctl_device_critical_warning != 0
          for: 15m
          labels:
            severity: critical

        # Fires on a non-zero media error count. Devices whose name starts
        # with "nvme" are excluded by the matcher; the reason is not recorded
        # here, so confirm before removing the exclusion.
        - alert: SmartDeviceMediaErrors
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              has media errors.
          expr: smartctl_device_media_errors{device!~"^nvme.+"} != 0
          for: 15m
          labels:
            severity: critical

        # Fires when the available spare drops below the reported threshold.
        # NOTE: the alert name keeps its original misspelling ("Threadhold")
        # so that any Alertmanager routes or silences matching on it keep
        # working; only the human-readable summary is corrected.
        - alert: SmartDeviceAvailableSpareUnderThreadhold
          annotations:
            summary: >-
              Device {{ $labels.device }} on instance {{ $labels.instance }}
              is under available spare threshold.
          expr: smartctl_device_available_spare_threshold > smartctl_device_available_spare
          for: 15m
          labels:
            severity: critical

        # Fires when the current interface speed differs from the maximum
        # speed reported for the same device/instance/namespace/pod.
        - alert: SmartDeviceInterfaceSlow
          annotations:
            summary: >-
              Device {{ $labels.device }} on instance {{ $labels.instance }}
              interface is slower than it should be.
          expr: |
            smartctl_device_interface_speed{speed_type="current"}
              != on(device, instance, namespace, pod)
            smartctl_device_interface_speed{speed_type="max"}
          for: 15m
          labels:
            severity: critical