---
# yaml-language-server: $schema=https://ks.hsn.dev/monitoring.coreos.com/prometheusrule_v1.json
#
# Alerting rules evaluated against smartctl_device_* metrics.
# Every rule below must hold for 15 minutes before firing and is labelled
# severity=critical.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: smartctl-exporter-rules
spec:
  groups:
    - name: smartctl-exporter.rules
      rules:
        # Reported device temperature above 65 (the summary states °C).
        - alert: SmartDeviceHighTemperature
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              has a temperature higher than 65°C.
          expr: smartctl_device_temperature > 65
          for: 15m
          labels:
            severity: critical

        # Either status metric reports a value other than 1.
        - alert: SmartDeviceTestFailed
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              did not pass its SMART test.
          expr: |
            (
              smartctl_device_smart_status != 1
            or
              smartctl_device_status != 1
            )
          for: 15m
          labels:
            severity: critical

        # Critical-warning metric is non-zero.
        - alert: SmartDeviceCriticalWarning
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              is in a critical state.
          expr: smartctl_device_critical_warning != 0
          for: 15m
          labels:
            severity: critical

        #
        # Ref: https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-smartctl-exporter/rules/rules.txt
        #

        # Media-error counter is non-zero.
        - alert: SmartDeviceMediaErrors
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              has media errors.
          expr: smartctl_device_media_errors != 0
          for: 15m
          labels:
            severity: critical

        # Available spare has dropped below the device's own threshold metric.
        # NOTE(review): the alert name is misspelled ("Threadhold"); it is kept
        # unchanged here because alert names may be referenced by Alertmanager
        # routes or silences -- rename only after checking those.
        - alert: SmartDeviceAvailableSpareUnderThreadhold
          annotations:
            summary: >-
              Device {{ $labels.device }} on instance {{ $labels.instance }}
              is under available spare threshold.
          expr: smartctl_device_available_spare_threshold > smartctl_device_available_spare
          for: 15m
          labels:
            severity: critical

        # Current interface speed differs from the maximum for the same
        # device/instance/namespace/pod series.
        - alert: SmartDeviceInterfaceSlow
          annotations:
            summary: >-
              Device {{ $labels.device }} on instance {{ $labels.instance }}
              interface is slower than it should be.
          expr: |
            smartctl_device_interface_speed{speed_type="current"} != on(device, instance, namespace, pod) smartctl_device_interface_speed{speed_type="max"}
          for: 15m
          labels:
            severity: critical