---
# yaml-language-server: $schema=https://ks.hsn.dev/monitoring.coreos.com/prometheusrule_v1.json
#
# Alerting rules for drive health metrics exposed under the `smartctl_device_*`
# metric names. Every rule must hold for 15 minutes before firing and is
# labelled `severity: critical`.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: smartctl-exporter-rules
spec:
  groups:
    - name: smartctl-exporter.rules
      rules:
        # Reported drive temperature is above 65.
        # NOTE(review): the exporter may emit several series per device,
        # distinguished by a `temperature_type` label (e.g. current vs. trip
        # thresholds). If so, consider restricting this to the current
        # reading - confirm against the deployed exporter version.
        - alert: SmartDeviceHighTemperature
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device
              {{ $labels.instance }} has a temperature higher than 65°C.
          expr: smartctl_device_temperature > 65
          for: 15m
          labels:
            severity: critical

        # Either the SMART status or the device status metric is not 1.
        - alert: SmartDeviceTestFailed
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device
              {{ $labels.instance }} did not pass its SMART test.
          expr: |
            (
              smartctl_device_smart_status != 1
              or
              smartctl_device_status != 1
            )
          for: 15m
          labels:
            severity: critical

        # The critical-warning metric is non-zero.
        - alert: SmartDeviceCriticalWarning
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device
              {{ $labels.instance }} is in a critical state.
          expr: smartctl_device_critical_warning != 0
          for: 15m
          labels:
            severity: critical

        #
        # Ref: https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-smartctl-exporter/rules/rules.txt
        #

        # The media error counter is non-zero.
        - alert: SmartDeviceMediaErrors
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device
              {{ $labels.instance }} has media errors.
          expr: smartctl_device_media_errors != 0
          for: 15m
          labels:
            severity: critical

        # Available spare has dropped below the reported spare threshold.
        # The alert name keeps its existing spelling ("Threadhold") on purpose:
        # Alertmanager routes and silences may match on it.
        - alert: SmartDeviceAvailableSpareUnderThreadhold
          annotations:
            summary: >-
              Device {{ $labels.device }} on instance {{ $labels.instance }}
              is under available spare threshold.
          expr: smartctl_device_available_spare_threshold > smartctl_device_available_spare
          for: 15m
          labels:
            severity: critical

        # Current interface speed differs from the maximum interface speed for
        # the same device/instance/namespace/pod.
        - alert: SmartDeviceInterfaceSlow
          annotations:
            summary: >-
              Device {{ $labels.device }} on instance {{ $labels.instance }}
              interface is slower than it should be.
          expr: |
            smartctl_device_interface_speed{speed_type="current"}
            != on(device, instance, namespace, pod)
            smartctl_device_interface_speed{speed_type="max"}
          for: 15m
          labels:
            severity: critical