---
# yaml-language-server: $schema=https://ks.hsn.dev/monitoring.coreos.com/prometheusrule_v1.json
# PrometheusRule resource declaring alerting rules evaluated against
# smartctl_* metrics.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: smartctl-exporter-rules
spec:
  groups:
    # Rule group containing the SMART device alerts listed under `rules`.
    - name: smartctl-exporter.rules
      rules:
        # Fires when a drive's current temperature reading has been above
        # 65°C continuously for 15 minutes.
        - alert: SmartDeviceHighTemperature
          # Only the live reading is compared against the limit.
          # NOTE(review): smartctl_device_temperature is understood to carry a
          # `temperature_type` label with more than one value per device
          # (e.g. a trip-threshold series); without this matcher a static
          # threshold series above 65 would keep the alert firing forever.
          # Confirm the label exists on the deployed exporter version.
          expr: smartctl_device_temperature{temperature_type="current"} > 65
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              has a temperature higher than 65°C.
        # Fires when either smartctl_device_smart_status or
        # smartctl_device_status has been different from 1 for 15 minutes.
        - alert: SmartDeviceTestFailed
          expr: |
            (
              smartctl_device_smart_status != 1
            or
              smartctl_device_status != 1
            )
          for: 15m
          labels:
            severity: critical
          annotations:
            # Folded scalar: the two lines are joined with a single space.
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              did not pass its SMART test.
        # Fires when smartctl_device_critical_warning has been non-zero for
        # 15 minutes.
        - alert: SmartDeviceCriticalWarning
          expr: smartctl_device_critical_warning != 0
          for: 15m
          labels:
            severity: critical
          annotations:
            # Folded scalar: the two lines are joined with a single space.
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              is in a critical state.
        #
        # Ref: https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-smartctl-exporter/rules/rules.txt
        #
        # Fires when smartctl_device_media_errors has been non-zero for
        # 15 minutes.
        - alert: SmartDeviceMediaErrors
          expr: smartctl_device_media_errors != 0
          for: 15m
          labels:
            severity: critical
          annotations:
            # Folded scalar: the two lines are joined with a single space.
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              has media errors.
        # Fires when the reported available-spare threshold has exceeded the
        # reported available spare for 15 minutes.
        # The alert name keeps its historical spelling ("Threadhold") because
        # Alertmanager routes or silences may match on it; only the
        # human-readable summary is corrected.
        - alert: SmartDeviceAvailableSpareUnderThreadhold
          expr: smartctl_device_available_spare_threshold > smartctl_device_available_spare
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: >-
              Device {{ $labels.device }} on instance {{ $labels.instance }}
              is under available spare threshold.
        # Fires when a device's "current" interface speed series has differed
        # from its "max" interface speed series for 15 minutes.
        - alert: SmartDeviceInterfaceSlow
          # The two speed_type series are matched one-to-one on
          # device/instance/namespace/pod, since speed_type itself differs
          # between the two sides of the comparison.
          expr: |
            smartctl_device_interface_speed{speed_type="current"} != on(device, instance, namespace, pod) smartctl_device_interface_speed{speed_type="max"}
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: >-
              Device {{ $labels.device }} on instance {{ $labels.instance }}
              interface is slower than it should be.