Compare commits

...

7 commits

8 changed files with 4847 additions and 2 deletions

View file

@ -5,6 +5,7 @@
"redhat.vscode-yaml",
"signageos.signageos-vscode-sops",
"pkief.material-icon-theme",
"ms-vscode-remote.remote-ssh"
"ms-vscode-remote.remote-ssh",
"editorconfig.editorconfig"
]
}

View file

@ -89,7 +89,7 @@ spec:
app:
image:
repository: 1337kavin/piped-proxy
tag: latest@sha256:1d97d5a7c7e464c1b43eca485723962af85b038e1c614fd35ab50b1b6cbdc3ba
tag: latest@sha256:ee18e54bb18aa7f4da0c13d43119fa66e011dbe3d8fa9b43418c0baaf9299108
command:
- /app/piped-proxy
probes:

View file

@ -11,6 +11,7 @@ resources:
- ./grafana/ks.yaml
- ./node-exporter/ks.yaml
- ./prometheus-operator-crds/ks.yaml
- ./smartctl-exporter/ks.yaml
- ./unpoller/ks.yaml
- ./vector-agent/ks.yaml
- ./vector-aggregator/ks.yaml

View file

@ -0,0 +1,38 @@
---
# yaml-language-server: $schema=https://ks.hsn.dev/helm.toolkit.fluxcd.io/helmrelease_v2.json
# HelmRelease that installs the prometheus-smartctl-exporter chart from the
# prometheus-community HelmRepository under the name "smartctl-exporter".
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  # Anchored so the release name can be reused for fullnameOverride below.
  name: &app smartctl-exporter
spec:
  interval: 30m
  chart:
    spec:
      chart: prometheus-smartctl-exporter
      version: 0.10.0
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        namespace: flux-system
  # Retry a failed install up to three times.
  install:
    remediation:
      retries: 3
  # On a failed upgrade: retry up to three times, roll back, and clean up.
  upgrade:
    remediation:
      retries: 3
      strategy: rollback
    cleanupOnFail: true
  values:
    fullnameOverride: *app
    # The chart's own prometheusRules are disabled; its ServiceMonitor is on.
    prometheusRules:
      enabled: false
    serviceMonitor:
      enabled: true
    # One entry per host, each listing the block devices to query.
    config:
      devices:
        - host: 10.1.1.68
          devices:
            - /dev/nvme0n1
        - host: 10.1.1.69
          devices:
            - /dev/nvme0n1

View file

@ -0,0 +1,18 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Kustomize entry point for this directory: includes the HelmRelease and
# PrometheusRule manifests and generates a ConfigMap from the dashboard JSON.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# Keep generated names stable (no content-hash suffix).
generatorOptions:
  disableNameSuffixHash: true
resources:
  - ./helmrelease.yaml
  - ./prometheusrule.yaml
configMapGenerator:
  - name: smartctl-exporter-dashboard
    options:
      labels:
        grafana_dashboard: "1"
      annotations:
        kustomize.toolkit.fluxcd.io/substitute: disabled
    files:
      - ./resources/blesswinsamuel_smartctl.json

View file

@ -0,0 +1,64 @@
---
# yaml-language-server: $schema=https://ks.hsn.dev/monitoring.coreos.com/prometheusrule_v1.json
# Alerting rules evaluated against smartctl-exporter metrics. Every rule must
# hold for 15 minutes before firing and is labelled severity=critical.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: smartctl-exporter-rules
spec:
  groups:
    - name: smartctl-exporter.rules
      rules:
        - alert: SmartDeviceHighTemperature
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              has a temperature higher than 65°C.
          # Only the live reading is compared against the limit. The exporter
          # labels this metric with temperature_type, and non-current series
          # (e.g. a drive's trip point) would otherwise trip the alert.
          expr: smartctl_device_temperature{temperature_type="current"} > 65
          for: 15m
          labels:
            severity: critical
        - alert: SmartDeviceTestFailed
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              did not pass its SMART test.
          # Fires when either status metric reports anything other than 1.
          expr: |
            (
              smartctl_device_smart_status != 1
            or
              smartctl_device_status != 1
            )
          for: 15m
          labels:
            severity: critical
        - alert: SmartDeviceCriticalWarning
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              is in a critical state.
          expr: smartctl_device_critical_warning != 0
          for: 15m
          labels:
            severity: critical
        - alert: SmartDeviceMediaErrors
          annotations:
            summary: >-
              Mounted drive {{ $labels.device }} on device {{ $labels.instance }}
              has media errors.
          # NOTE(review): the matcher skips devices whose label starts with
          # "nvme" — presumably deliberate; confirm.
          expr: smartctl_device_media_errors{device!~"^nvme.+"} != 0
          for: 15m
          labels:
            severity: critical
        # Alert name keeps its original spelling so existing routes and
        # silences that match on it continue to work.
        - alert: SmartDeviceAvailableSpareUnderThreadhold
          annotations:
            summary: >-
              Device {{ $labels.device }} on instance {{ $labels.instance }}
              is under its available spare threshold.
          expr: smartctl_device_available_spare_threshold > smartctl_device_available_spare
          for: 15m
          labels:
            severity: critical
        - alert: SmartDeviceInterfaceSlow
          annotations:
            summary: >-
              Device {{ $labels.device }} on instance {{ $labels.instance }}
              interface is slower than it should be.
          # The two sides differ in speed_type, so they are matched only on the
          # labels listed in on(...).
          expr: |
            smartctl_device_interface_speed{speed_type="current"} != on(device, instance, namespace, pod) smartctl_device_interface_speed{speed_type="max"}
          for: 15m
          labels:
            severity: critical

View file

@ -0,0 +1,20 @@
---
# yaml-language-server: $schema=https://ks.hsn.dev/kustomize.toolkit.fluxcd.io/kustomization_v1.json
# Flux Kustomization that applies the smartctl-exporter app directory from the
# "theshire" GitRepository into the observability namespace.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  # Anchored so the name can be reused as the app.kubernetes.io/name label.
  name: &app smartctl-exporter
  namespace: flux-system
spec:
  interval: 30m
  retryInterval: 1m
  path: ./kubernetes/apps/observability/smartctl-exporter/app
  sourceRef:
    kind: GitRepository
    name: theshire
  targetNamespace: observability
  # Label applied to every resource this Kustomization manages.
  commonMetadata:
    labels:
      app.kubernetes.io/name: *app
  prune: true
  wait: false