# File-viewer listing metadata (not part of the manifest): 216 lines, 8.3 KiB, YAML
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/helmrelease-helm-v2beta1.json
#
# Flux HelmRelease that installs the kube-prometheus-stack chart into the
# `monitoring` namespace: Alertmanager (heartbeat + Pushover receivers),
# kube-state-metrics, kubelet / API-server scraping, and a 3-replica
# Prometheus with a Thanos sidecar. Grafana itself is disabled here.
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
  namespace: monitoring
spec:
  interval: 30m
  timeout: 15m
  chart:
    spec:
      chart: kube-prometheus-stack
      version: 51.10.0
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        namespace: flux-system
  maxHistory: 2
  install:
    createNamespace: true
    crds: CreateReplace
    remediation:
      retries: 3
  upgrade:
    cleanupOnFail: true
    crds: CreateReplace
    remediation:
      retries: 3
  uninstall:
    keepHistory: false
  # Both releases are referenced further down: the PVCs use the `ceph-block`
  # storage class and Prometheus runs a Thanos sidecar.
  dependsOn:
    - name: rook-ceph-cluster
      namespace: rook-ceph
    - name: thanos
      namespace: monitoring
  values:
    cleanPrometheusOperatorObjectNames: true
    alertmanager:
      config:
        global:
          resolve_timeout: 5m
        # NOTE(review): the ${SECRET_*} placeholders are presumably filled in
        # by Flux post-build variable substitution; confirm in the owning
        # Kustomization.
        receivers:
          - name: heartbeat
            webhook_configs:
              - url: "${SECRET_HEALTHCHECKS_WEBHOOK}"
                send_resolved: true
          # Quoted so YAML yields the string "null" instead of a null value.
          - name: "null"
          - name: pushover
            pushover_configs:
              - url_title: View in Alertmanager
                token: "${SECRET_PUSHOVER_ALERT_MANAGER_APIKEY}"
                user_key: "${SECRET_PUSHOVER_USERKEY}"
                send_resolved: true
                html: true
                sound: gamelan
                # Priority 1 while firing, 0 once resolved.
                priority: |-
                  {{ if eq .Status "firing" }}1{{ else }}0{{ end }}
                title: |-
                  {{ .CommonLabels.alertname }} [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
                # Per alert: description, else summary, else message, else a
                # fixed fallback text; followed by the alert's sorted labels.
                message: |-
                  {{- range .Alerts }}
                  {{- if ne .Annotations.description "" }}
                  {{ .Annotations.description }}
                  {{- else if ne .Annotations.summary "" }}
                  {{ .Annotations.summary }}
                  {{- else if ne .Annotations.message "" }}
                  {{ .Annotations.message }}
                  {{- else }}
                  Alert description not available
                  {{- end }}
                  {{- if gt (len .Labels.SortedPairs) 0 }}
                  <small>
                  {{- range .Labels.SortedPairs }}
                  <b>{{ .Name }}:</b> {{ .Value }}
                  {{- end }}
                  </small>
                  {{- end }}
                  {{- end }}
        route:
          group_by: ["alertname", "job"]
          group_wait: 1m
          group_interval: 10m
          repeat_interval: 12h
          # Default receiver for anything not matched by a child route.
          receiver: pushover
          routes:
            # Watchdog is forwarded to the heartbeat webhook every 5 minutes.
            - receiver: heartbeat
              group_wait: 0s
              group_interval: 5m
              repeat_interval: 5m
              matchers:
                - alertname =~ "Watchdog"
            # InfoInhibitor goes to the receiver named "null", which has no
            # notification configs. Quoted for the same reason as above.
            - receiver: "null"
              matchers:
                - alertname =~ "InfoInhibitor"
            - receiver: pushover
              matchers:
                - severity = "critical"
              continue: true
        # A critical alert inhibits warning alerts that share the same
        # alertname and namespace.
        inhibit_rules:
          - source_matchers:
              - severity = "critical"
            target_matchers:
              - severity = "warning"
            equal: ["alertname", "namespace"]
      ingress:
        enabled: true
        pathType: Prefix
        ingressClassName: nginx
        hosts:
          - &alertmanagerHost alert-manager.valinor.social
        tls:
          - hosts:
              - *alertmanagerHost
      alertmanagerSpec:
        replicas: 3
        storage:
          volumeClaimTemplate:
            spec:
              storageClassName: ceph-block
              resources:
                requests:
                  storage: 1Gi
    grafana:
      # Grafana is not deployed by this release; dashboards are still
      # force-deployed.
      enabled: false
      forceDeployDashboards: true
      sidecar:
        dashboards:
          multicluster:
            etcd:
              enabled: true
    kube-state-metrics:
      metricLabelsAllowlist:
        - "pods=[*]"
        - "deployments=[*]"
        - "persistentvolumeclaims=[*]"
      prometheus:
        monitor:
          enabled: true
          relabelings:
            # Copy the pod's node name into a `kubernetes_node` label.
            - action: replace
              regex: (.*)
              replacement: $1
              sourceLabels: ["__meta_kubernetes_pod_node_name"]
              targetLabel: kubernetes_node
    kubelet:
      enabled: true
      serviceMonitor:
        metricRelabelings:
          # Remove duplicate metrics: only metric names matching this
          # allowlist of prefixes are kept.
          - sourceLabels: ["__name__"]
            regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|authentication_token|cadvisor_version|container_blkio|container_cpu|container_fs|container_last|container_memory|container_network|container_oom|container_processes|container|csi_operations|disabled_metric|get_token|go|hidden_metric|kubelet_certificate|kubelet_cgroup|kubelet_container|kubelet_containers|kubelet_cpu|kubelet_device|kubelet_graceful|kubelet_http|kubelet_lifecycle|kubelet_managed|kubelet_node|kubelet_pleg|kubelet_pod|kubelet_run|kubelet_running|kubelet_runtime|kubelet_server|kubelet_started|kubelet_volume|kubernetes_build|kubernetes_feature|machine_cpu|machine_memory|machine_nvm|machine_scrape|node_namespace|plugin_manager|prober_probe|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|storage_operation|volume_manager|volume_operation|workqueue)_(.+)"
            action: keep
          # Copy the `node` label into `instance`.
          - sourceLabels: ["node"]
            targetLabel: instance
            action: replace
    kubeApiServer:
      enabled: true
      serviceMonitor:
        metricRelabelings:
          # Remove duplicate metrics: only metric names matching this
          # allowlist of prefixes are kept.
          - sourceLabels: ["__name__"]
            regex: "(aggregator_openapi|aggregator_unavailable|apiextensions_openapi|apiserver_admission|apiserver_audit|apiserver_cache|apiserver_cel|apiserver_client|apiserver_crd|apiserver_current|apiserver_envelope|apiserver_flowcontrol|apiserver_init|apiserver_kube|apiserver_longrunning|apiserver_request|apiserver_requested|apiserver_response|apiserver_selfrequest|apiserver_storage|apiserver_terminated|apiserver_tls|apiserver_watch|apiserver_webhooks|authenticated_user|authentication|disabled_metric|etcd_bookmark|etcd_lease|etcd_request|field_validation|get_token|go|grpc_client|hidden_metric|kube_apiserver|kubernetes_build|kubernetes_feature|node_authorizer|pod_security|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|serviceaccount_legacy|serviceaccount_stale|serviceaccount_valid|watch_cache|workqueue)_(.+)"
            action: keep
          # Remove high cardinality metrics
          - sourceLabels: ["__name__"]
            regex: (apiserver|etcd|rest_client)_request(|_sli|_slo)_duration_seconds_bucket
            action: drop
          - sourceLabels: ["__name__"]
            regex: (apiserver_response_sizes_bucket|apiserver_watch_events_sizes_bucket)
            action: drop
    kubeControllerManager:
      enabled: false
    kubeEtcd:
      enabled: false
    kubeProxy:
      enabled: false  # Disabled because eBPF
    kubeScheduler:
      enabled: false
    prometheus:
      ingress:
        enabled: true
        ingressClassName: nginx
        pathType: Prefix
        hosts:
          - &prometheusHost prometheus.valinor.social
        tls:
          - hosts:
              - *prometheusHost
      thanosService:
        enabled: true
      thanosServiceMonitor:
        enabled: true
      prometheusSpec:
        replicas: 3
        replicaExternalLabelName: __replica__
        # With these set to false the chart does not restrict rule / monitor /
        # probe selection to objects carrying this release's labels.
        ruleSelectorNilUsesHelmValues: false
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        probeSelectorNilUsesHelmValues: false
        # NOTE(review): the admin API is enabled while the ingress above
        # publishes this Prometheus; confirm the ingress is access-controlled.
        enableAdminAPI: true
        walCompression: true
        thanos:
          image: quay.io/thanos/thanos:v0.32.5
          # Object-store configuration is read from key `objstore.yml` of the
          # secret `thanos-s3-secret`.
          objectStorageConfig:
            name: thanos-s3-secret
            key: objstore.yml
        # NOTE(review): short local retention; presumably long-term data is
        # served from the Thanos object store. Confirm before changing.
        retention: 2d
        retentionSize: 15GB
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: ceph-block
              resources:
                requests:
                  storage: 20Gi