This repository has been archived on 2024-02-11. You can view files and clone it, but cannot push or open issues or pull requests.
valinor/kubernetes/apps/monitoring/kube-prometheus-stack/app/helmrelease.yaml

229 lines
8.7 KiB
YAML

---
# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/helmrelease-helm-v2beta1.json
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
name: kube-prometheus-stack
namespace: monitoring
spec:
interval: 30m
timeout: 15m
chart:
spec:
chart: kube-prometheus-stack
version: 48.6.0
sourceRef:
kind: HelmRepository
name: prometheus-community
namespace: flux-system
maxHistory: 2
install:
createNamespace: true
crds: CreateReplace
remediation:
retries: 3
upgrade:
cleanupOnFail: true
crds: CreateReplace
remediation:
retries: 3
uninstall:
keepHistory: false
dependsOn:
- name: rook-ceph-cluster
namespace: rook-ceph
- name: thanos
namespace: monitoring
values:
cleanPrometheusOperatorObjectNames: true
alertmanager:
config:
global:
resolve_timeout: 5m
receivers:
- name: heartbeat
webhook_configs:
- url: "${SECRET_HEALTHCHECKS_WEBHOOK}"
send_resolved: true
- name: "null" # quote
- name: pushover
pushover_configs:
- url_title: View in Alertmanager
token: "${SECRET_PUSHOVER_ALERT_MANAGER_APIKEY}"
user_key: "${SECRET_PUSHOVER_USERKEY}"
send_resolved: true
html: true
sound: gamelan
priority: |-
{{ if eq .Status "firing" }}1{{ else }}0{{ end }}
title: |-
{{ .CommonLabels.alertname }} [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
message: |-
{{- range .Alerts }}
{{- if ne .Annotations.description "" }}
{{ .Annotations.description }}
{{- else if ne .Annotations.summary "" }}
{{ .Annotations.summary }}
{{- else if ne .Annotations.message "" }}
{{ .Annotations.message }}
{{- else }}
Alert description not available
{{- end }}
{{- if gt (len .Labels.SortedPairs) 0 }}
<small>
{{- range .Labels.SortedPairs }}
<b>{{ .Name }}:</b> {{ .Value }}
{{- end }}
</small>
{{- end }}
{{- end }}
route:
group_by: ["alertname", "job"]
group_wait: 1m
group_interval: 10m
repeat_interval: 12h
receiver: pushover
routes:
- receiver: heartbeat
group_wait: 0s
group_interval: 5m
repeat_interval: 5m
matchers:
- alertname =~ "Watchdog"
- receiver: "null" # quote
matchers:
- alertname =~ "InfoInhibitor"
- receiver: pushover
matchers:
- severity = "critical"
continue: true
inhibit_rules:
- source_matchers:
- severity = "critical"
target_matchers:
- severity = "warning"
equal: ["alertname", "namespace"]
ingress:
enabled: true
pathType: Prefix
ingressClassName: nginx
hosts:
- &host alert-manager.valinor.social
tls:
- hosts:
- *host
alertmanagerSpec:
replicas: 3
storage:
volumeClaimTemplate:
spec:
storageClassName: ceph-block
resources:
requests:
storage: 1Gi
grafana:
enabled: false
forceDeployDashboards: true
sidecar:
dashboards:
multicluster:
etcd:
enabled: true
kube-state-metrics:
metricLabelsAllowlist:
- "pods=[*]"
- "deployments=[*]"
- "persistentvolumeclaims=[*]"
prometheus:
monitor:
enabled: true
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels: ["__meta_kubernetes_pod_node_name"]
targetLabel: kubernetes_node
kubelet:
enabled: true
serviceMonitor:
metricRelabelings:
# Remove duplicate metrics
- sourceLabels: ["__name__"]
regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|authentication_token|cadvisor_version|container_blkio|container_cpu|container_fs|container_last|container_memory|container_network|container_oom|container_processes|container|csi_operations|disabled_metric|get_token|go|hidden_metric|kubelet_certificate|kubelet_cgroup|kubelet_container|kubelet_containers|kubelet_cpu|kubelet_device|kubelet_graceful|kubelet_http|kubelet_lifecycle|kubelet_managed|kubelet_node|kubelet_pleg|kubelet_pod|kubelet_run|kubelet_running|kubelet_runtime|kubelet_server|kubelet_started|kubelet_volume|kubernetes_build|kubernetes_feature|machine_cpu|machine_memory|machine_nvm|machine_scrape|node_namespace|plugin_manager|prober_probe|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|storage_operation|volume_manager|volume_operation|workqueue)_(.+)"
action: keep
- sourceLabels: ["node"]
targetLabel: instance
action: replace
kubeApiServer:
enabled: true
serviceMonitor:
metricRelabelings:
# Remove duplicate metrics
- sourceLabels: ["__name__"]
regex: "(aggregator_openapi|aggregator_unavailable|apiextensions_openapi|apiserver_admission|apiserver_audit|apiserver_cache|apiserver_cel|apiserver_client|apiserver_crd|apiserver_current|apiserver_envelope|apiserver_flowcontrol|apiserver_init|apiserver_kube|apiserver_longrunning|apiserver_request|apiserver_requested|apiserver_response|apiserver_selfrequest|apiserver_storage|apiserver_terminated|apiserver_tls|apiserver_watch|apiserver_webhooks|authenticated_user|authentication|disabled_metric|etcd_bookmark|etcd_lease|etcd_request|field_validation|get_token|go|grpc_client|hidden_metric|kube_apiserver|kubernetes_build|kubernetes_feature|node_authorizer|pod_security|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|serviceaccount_legacy|serviceaccount_stale|serviceaccount_valid|watch_cache|workqueue)_(.+)"
action: keep
# Remove high cardinality metrics
- sourceLabels: ["__name__"]
regex: (apiserver|etcd|rest_client)_request(|_sli|_slo)_duration_seconds_bucket
action: drop
- sourceLabels: ["__name__"]
regex: (apiserver_response_sizes_bucket|apiserver_watch_events_sizes_bucket)
action: drop
kubeControllerManager:
enabled: false
kubeEtcd:
enabled: false
kubeProxy:
enabled: false # Disabled because eBPF
kubeScheduler:
enabled: false
prometheus:
ingress:
enabled: true
ingressClassName: nginx
pathType: Prefix
hosts:
- &host prometheus.valinor.social
tls:
- hosts:
- *host
thanosService:
enabled: true
thanosServiceMonitor:
enabled: true
prometheusSpec:
replicas: 3
replicaExternalLabelName: __replica__
ruleSelectorNilUsesHelmValues: false
serviceMonitorSelectorNilUsesHelmValues: false
podMonitorSelectorNilUsesHelmValues: false
probeSelectorNilUsesHelmValues: false
enableAdminAPI: true
walCompression: true
thanos:
image: quay.io/thanos/thanos:v0.32.3
objectStorageConfig:
name: thanos-objstore-secret
key: objstore.yml
retention: 2d
retentionSize: 15GB
additionalScrapeConfigs:
- job_name: statsd-exporter
scrape_interval: 1m
scrape_timeout: 10s
honor_timestamps: true
static_configs:
- targets:
- statsd-exporter.fediverse.svc.cluster.local:9102 # default zalando postgres cluster
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: ceph-block
resources:
requests:
storage: 20Gi
valuesFrom:
- targetPath: objstoreConfig.config.bucket
kind: ConfigMap
name: thanos-bucket-v1
valuesKey: BUCKET_NAME