---
# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/helmrelease-helm-v2beta1.json
#
# Flux HelmRelease that installs the kube-prometheus-stack chart (pinned to
# 48.3.6, pulled from the prometheus-community HelmRepository in flux-system)
# into the monitoring namespace.
#
# `${...}` placeholders below are not resolved in this file; presumably they
# are filled in by Flux post-build substitution — confirm against the
# Kustomization that applies this manifest.
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
  namespace: monitoring
spec:
  interval: 30m
  timeout: 15m
  chart:
    spec:
      chart: kube-prometheus-stack
      version: 48.3.6
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        namespace: flux-system
  maxHistory: 2
  install:
    createNamespace: true
    crds: CreateReplace
    remediation:
      retries: 3
  upgrade:
    cleanupOnFail: true
    crds: CreateReplace
    remediation:
      retries: 3
  uninstall:
    keepHistory: false
  # This release is reconciled only after storage (rook-ceph-cluster) and the
  # thanos release are ready.
  dependsOn:
    - name: rook-ceph-cluster
      namespace: rook-ceph
    - name: thanos
      namespace: monitoring
  values:
    cleanPrometheusOperatorObjectNames: true

    alertmanager:
      config:
        global:
          resolve_timeout: 5m
        receivers:
          - name: heartbeat
            webhook_configs:
              - url: "${SECRET_HEALTHCHECKS_WEBHOOK}"
                send_resolved: true
          # Quoted so YAML reads the string "null" rather than a null value.
          # This receiver has no notifier configs, so alerts routed to it are
          # not delivered anywhere.
          - name: "null"
          - name: pushover
            pushover_configs:
              - url_title: View in Alertmanager
                token: "${SECRET_PUSHOVER_ALERT_MANAGER_APIKEY}"
                user_key: "${SECRET_PUSHOVER_USERKEY}"
                send_resolved: true
                html: true
                sound: gamelan
                # Priority 1 while firing, 0 once resolved.
                priority: |-
                  {{ if eq .Status "firing" }}1{{ else }}0{{ end }}
                # "<alertname> [FIRING:<count>]" or "<alertname> [RESOLVED]".
                title: |-
                  {{ .CommonLabels.alertname }} [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
                # Per alert: first non-empty of description / summary /
                # message annotations, followed by every label as name: value.
                # NOTE(review): the line breaks inside this template were
                # reconstructed; whitespace outside `{{- ... }}` trims is part
                # of the rendered text — compare with a known-good copy.
                message: |-
                  {{- range .Alerts }}
                    {{- if ne .Annotations.description "" }}
                      {{ .Annotations.description }}
                    {{- else if ne .Annotations.summary "" }}
                      {{ .Annotations.summary }}
                    {{- else if ne .Annotations.message "" }}
                      {{ .Annotations.message }}
                    {{- else }}
                      Alert description not available
                    {{- end }}
                    {{- if gt (len .Labels.SortedPairs) 0 }}
                      {{- range .Labels.SortedPairs }}
                        {{ .Name }}: {{ .Value }}
                      {{- end }}
                    {{- end }}
                  {{- end }}
        route:
          group_by: ["alertname", "job"]
          group_wait: 1m
          group_interval: 10m
          repeat_interval: 12h
          # Default receiver for anything not matched by a child route.
          receiver: pushover
          routes:
            # Watchdog is re-sent to the heartbeat webhook every 5 minutes.
            - receiver: heartbeat
              group_wait: 0s
              group_interval: 5m
              repeat_interval: 5m
              matchers:
                - alertname =~ "Watchdog"
            # InfoInhibitor goes to the notifier-less "null" receiver
            # (quoted for the same reason as the receiver name above).
            - receiver: "null"
              matchers:
                - alertname =~ "InfoInhibitor"
            - receiver: pushover
              matchers:
                - severity = "critical"
              continue: true
        # A critical alert mutes warning alerts that share its alertname and
        # namespace.
        inhibit_rules:
          - source_matchers:
              - severity = "critical"
            target_matchers:
              - severity = "warning"
            equal: ["alertname", "namespace"]
      ingress:
        enabled: true
        pathType: Prefix
        ingressClassName: nginx
        hosts:
          - &alertmanagerHost alert-manager.valinor.social
        tls:
          - hosts:
              - *alertmanagerHost
      alertmanagerSpec:
        replicas: 3
        storage:
          volumeClaimTemplate:
            spec:
              storageClassName: ceph-block
              resources:
                requests:
                  storage: 1Gi

    # Grafana itself is not deployed by this release, but the chart's
    # dashboards are still rendered (forceDeployDashboards).
    grafana:
      enabled: false
      forceDeployDashboards: true
      sidecar:
        dashboards:
          multicluster:
            etcd:
              enabled: true

    kube-state-metrics:
      metricLabelsAllowlist:
        - "pods=[*]"
        - "deployments=[*]"
        - "persistentvolumeclaims=[*]"
      prometheus:
        monitor:
          enabled: true
          relabelings:
            # Copy the pod's node name into a `kubernetes_node` label.
            - action: replace
              regex: (.*)
              replacement: $1
              sourceLabels: ["__meta_kubernetes_pod_node_name"]
              targetLabel: kubernetes_node

    kubelet:
      enabled: true
      serviceMonitor:
        metricRelabelings:
          # Remove duplicate metrics: keep only series whose name starts with
          # one of the listed prefixes followed by "_".
          - sourceLabels: ["__name__"]
            regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|authentication_token|cadvisor_version|container_blkio|container_cpu|container_fs|container_last|container_memory|container_network|container_oom|container_processes|container|csi_operations|disabled_metric|get_token|go|hidden_metric|kubelet_certificate|kubelet_cgroup|kubelet_container|kubelet_containers|kubelet_cpu|kubelet_device|kubelet_graceful|kubelet_http|kubelet_lifecycle|kubelet_managed|kubelet_node|kubelet_pleg|kubelet_pod|kubelet_run|kubelet_running|kubelet_runtime|kubelet_server|kubelet_started|kubelet_volume|kubernetes_build|kubernetes_feature|machine_cpu|machine_memory|machine_nvm|machine_scrape|node_namespace|plugin_manager|prober_probe|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|storage_operation|volume_manager|volume_operation|workqueue)_(.+)"
            action: keep
          # Overwrite `instance` with the value of the `node` label.
          - sourceLabels: ["node"]
            targetLabel: instance
            action: replace

    kubeApiServer:
      enabled: true
      serviceMonitor:
        metricRelabelings:
          # Remove duplicate metrics: keep only series whose name starts with
          # one of the listed prefixes followed by "_".
          - sourceLabels: ["__name__"]
            regex: "(aggregator_openapi|aggregator_unavailable|apiextensions_openapi|apiserver_admission|apiserver_audit|apiserver_cache|apiserver_cel|apiserver_client|apiserver_crd|apiserver_current|apiserver_envelope|apiserver_flowcontrol|apiserver_init|apiserver_kube|apiserver_longrunning|apiserver_request|apiserver_requested|apiserver_response|apiserver_selfrequest|apiserver_storage|apiserver_terminated|apiserver_tls|apiserver_watch|apiserver_webhooks|authenticated_user|authentication|disabled_metric|etcd_bookmark|etcd_lease|etcd_request|field_validation|get_token|go|grpc_client|hidden_metric|kube_apiserver|kubernetes_build|kubernetes_feature|node_authorizer|pod_security|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|serviceaccount_legacy|serviceaccount_stale|serviceaccount_valid|watch_cache|workqueue)_(.+)"
            action: keep
          # Remove high cardinality metrics
          - sourceLabels: ["__name__"]
            regex: (apiserver|etcd|rest_client)_request(|_sli|_slo)_duration_seconds_bucket
            action: drop
          - sourceLabels: ["__name__"]
            regex: (apiserver_response_sizes_bucket|apiserver_watch_events_sizes_bucket)
            action: drop

    kubeControllerManager:
      enabled: false
    kubeEtcd:
      enabled: false
    kubeProxy:
      enabled: false  # Disabled because eBPF
    kubeScheduler:
      enabled: false

    prometheus:
      ingress:
        enabled: true
        ingressClassName: nginx
        pathType: Prefix
        hosts:
          - &prometheusHost prometheus.valinor.social
        tls:
          - hosts:
              - *prometheusHost
      thanosService:
        enabled: true
      thanosServiceMonitor:
        enabled: true
      prometheusSpec:
        replicas: 3
        replicaExternalLabelName: __replica__
        # With these set to false the selectors are not restricted to objects
        # created by this Helm release.
        ruleSelectorNilUsesHelmValues: false
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        probeSelectorNilUsesHelmValues: false
        enableAdminAPI: true
        walCompression: true
        thanos:
          image: quay.io/thanos/thanos:v0.32.3
          # Object-store settings are read from key `objstore.yml` of the
          # `thanos-objstore-secret` Secret.
          objectStorageConfig:
            name: thanos-objstore-secret
            key: objstore.yml
        retention: 2d
        retentionSize: 15GB
        additionalScrapeConfigs:
          - job_name: statsd-exporter
            scrape_interval: 1m
            scrape_timeout: 10s
            honor_timestamps: true
            static_configs:
              - targets:
                  - statsd-exporter.fediverse.svc.cluster.local:9102
        # default zalando postgres cluster
        # NOTE(review): the comment above followed the statsd-exporter target
        # in the original; it looks like a leftover from a removed scrape
        # job — confirm and delete if so.
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: ceph-block
              resources:
                requests:
                  storage: 20Gi
  # Injects BUCKET_NAME from the thanos-bucket-v1 ConfigMap into the values.
  # NOTE(review): `objstoreConfig` is not referenced anywhere else in the
  # values above; this entry may have been copied from the thanos release —
  # confirm that this chart actually consumes the path.
  valuesFrom:
    - targetPath: objstoreConfig.config.bucket
      kind: ConfigMap
      name: thanos-bucket-v1
      valuesKey: BUCKET_NAME