---
# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/helmrelease-helm-v2beta1.json
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
  namespace: monitoring
spec:
  interval: 30m
  timeout: 15m
  chart:
    spec:
      chart: kube-prometheus-stack
      version: 55.3.1
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        namespace: flux-system
  maxHistory: 2
  install:
    createNamespace: true
    crds: CreateReplace
    remediation:
      retries: 3
  upgrade:
    cleanupOnFail: true
    crds: CreateReplace
    remediation:
      retries: 3
  uninstall:
    keepHistory: false
  dependsOn:
    - name: thanos
      namespace: monitoring
  values:
    cleanPrometheusOperatorObjectNames: true
    alertmanager:
      config:
        global:
          resolve_timeout: 5m
        receivers:
          - name: heartbeat
            webhook_configs:
              - url: "${SECRET_HEALTHCHECKS_WEBHOOK}"
                send_resolved: true
          - name: "null" # quote
          - name: pushover
            pushover_configs:
              - url_title: View in Alertmanager
                token: "${SECRET_PUSHOVER_ALERT_MANAGER_APIKEY}"
                user_key: "${SECRET_PUSHOVER_USERKEY}"
                send_resolved: true
                html: true
                sound: gamelan
                priority: |-
                  {{ if eq .Status "firing" }}1{{ else }}0{{ end }}
                title: |-
                   {{ .CommonLabels.alertname }} [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
                message: |-
                  {{- range .Alerts }}
                    {{- if ne .Annotations.description "" }}
                      {{ .Annotations.description }}
                    {{- else if ne .Annotations.summary "" }}
                      {{ .Annotations.summary }}
                    {{- else if ne .Annotations.message "" }}
                      {{ .Annotations.message }}
                    {{- else }}
                      Alert description not available
                    {{- end }}
                    {{- if gt (len .Labels.SortedPairs) 0 }}
                      <small>
                      {{- range .Labels.SortedPairs }}
                        <b>{{ .Name }}:</b> {{ .Value }}
                      {{- end }}
                      </small>
                    {{- end }}
                  {{- end }}
        route:
          group_by: ["alertname", "job"]
          group_wait: 1m
          group_interval: 10m
          repeat_interval: 12h
          receiver: pushover
          routes:
            - receiver: heartbeat
              group_wait: 0s
              group_interval: 5m
              repeat_interval: 5m
              matchers:
                - alertname =~ "Watchdog"
            - receiver: "null" # quote
              matchers:
                - alertname =~ "InfoInhibitor"
            - receiver: pushover
              matchers:
                - severity = "critical"
              continue: true
        inhibit_rules:
          - source_matchers:
              - severity = "critical"
            target_matchers:
              - severity = "warning"
            equal: ["alertname", "namespace"]
      ingress:
        enabled: true
        pathType: Prefix
        ingressClassName: hsn-nginx
        hosts:
          - &host alert-manager.valinor.social
        tls:
          - hosts:
              - *host
      alertmanagerSpec:
        replicas: 3
        storage:
          volumeClaimTemplate:
            spec:
              storageClassName: local-hostpath
              resources:
                requests:
                  storage: 1Gi
    grafana:
      enabled: false
      forceDeployDashboards: true
      sidecar:
        dashboards:
          multicluster:
            etcd:
              enabled: true
    kube-state-metrics:
      metricLabelsAllowlist:
        - "pods=[*]"
        - "deployments=[*]"
        - "persistentvolumeclaims=[*]"
      prometheus:
        monitor:
          enabled: true
          relabelings:
            - action: replace
              regex: (.*)
              replacement: $1
              sourceLabels: ["__meta_kubernetes_pod_node_name"]
              targetLabel: kubernetes_node
    kubelet:
      enabled: true
      serviceMonitor:
        metricRelabelings:
          # Remove duplicate metrics
          - sourceLabels: ["__name__"]
            regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|authentication_token|cadvisor_version|container_blkio|container_cpu|container_fs|container_last|container_memory|container_network|container_oom|container_processes|container|csi_operations|disabled_metric|get_token|go|hidden_metric|kubelet_certificate|kubelet_cgroup|kubelet_container|kubelet_containers|kubelet_cpu|kubelet_device|kubelet_graceful|kubelet_http|kubelet_lifecycle|kubelet_managed|kubelet_node|kubelet_pleg|kubelet_pod|kubelet_run|kubelet_running|kubelet_runtime|kubelet_server|kubelet_started|kubelet_volume|kubernetes_build|kubernetes_feature|machine_cpu|machine_memory|machine_nvm|machine_scrape|node_namespace|plugin_manager|prober_probe|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|storage_operation|volume_manager|volume_operation|workqueue)_(.+)"
            action: keep
          - sourceLabels: ["node"]
            targetLabel: instance
            action: replace
    kubeApiServer:
      enabled: true
      serviceMonitor:
        metricRelabelings:
          # Remove duplicate metrics
          - sourceLabels: ["__name__"]
            regex: "(aggregator_openapi|aggregator_unavailable|apiextensions_openapi|apiserver_admission|apiserver_audit|apiserver_cache|apiserver_cel|apiserver_client|apiserver_crd|apiserver_current|apiserver_envelope|apiserver_flowcontrol|apiserver_init|apiserver_kube|apiserver_longrunning|apiserver_request|apiserver_requested|apiserver_response|apiserver_selfrequest|apiserver_storage|apiserver_terminated|apiserver_tls|apiserver_watch|apiserver_webhooks|authenticated_user|authentication|disabled_metric|etcd_bookmark|etcd_lease|etcd_request|field_validation|get_token|go|grpc_client|hidden_metric|kube_apiserver|kubernetes_build|kubernetes_feature|node_authorizer|pod_security|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|serviceaccount_legacy|serviceaccount_stale|serviceaccount_valid|watch_cache|workqueue)_(.+)"
            action: keep
          # Remove high cardinality metrics
          - sourceLabels: ["__name__"]
            regex: (apiserver|etcd|rest_client)_request(|_sli|_slo)_duration_seconds_bucket
            action: drop
          - sourceLabels: ["__name__"]
            regex: (apiserver_response_sizes_bucket|apiserver_watch_events_sizes_bucket)
            action: drop
    kubeControllerManager:
      enabled: false
    kubeEtcd:
      enabled: false
    kubeProxy:
      enabled: false # Disabled because eBPF
    kubeScheduler:
      enabled: false
    prometheus:
      ingress:
        enabled: false
      thanosService:
        enabled: true
      thanosServiceMonitor:
        enabled: true
      prometheusSpec:
        replicas: 3
        replicaExternalLabelName: __replica__
        ruleSelectorNilUsesHelmValues: false
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        probeSelectorNilUsesHelmValues: false
        scrapeConfigSelectorNilUsesHelmValues: false
        enableAdminAPI: true
        walCompression: true
        thanos:
          image: quay.io/thanos/thanos:v0.32.5
          objectStorageConfig:
            name: thanos-s3-secret
            key: objstore.yml
        retention: 2d
        retentionSize: 15GB
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-hostpath
              resources:
                requests:
                  storage: 20Gi