From fea9d8227d6060b00e97016e81caed08bcf34f43 Mon Sep 17 00:00:00 2001 From: Joseph Hanson Date: Tue, 27 Feb 2024 12:05:24 -0600 Subject: [PATCH] Adding the first part of monitoring stack. --- .../grafana/app/externalsecret.yaml | 32 +++ .../grafana/app/postgresCluster.yaml | 86 ++++++++ kubernetes/apps/observability/grafana/ks.yaml | 27 +++ .../app/externalsecret.yaml | 22 ++ .../app/helmrelease.yaml | 203 ++++++++++++++++++ .../app/kustomization.yaml | 15 ++ .../app/prometheusrules/kustomization.yaml | 6 + .../app/prometheusrules/prometheusrule.yaml | 37 ++++ .../app/resources/alertmanager.yaml | 68 ++++++ .../app/scrapeconfigs/kustomization.yaml | 6 + .../app/scrapeconfigs/node-exporter.yaml | 11 + .../kube-prometheus-stack/ks.yaml | 27 +++ .../apps/observability/kustomization.yaml | 17 ++ kubernetes/apps/observability/namespace.yaml | 8 + .../thanos/app/externalsecret.yaml | 31 +++ .../observability/thanos/app/helmrelease.yaml | 132 ++++++++++++ .../thanos/app/kustomization.yaml | 7 + kubernetes/apps/observability/thanos/ks.yaml | 0 .../flux/vars/cluster-secrets.sops.yaml | 21 +- 19 files changed, 743 insertions(+), 13 deletions(-) create mode 100644 kubernetes/apps/observability/grafana/app/externalsecret.yaml create mode 100644 kubernetes/apps/observability/grafana/app/postgresCluster.yaml create mode 100644 kubernetes/apps/observability/grafana/ks.yaml create mode 100644 kubernetes/apps/observability/kube-prometheus-stack/app/externalsecret.yaml create mode 100644 kubernetes/apps/observability/kube-prometheus-stack/app/helmrelease.yaml create mode 100644 kubernetes/apps/observability/kube-prometheus-stack/app/kustomization.yaml create mode 100644 kubernetes/apps/observability/kube-prometheus-stack/app/prometheusrules/kustomization.yaml create mode 100644 kubernetes/apps/observability/kube-prometheus-stack/app/prometheusrules/prometheusrule.yaml create mode 100644 kubernetes/apps/observability/kube-prometheus-stack/app/resources/alertmanager.yaml create mode 
100644 kubernetes/apps/observability/kube-prometheus-stack/app/scrapeconfigs/kustomization.yaml create mode 100644 kubernetes/apps/observability/kube-prometheus-stack/app/scrapeconfigs/node-exporter.yaml create mode 100644 kubernetes/apps/observability/kube-prometheus-stack/ks.yaml create mode 100644 kubernetes/apps/observability/kustomization.yaml create mode 100644 kubernetes/apps/observability/namespace.yaml create mode 100644 kubernetes/apps/observability/thanos/app/externalsecret.yaml create mode 100644 kubernetes/apps/observability/thanos/app/helmrelease.yaml create mode 100644 kubernetes/apps/observability/thanos/app/kustomization.yaml create mode 100644 kubernetes/apps/observability/thanos/ks.yaml diff --git a/kubernetes/apps/observability/grafana/app/externalsecret.yaml b/kubernetes/apps/observability/grafana/app/externalsecret.yaml new file mode 100644 index 00000000..e31a8283 --- /dev/null +++ b/kubernetes/apps/observability/grafana/app/externalsecret.yaml @@ -0,0 +1,32 @@ +--- +# yaml-language-server: $schema=https://ks.hsn.dev/external-secrets.io/externalsecret_v1beta1.json +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: grafana +spec: + secretStoreRef: + kind: ClusterSecretStore + name: onepassword-connect + target: + name: grafana-secret + template: + engineVersion: v2 + data: + GF_DATABASE_NAME: &dbName grafana + GF_DATABASE_HOST: postgres16-rw.database.svc.cluster.local:5432 + GF_DATABASE_USER: &dbUser "{{ .GRAFANA_POSTGRES_USER }}" + GF_DATABASE_PASSWORD: &dbPass "{{ .GRAFANA_POSTGRES_PASS }}" + GF_DATABASE_SSL_MODE: disable + GF_DATABASE_TYPE: postgres + GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET: "{{ .GRAFANA_OAUTH_CLIENT_SECRET }}" + INIT_POSTGRES_DBNAME: *dbName + INIT_POSTGRES_HOST: postgres16-rw.database.svc.cluster.local + INIT_POSTGRES_USER: *dbUser + INIT_POSTGRES_PASS: *dbPass + INIT_POSTGRES_SUPER_PASS: "{{ .POSTGRES_SUPER_PASS }}" + dataFrom: + - extract: + key: grafana + - extract: + key: cloudnative-pg \ No 
newline at end of file diff --git a/kubernetes/apps/observability/grafana/app/postgresCluster.yaml b/kubernetes/apps/observability/grafana/app/postgresCluster.yaml new file mode 100644 index 00000000..8a2d007b --- /dev/null +++ b/kubernetes/apps/observability/grafana/app/postgresCluster.yaml @@ -0,0 +1,86 @@ +--- +# yaml-language-server: $schema=https://ks.hsn.dev/postgres-operator.crunchydata.com/postgrescluster_v1beta1.json +apiVersion: postgres-operator.crunchydata.com/v1beta1 +kind: PostgresCluster +metadata: + name: "${APP}" +spec: + postgresVersion: 16 + dataSource: + pgbackrest: + stanza: db + configuration: + - secret: + name: pgo-s3-creds + global: + repo1-path: "/${APP}/repo1" + repo1-s3-uri-style: path + repo: + name: repo1 + s3: + bucket: "crunchy-postgres" + endpoint: "s3.hsn.dev" + region: "us-east-1" + patroni: + dynamicConfiguration: + synchronous_mode: true + postgresql: + synchronous_commit: "on" + pg_hba: + - hostnossl all all 10.32.0.0/16 md5 + - hostssl all all all md5 + instances: + - name: postgres + metadata: + labels: + app.kubernetes.io/name: pgo-${APP} + replicas: 2 + dataVolumeClaimSpec: + storageClassName: openebs-hostpath + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: "kubernetes.io/hostname" + whenUnsatisfiable: "DoNotSchedule" + labelSelector: + matchLabels: + postgres-operator.crunchydata.com/cluster: ${APP} + postgres-operator.crunchydata.com/data: postgres + users: + - name: "grafana" + databases: + - "grafana" + options: "SUPERUSER" + password: + type: AlphaNumeric + backups: + pgbackrest: + configuration: + - secret: + name: pgo-s3-creds + global: + archive-push-queue-max: 4GiB + repo1-retention-full: "14" + repo1-retention-full-type: time + repo1-path: "/${APP}/repo1" + repo1-s3-uri-style: path + manual: + repoName: repo1 + options: + - --type=full + metadata: + labels: + app.kubernetes.io/name: pgo-${APP}-backup + repos: + - name: repo1 + 
schedules: + full: "0 1 * * 0" + differential: "0 1 * * 1-6" + s3: + bucket: "crunchy-postgres" + endpoint: "s3.hsn.dev" + region: "us-east-1" diff --git a/kubernetes/apps/observability/grafana/ks.yaml b/kubernetes/apps/observability/grafana/ks.yaml new file mode 100644 index 00000000..49bec5fa --- /dev/null +++ b/kubernetes/apps/observability/grafana/ks.yaml @@ -0,0 +1,27 @@ +--- +# yaml-language-server: $schema=https://ks.hsn.dev/kustomize.toolkit.fluxcd.io/kustomization_v1.json +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: &app grafana + namespace: flux-system +spec: + targetNamespace: observability + commonMetadata: + labels: + app.kubernetes.io/name: *app + dependsOn: + - name: crunchy-postgres-operator + - name: external-secrets-stores + path: ./kubernetes/apps/observability/grafana/app + prune: true + sourceRef: + kind: GitRepository + name: homelab + wait: false + interval: 30m + retryInterval: 1m + timeout: 5m + postBuild: + substitute: + APP: *app \ No newline at end of file diff --git a/kubernetes/apps/observability/kube-prometheus-stack/app/externalsecret.yaml b/kubernetes/apps/observability/kube-prometheus-stack/app/externalsecret.yaml new file mode 100644 index 00000000..e1cb5624 --- /dev/null +++ b/kubernetes/apps/observability/kube-prometheus-stack/app/externalsecret.yaml @@ -0,0 +1,22 @@ +--- +# yaml-language-server: $schema=https://ks.hsn.dev/external-secrets.io/externalsecret_v1beta1.json +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: alertmanager +spec: + refreshInterval: 5m + secretStoreRef: + kind: ClusterSecretStore + name: onepassword-connect + target: + name: alertmanager-secret + template: + templateFrom: + - configMap: + name: alertmanager-config-tpl + items: + - key: alertmanager.yaml + dataFrom: + - extract: + key: pushover \ No newline at end of file diff --git a/kubernetes/apps/observability/kube-prometheus-stack/app/helmrelease.yaml 
b/kubernetes/apps/observability/kube-prometheus-stack/app/helmrelease.yaml new file mode 100644 index 00000000..8a589f0c --- /dev/null +++ b/kubernetes/apps/observability/kube-prometheus-stack/app/helmrelease.yaml @@ -0,0 +1,203 @@ +--- +# yaml-language-server: $schema=https://ks.hsn.dev/helm.toolkit.fluxcd.io/helmrelease_v2beta2.json +apiVersion: helm.toolkit.fluxcd.io/v2beta2 +kind: HelmRelease +metadata: + name: kube-prometheus-stack +spec: + interval: 30m + timeout: 15m + chart: + spec: + chart: kube-prometheus-stack + version: 56.13.0 + sourceRef: + kind: HelmRepository + name: prometheus-community + namespace: flux-system + install: + crds: CreateReplace + remediation: + retries: 3 + upgrade: + cleanupOnFail: true + crds: CreateReplace + remediation: + retries: 3 + uninstall: + keepHistory: false + dependsOn: + - name: openebs + namespace: openebs-system + - name: thanos + namespace: observability + values: + crds: + enabled: true + cleanPrometheusOperatorObjectNames: true + alertmanager: + ingress: + enabled: true + pathType: Prefix + ingressClassName: internal-nginx + hosts: + - &host alertmanager.jahanson.tech + tls: + - hosts: + - *host + alertmanagerSpec: + replicas: 2 + useExistingSecret: true + configSecret: alertmanager-secret + storage: + volumeClaimTemplate: + spec: + storageClassName: openebs-hostpath + resources: + requests: + storage: 1Gi + kubelet: + enabled: true + serviceMonitor: + metricRelabelings: + # Drop high cardinality labels + - action: labeldrop + regex: (uid) + - action: labeldrop + regex: (id|name) + - action: drop + sourceLabels: ["__name__"] + regex: (rest_client_request_duration_seconds_bucket|rest_client_request_duration_seconds_sum|rest_client_request_duration_seconds_count) + kubeApiServer: + enabled: true + serviceMonitor: + metricRelabelings: + # Drop high cardinality labels + - action: drop + sourceLabels: ["__name__"] + regex: (apiserver|etcd|rest_client)_request(|_sli|_slo)_duration_seconds_bucket + - action: drop + 
sourceLabels: ["__name__"] + regex: (apiserver_response_sizes_bucket|apiserver_watch_events_sizes_bucket) + kubeControllerManager: + enabled: true + endpoints: &cp + - 192.168.1.61 + - 192.168.1.62 + - 192.168.1.63 + kubeEtcd: + enabled: true + endpoints: *cp + kubeScheduler: + enabled: true + endpoints: *cp + kubeProxy: + enabled: false + prometheus: + ingress: + enabled: true + ingressClassName: internal-nginx + pathType: Prefix + hosts: + - &host prometheus.jahanson.tech + tls: + - hosts: + - *host + thanosService: + enabled: true + thanosServiceMonitor: + enabled: true + thanosServiceExternal: + enabled: true + type: LoadBalancer + annotations: + external-dns.alpha.kubernetes.io/hostname: thanos.jahanson.tech + io.cilium/lb-ipam-ips: 10.45.0.6 + externalTrafficPolicy: Cluster + prometheusSpec: + replicas: 2 + replicaExternalLabelName: __replica__ + ruleSelectorNilUsesHelmValues: false + serviceMonitorSelectorNilUsesHelmValues: false + podMonitorSelectorNilUsesHelmValues: false + probeSelectorNilUsesHelmValues: false + scrapeConfigSelectorNilUsesHelmValues: false + enableAdminAPI: true + walCompression: true + enableFeatures: + - auto-gomaxprocs + - memory-snapshot-on-shutdown + - new-service-discovery-manager + thanos: + image: quay.io/thanos/thanos:${THANOS_VERSION} + version: "${THANOS_VERSION#v}" + objectStorageConfig: + existingSecret: + name: thanos-s3-secret + key: objstore.yml + retention: 2d + retentionSize: 15GB + externalLabels: + cluster: main + storageSpec: + volumeClaimTemplate: + spec: + storageClassName: openebs-hostpath + resources: + requests: + storage: 20Gi + nodeExporter: + enabled: true + prometheus-node-exporter: + fullnameOverride: node-exporter + prometheus: + monitor: + enabled: true + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: kubernetes_node + kubeStateMetrics: + enabled: true + kube-state-metrics: + fullnameOverride: kube-state-metrics +
metricLabelsAllowlist: + - pods=[*] + - deployments=[*] + - persistentvolumeclaims=[*] + prometheus: + monitor: + enabled: true + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: kubernetes_node + grafana: + enabled: false + forceDeployDashboards: true + sidecar: + dashboards: + multicluster: + etcd: + enabled: true + postRenderers: + - kustomize: + patches: + - target: + version: v1 + kind: ConfigMap + labelSelector: grafana_dashboard in (1) + patch: |- + apiVersion: v1 + kind: ConfigMap + metadata: + name: not-used + namespace: not-used + annotations: + grafana_folder: Kubernetes \ No newline at end of file diff --git a/kubernetes/apps/observability/kube-prometheus-stack/app/kustomization.yaml b/kubernetes/apps/observability/kube-prometheus-stack/app/kustomization.yaml new file mode 100644 index 00000000..02cf8583 --- /dev/null +++ b/kubernetes/apps/observability/kube-prometheus-stack/app/kustomization.yaml @@ -0,0 +1,15 @@ +--- +# yaml-language-server: $schema=https://json.schemastore.org/kustomization +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ./externalsecret.yaml + - ./helmrelease.yaml + - ./prometheusrules + - ./scrapeconfigs +configMapGenerator: + - name: alertmanager-config-tpl + files: + - alertmanager.yaml=./resources/alertmanager.yaml +generatorOptions: + disableNameSuffixHash: true \ No newline at end of file diff --git a/kubernetes/apps/observability/kube-prometheus-stack/app/prometheusrules/kustomization.yaml b/kubernetes/apps/observability/kube-prometheus-stack/app/prometheusrules/kustomization.yaml new file mode 100644 index 00000000..9fe10844 --- /dev/null +++ b/kubernetes/apps/observability/kube-prometheus-stack/app/prometheusrules/kustomization.yaml @@ -0,0 +1,6 @@ +--- +# yaml-language-server: $schema=https://json.schemastore.org/kustomization +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - 
./prometheusrule.yaml \ No newline at end of file diff --git a/kubernetes/apps/observability/kube-prometheus-stack/app/prometheusrules/prometheusrule.yaml b/kubernetes/apps/observability/kube-prometheus-stack/app/prometheusrules/prometheusrule.yaml new file mode 100644 index 00000000..5efb2c3c --- /dev/null +++ b/kubernetes/apps/observability/kube-prometheus-stack/app/prometheusrules/prometheusrule.yaml @@ -0,0 +1,37 @@ +--- +# yaml-language-server: $schema=https://ks.hsn.dev/monitoring.coreos.com/prometheusrule_v1.json +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: miscellaneous-rules + labels: + prometheus: k8s + role: alert-rules +spec: + groups: + - name: dockerhub + rules: + - alert: BootstrapRateLimitRisk + annotations: + summary: Kubernetes cluster at risk of being rate limited by dockerhub on bootstrap + expr: count(time() - container_last_seen{image=~"(docker.io).*",container!=""} < 30) > 100 + for: 15m + labels: + severity: critical + - name: oom + rules: + - alert: OOMKilled + annotations: + summary: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes. 
+ expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1 + labels: + severity: critical + - name: zfs + rules: + - alert: ZfsUnexpectedPoolState + annotations: + summary: ZFS pool {{$labels.zpool}} on {{$labels.instance}} is in an unexpected state {{$labels.state}} + expr: node_zfs_zpool_state{state!="online"} > 0 + for: 15m + labels: + severity: critical \ No newline at end of file diff --git a/kubernetes/apps/observability/kube-prometheus-stack/app/resources/alertmanager.yaml b/kubernetes/apps/observability/kube-prometheus-stack/app/resources/alertmanager.yaml new file mode 100644 index 00000000..36cd9b2e --- /dev/null +++ b/kubernetes/apps/observability/kube-prometheus-stack/app/resources/alertmanager.yaml @@ -0,0 +1,68 @@ +--- +global: + resolve_timeout: 5m +route: + group_by: ["alertname", "job"] + group_interval: 10m + group_wait: 1m + receiver: pushover + repeat_interval: 12h + routes: + - receiver: heartbeat + group_interval: 5m + group_wait: 0s + matchers: + - alertname =~ "Watchdog" + repeat_interval: 5m + - receiver: "null" + matchers: + - alertname =~ "InfoInhibitor" + - receiver: pushover + continue: true + matchers: + - severity = "critical" +inhibit_rules: + - equal: ["alertname", "namespace"] + source_matchers: + - severity = "critical" + target_matchers: + - severity = "warning" +receivers: + - name: heartbeat + webhook_configs: + - send_resolved: true + url: "{{ .alertmanager_heartbeat_url }}" + - name: "null" + - name: pushover + pushover_configs: + - html: true + # Compooters are hard + message: |- + {{ "{{-" }} range .Alerts {{ "}}" }} + {{ "{{-" }} if ne .Annotations.description "" {{ "}}" }} + {{ "{{" }} .Annotations.description {{ "}}" }} + {{ "{{-" }} else if ne .Annotations.summary "" {{ "}}" }} + {{ "{{" }} .Annotations.summary {{ "}}" }} + {{ "{{-" }} else if ne
.Annotations.message "" {{ "}}" }} + {{ "{{" }} .Annotations.message {{ "}}" }} + {{ "{{-" }} else {{ "}}" }} + Alert description not available + {{ "{{-" }} end {{ "}}" }} + {{ "{{-" }} if gt (len .Labels.SortedPairs) 0 {{ "}}" }} + + {{ "{{-" }} range .Labels.SortedPairs {{ "}}" }} + {{ "{{" }} .Name {{ "}}" }}: {{ "{{" }} .Value {{ "}}" }} + {{ "{{-" }} end {{ "}}" }} + + {{ "{{-" }} end {{ "}}" }} + {{ "{{-" }} end {{ "}}" }} + priority: |- + {{ "{{" }} if eq .Status "firing" {{ "}}" }}1{{ "{{" }} else {{ "}}" }}0{{ "{{" }} end {{ "}}" }} + send_resolved: true + sound: gamelan + title: >- + {{ "{{" }} .CommonLabels.alertname {{ "}}" }} + [{{ "{{" }} .Status | toUpper {{ "}}" }}{{ "{{" }} if eq .Status "firing" {{ "}}" }}:{{ "{{" }} .Alerts.Firing | len {{ "}}" }}{{ "{{" }} end {{ "}}" }}] + token: "{{ .alertmanager_token }}" + url_title: View in Alertmanager + user_key: "{{ .userkey_jahanson }}" \ No newline at end of file diff --git a/kubernetes/apps/observability/kube-prometheus-stack/app/scrapeconfigs/kustomization.yaml b/kubernetes/apps/observability/kube-prometheus-stack/app/scrapeconfigs/kustomization.yaml new file mode 100644 index 00000000..93f0b824 --- /dev/null +++ b/kubernetes/apps/observability/kube-prometheus-stack/app/scrapeconfigs/kustomization.yaml @@ -0,0 +1,6 @@ +--- +# yaml-language-server: $schema=https://json.schemastore.org/kustomization +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ./node-exporter.yaml \ No newline at end of file diff --git a/kubernetes/apps/observability/kube-prometheus-stack/app/scrapeconfigs/node-exporter.yaml b/kubernetes/apps/observability/kube-prometheus-stack/app/scrapeconfigs/node-exporter.yaml new file mode 100644 index 00000000..68e210dd --- /dev/null +++ b/kubernetes/apps/observability/kube-prometheus-stack/app/scrapeconfigs/node-exporter.yaml @@ -0,0 +1,11 @@ +--- +# yaml-language-server: $schema=https://ks.hsn.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json 
+apiVersion: monitoring.coreos.com/v1alpha1 +kind: ScrapeConfig +metadata: + name: node-exporter +spec: + staticConfigs: + - targets: + - 10.1.1.1:9100 + metricsPath: /metrics \ No newline at end of file diff --git a/kubernetes/apps/observability/kube-prometheus-stack/ks.yaml b/kubernetes/apps/observability/kube-prometheus-stack/ks.yaml new file mode 100644 index 00000000..eb5690c4 --- /dev/null +++ b/kubernetes/apps/observability/kube-prometheus-stack/ks.yaml @@ -0,0 +1,27 @@ +--- +# yaml-language-server: $schema=https://ks.hsn.dev/kustomize.toolkit.fluxcd.io/kustomization_v1.json +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: &app kube-prometheus-stack + namespace: flux-system +spec: + targetNamespace: observability + commonMetadata: + labels: + app.kubernetes.io/name: *app + dependsOn: + - name: external-secrets-stores + path: ./kubernetes/apps/observability/kube-prometheus-stack/app + prune: true + sourceRef: + kind: GitRepository + name: homelab + wait: false + interval: 30m + retryInterval: 1m + timeout: 15m + postBuild: + substitute: + # renovate: datasource=docker depName=quay.io/thanos/thanos + THANOS_VERSION: v0.34.1 \ No newline at end of file diff --git a/kubernetes/apps/observability/kustomization.yaml b/kubernetes/apps/observability/kustomization.yaml new file mode 100644 index 00000000..0f69e331 --- /dev/null +++ b/kubernetes/apps/observability/kustomization.yaml @@ -0,0 +1,17 @@ +--- +# yaml-language-server: $schema=https://json.schemastore.org/kustomization +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + # Pre Flux-Kustomizations + - ./namespace.yaml + # Flux-Kustomizations + # - ./gatus/ks.yaml + # - ./grafana/ks.yaml + - ./kube-prometheus-stack/ks.yaml + # - ./loki/ks.yaml + # - ./smartctl-exporter/ks.yaml + # - ./snmp-exporter/ks.yaml + - ./thanos/ks.yaml + # - ./unpoller/ks.yaml + # - ./vector/ks.yaml \ No newline at end of file diff --git 
a/kubernetes/apps/observability/namespace.yaml b/kubernetes/apps/observability/namespace.yaml new file mode 100644 index 00000000..f062e931 --- /dev/null +++ b/kubernetes/apps/observability/namespace.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: observability + labels: + kustomize.toolkit.fluxcd.io/prune: disabled + pgo-enabled-hsn.dev: "true" \ No newline at end of file diff --git a/kubernetes/apps/observability/thanos/app/externalsecret.yaml b/kubernetes/apps/observability/thanos/app/externalsecret.yaml new file mode 100644 index 00000000..55907c06 --- /dev/null +++ b/kubernetes/apps/observability/thanos/app/externalsecret.yaml @@ -0,0 +1,31 @@ +--- +# yaml-language-server: $schema=https://ks.hsn.dev/external-secrets.io/externalsecret_v1beta1.json +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: thanos + namespace: observability +spec: + secretStoreRef: + kind: ClusterSecretStore + name: onepassword-connect + target: + name: thanos-s3-secret + creationPolicy: Owner + template: + engineVersion: v2 + data: + objstore.yml: |- + type: s3 + config: + access_key: {{ .s3_thanos_access_key }} + bucket: {{ .s3_thanos_bucket_name }} + endpoint: {{ .s3_homelab_endpoint }} + secret_key: {{ .s3_thanos_secret_key }} + dataFrom: + - extract: + key: Minio + rewrite: + - regexp: + source: "(.*)" + target: "s3_$1" diff --git a/kubernetes/apps/observability/thanos/app/helmrelease.yaml b/kubernetes/apps/observability/thanos/app/helmrelease.yaml new file mode 100644 index 00000000..b5fbbbfd --- /dev/null +++ b/kubernetes/apps/observability/thanos/app/helmrelease.yaml @@ -0,0 +1,132 @@ +--- +# yaml-language-server: $schema=https://ks.hsn.dev/helm.toolkit.fluxcd.io/helmrelease_v2beta2.json +apiVersion: helm.toolkit.fluxcd.io/v2beta2 +kind: HelmRelease +metadata: + name: thanos +spec: + interval: 30m + timeout: 15m + chart: + spec: + chart: thanos + version: 13.2.2 + sourceRef: + kind: HelmRepository + name: bitnami +
namespace: flux-system + install: + remediation: + retries: 3 + upgrade: + cleanupOnFail: true + remediation: + retries: 3 + uninstall: + keepHistory: false + dependsOn: + - name: openebs + namespace: openebs-system + - name: dragonfly-operator + namespace: dragonfly-operator-system + - name: rook-ceph-cluster + namespace: rook-ceph + values: + existingObjstoreSecret: thanos-s3-secret + image: + registry: quay.io + repository: thanos/thanos + tag: v0.34.1 + objstoreConfig: + type: s3 + config: + insecure: true + receive: + enabled: false + networkPolicy: + enabled: false + queryFrontend: + enabled: true + replicaCount: 2 + config: &config |- + type: REDIS + config: + addr: >- + dragonfly.database.svc.cluster.local:6379 + db: 13 + ingress: + enabled: true + ingressClassName: internal-nginx + hostname: &host thanos-query-frontend.jahanson.tech + tls: true + extraTls: + - hosts: + - *host + networkPolicy: + enabled: false + query: + enabled: true + replicaCount: 2 + replicaLabel: ["__replica__"] + dnsDiscovery: + sidecarsService: kube-prometheus-stack-thanos-discovery + sidecarsNamespace: observability + stores: ["thanos.jahanson.tech:10901"] + networkPolicy: + enabled: false + bucketweb: + enabled: true + replicaCount: 2 + networkPolicy: + enabled: false + compactor: + enabled: true + extraFlags: + - --compact.concurrency=4 + - --delete-delay=30m + retentionResolutionRaw: 14d + retentionResolution5m: 30d + retentionResolution1h: 60d + persistence: + enabled: true + storageClass: openebs-hostpath + size: 10Gi + networkPolicy: + enabled: false + storegateway: + enabled: true + replicaCount: 2 + config: *config + persistence: + enabled: true + storageClass: openebs-hostpath + size: 10Gi + networkPolicy: + enabled: false + ruler: + enabled: true + replicaCount: 2 + replicaLabel: __replica__ + alertmanagers: ["http://alertmanager-operated.observability.svc.cluster.local:9093"] + extraFlags: ["--web.prefix-header=X-Forwarded-Prefix"] + config: |- + groups: + - name:
PrometheusWatcher + rules: + - alert: PrometheusDown + annotations: + summary: A Prometheus has disappeared from Prometheus target discovery + expr: absent(up{job="kube-prometheus-stack-prometheus"}) + for: 5m + labels: + severity: critical + persistence: + enabled: true + storageClass: openebs-hostpath + size: 10Gi + networkPolicy: + enabled: false + metrics: + enabled: true + serviceMonitor: + enabled: true \ No newline at end of file diff --git a/kubernetes/apps/observability/thanos/app/kustomization.yaml b/kubernetes/apps/observability/thanos/app/kustomization.yaml new file mode 100644 index 00000000..7c9498fd --- /dev/null +++ b/kubernetes/apps/observability/thanos/app/kustomization.yaml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://json.schemastore.org/kustomization +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ./externalsecret.yaml + - ./helmrelease.yaml \ No newline at end of file diff --git a/kubernetes/apps/observability/thanos/ks.yaml b/kubernetes/apps/observability/thanos/ks.yaml new file mode 100644 index 00000000..e69de29b diff --git a/kubernetes/flux/vars/cluster-secrets.sops.yaml b/kubernetes/flux/vars/cluster-secrets.sops.yaml index 0f0237f2..8e5b5456 100644 --- a/kubernetes/flux/vars/cluster-secrets.sops.yaml +++ b/kubernetes/flux/vars/cluster-secrets.sops.yaml @@ -4,12 +4,7 @@ metadata: name: cluster-secrets namespace: flux-system stringData: - SECRET_PUSHOVER_USERKEY: ENC[AES256_GCM,data:HknjiEQXIa1zntN4yOlTQ/buKx2xppiQV7faAxIe,iv:A9sMptT1QcgQvuP8jqPUZDjqTa56kbsLBjITQvPQyF8=,tag:Sa5PIweT7OYuoq5YG43rpA==,type:str] - SECRET_PUSHOVER_ALERT_MANAGER_APIKEY: ENC[AES256_GCM,data:n0cFsAwCX1/y5HhsNxr/c2KT/5dzt55Ygi17rX+OV7cwKPKMImmLinb6GhD9fDIz1AINGBijXuXvD8TL,iv:4nwdHlSJEUSyMEDvh+5mhONXCGTJ3qyTITwG6CxeG3A=,tag:kurCrF2rGQFBF2u7Hhinuw==,type:str] - SECRET_HEALTHCHECKS_WEBHOOK: 
ENC[AES256_GCM,data:YG8/g4i8inIQnCIsQyEkPdNyVmbFYU4bhixacOEEEcuJMl8ax8TH1yBRl5ziQmBggp/CETorWCmNiC3jkUXYYta/znlo76T5,iv:SGdg9htpyFP38jbAJDg+zq4Rs+axgM5m3SsgBG38Bu8=,tag:TTIVFki9e03rqVvNmtsFuw==,type:str] - SECRET_CLOUDFLARE_ACCOUNT_ID: ENC[AES256_GCM,data:bKGSKh/TxNtCMRa83/i44fX7XC5mRxBLVeZ94UltjOo=,iv:Ji0tUnrvDywxMeCvNwBrG/a8JVudfK4sXYL8q0i/cz8=,tag:j4Bwvcz73RdIInsiz0F0JA==,type:str] - SECRET_CLUSTER_CLOUDFLARE_TUNNEL_ID: ENC[AES256_GCM,data:bl9psiIxkDTchopNuPNxaGy7fQWJLdZwfnqTi8AOSl5cFMAZ,iv:CKYrQHv8fiHU4312Wfo6XlMofiR6uWP+AafO1n1y970=,tag:iyceSr/VUtE2cNbndkmV1g==,type:str] - K8S_SERVICE_ENDPOINT: ENC[AES256_GCM,data:3s9EeJwFzDQ=,iv:a4oU9bf7ESscw6o9YqhBx8kRm/rL1l2ydjjd1ngn/P0=,tag:TAwJ2UmFuEHeHsEhfiVH9g==,type:str] + CLUSTER_SECRET_CLOUDFLARE_ACCOUNT_ID: ENC[AES256_GCM,data:bQvXy9wHJcVKCa9xb89Ji2VSBmsxPKuEXIG/+KiclmM=,iv:63JdSorOBh2uz98ajzdtydSbJH3wKEaX5fRP3LX8g9Q=,tag:NH7Y6EoWaEGVal7E0XHg0w==,type:str] sops: kms: [] gcp_kms: [] @@ -19,14 +14,14 @@ sops: - recipient: age1eqlaq205y5jre9hu5hvulywa7w3d4qyxwmafneamxcn7nejesedsf4q9g6 enc: | -----BEGIN AGE ENCRYPTED FILE----- - YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBwSC9CNFkwMHVLd0dWb0Jq - cnN0OUJzYVlYV2VRS3p2ek5UcHl4TXNQckhjCnlHQTVNNmdyZFF6RXhETlBzSW9v - S00ra2k2Y0VyWnJjcU9oWG5XVGJDQkkKLS0tIHB2bGxDOWhWci81aGViVFlsL0JE - ZGRUUFpKTXpjWW9HQ0R1VDk2RmVmQ2MKJwHW3q0vCZClJFfDrWSLw6C43vWVfyLr - 1ACvmNWml+xv/MOQwoRRMx6OVF74X83UyTFdVrXXk7SkzRcwQr4j+A== + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBUdHVVdXUyMUlYc01Va25F + aXg2YWVDdnQwQnRGMWE4SEJtUnNka216YkVRCks5SUJBMzIxY25PWXQzSlBybkdL + Smwxc1hscTlNdzkzUWVPaXBYNkg5RWsKLS0tIGg0UHU3NGlpR1I5RjAvK1NvS3hl + K3J3NTZHQlhIOEt6YnZ6QU5QZ0JLT3MKYyy736Q4oXmaryf+JLlgEoK64iGDlUDg + JbdxbEfCPh3xbuTAff5oU0LxX9XVsoKBO/8+ew6+P/8bcjeb9sNCEg== -----END AGE ENCRYPTED FILE----- - lastmodified: "2024-01-12T19:24:10Z" - mac: 
ENC[AES256_GCM,data:EdmF3LFSmBFe6Vn5LzVmOb6tyOYto4iwIfJlUL50pjIobvw073oTwd99NkZ9m6aXB2no6ghgPc2RU8jOAtK9gg71kvLOGP45VZ07zLbcxsM8iEkSp2UX2k07/WavdXXGY4yBswGCZgnuPKah6uVNs1s8zEQNCkQQu0D1Ukf3SJY=,iv:7+sUShSrv6iwBJUgT03l38Wg9yX4G1LeXpGgHlOuMnE=,tag:rgXF0E/BIfeyYwnAYYJBsQ==,type:str] + lastmodified: "2024-02-27T17:14:28Z" + mac: ENC[AES256_GCM,data:0OKbP4/zLiMI7KU0WNXfZ62uVKTKBsJJux36ULEI2nd4AEpp57r7hH4DdAcUW9lCB6ZSvXMNytOM2T5GPHDOvEjrne0tv+jMbrp1daBCM08FUDsbjt0tl2veU43wz9KYWe2AlvmwOZPna614fQVFGtaeu79TRu938p2Gz/BnElc=,iv://gX/mf4C/TtTgUKOg6M7m1y6b2mDTk8PjR9Zwusl9c=,tag:uYapGpdIRJfL26kjw3a8Vw==,type:str] pgp: [] encrypted_regex: ^(data|stringData)$ version: 3.8.1