Adding the first part of the monitoring stack.
This commit is contained in:
parent
8cc09e36ae
commit
fea9d8227d
19 changed files with 743 additions and 13 deletions
|
@ -0,0 +1,32 @@
|
||||||
|
---
# yaml-language-server: $schema=https://ks.hsn.dev/external-secrets.io/externalsecret_v1beta1.json
# Renders the `grafana-secret` Secret from the `grafana` and `cloudnative-pg`
# entries of the `onepassword-connect` ClusterSecretStore.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: grafana
spec:
  secretStoreRef:
    kind: ClusterSecretStore
    name: onepassword-connect
  target:
    name: grafana-secret
    template:
      engineVersion: v2
      data:
        # Grafana database connection settings.
        GF_DATABASE_NAME: &dbName grafana
        GF_DATABASE_HOST: postgres16-rw.database.svc.cluster.local:5432
        GF_DATABASE_USER: &dbUser '{{ .GRAFANA_POSTGRES_USER }}'
        GF_DATABASE_PASSWORD: &dbPass '{{ .GRAFANA_POSTGRES_PASS }}'
        GF_DATABASE_SSL_MODE: disable
        GF_DATABASE_TYPE: postgres
        GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET: '{{ .GRAFANA_OAUTH_CLIENT_SECRET }}'
        # INIT_POSTGRES_* reuse the database name, user and password above
        # through YAML aliases so both sets of keys cannot drift apart.
        INIT_POSTGRES_DBNAME: *dbName
        INIT_POSTGRES_HOST: postgres16-rw.database.svc.cluster.local
        INIT_POSTGRES_USER: *dbUser
        INIT_POSTGRES_PASS: *dbPass
        INIT_POSTGRES_SUPER_PASS: '{{ .POSTGRES_SUPER_PASS }}'
  # Store entries whose fields are exposed to the template above.
  dataFrom:
    - extract:
        key: grafana
    - extract:
        key: cloudnative-pg
|
|
@ -0,0 +1,86 @@
|
||||||
|
---
# yaml-language-server: $schema=https://ks.hsn.dev/postgres-operator.crunchydata.com/postgrescluster_v1beta1.json
# Two-replica PostgreSQL 16 cluster named after the ${APP} substitution, with
# pgBackRest backups stored under /${APP}/repo1 in the crunchy-postgres bucket.
apiVersion: postgres-operator.crunchydata.com/v1beta1
kind: PostgresCluster
metadata:
  name: '${APP}'
spec:
  postgresVersion: 16

  # Data source: the pgBackRest repository that is also used for backups below.
  dataSource:
    pgbackrest:
      stanza: db
      configuration:
        - secret:
            name: pgo-s3-creds
      global:
        repo1-path: '/${APP}/repo1'
        repo1-s3-uri-style: path
      repo:
        name: repo1
        s3:
          bucket: 'crunchy-postgres'
          endpoint: 's3.hsn.dev'
          region: 'us-east-1'

  patroni:
    dynamicConfiguration:
      synchronous_mode: true
      postgresql:
        # Quoted so the value stays the string "on" rather than a boolean.
        synchronous_commit: 'on'
        pg_hba:
          - hostnossl all all 10.32.0.0/16 md5
          - hostssl all all all md5

  instances:
    - name: postgres
      metadata:
        labels:
          app.kubernetes.io/name: pgo-${APP}
      replicas: 2
      dataVolumeClaimSpec:
        storageClassName: openebs-hostpath
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 5Gi
      # Keep the replicas on different nodes (hostname topology, maxSkew 1).
      topologySpreadConstraints:
        - maxSkew: 1
          topologyKey: 'kubernetes.io/hostname'
          whenUnsatisfiable: 'DoNotSchedule'
          labelSelector:
            matchLabels:
              postgres-operator.crunchydata.com/cluster: ${APP}
              postgres-operator.crunchydata.com/data: postgres

  users:
    - name: 'grafana'
      databases:
        - 'grafana'
      # NOTE(review): the application role is given SUPERUSER; confirm that
      # this level of privilege is intended.
      options: 'SUPERUSER'
      password:
        type: AlphaNumeric

  backups:
    pgbackrest:
      configuration:
        - secret:
            name: pgo-s3-creds
      global:
        archive-push-queue-max: 4GiB
        repo1-retention-full: '14'
        repo1-retention-full-type: time
        repo1-path: '/${APP}/repo1'
        repo1-s3-uri-style: path
      manual:
        repoName: repo1
        options:
          - --type=full
      metadata:
        labels:
          app.kubernetes.io/name: pgo-${APP}-backup
      repos:
        - name: repo1
          schedules:
            # Full backup on Sunday at 01:00, differential Monday-Saturday.
            full: '0 1 * * 0'
            differential: '0 1 * * 1-6'
          s3:
            bucket: 'crunchy-postgres'
            endpoint: 's3.hsn.dev'
            region: 'us-east-1'
|
27
kubernetes/apps/observability/grafana/ks.yaml
Normal file
27
kubernetes/apps/observability/grafana/ks.yaml
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
---
# yaml-language-server: $schema=https://ks.hsn.dev/kustomize.toolkit.fluxcd.io/kustomization_v1.json
# Flux Kustomization that applies the grafana app manifests into the
# observability namespace.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: &app grafana
  namespace: flux-system
spec:
  targetNamespace: observability
  commonMetadata:
    labels:
      app.kubernetes.io/name: *app
  # Reconciled only after the Postgres operator and the secret stores.
  dependsOn:
    - name: crunchy-postgres-operator
    - name: external-secrets-stores
  path: ./kubernetes/apps/observability/grafana/app
  prune: true
  sourceRef:
    kind: GitRepository
    name: homelab
  wait: false
  interval: 30m
  retryInterval: 1m
  timeout: 5m
  postBuild:
    substitute:
      # Makes ${APP} in the applied manifests resolve to the app name.
      APP: *app
|
|
@ -0,0 +1,22 @@
|
||||||
|
---
# yaml-language-server: $schema=https://ks.hsn.dev/external-secrets.io/externalsecret_v1beta1.json
# Renders `alertmanager-secret` by templating the `alertmanager.yaml` key of
# the `alertmanager-config-tpl` ConfigMap with the fields of the `pushover`
# store entry.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: alertmanager
spec:
  refreshInterval: 5m
  secretStoreRef:
    kind: ClusterSecretStore
    name: onepassword-connect
  target:
    name: alertmanager-secret
    template:
      templateFrom:
        - configMap:
            name: alertmanager-config-tpl
            items:
              - key: alertmanager.yaml
  dataFrom:
    - extract:
        key: pushover
|
|
@ -0,0 +1,203 @@
|
||||||
|
---
# yaml-language-server: $schema=https://ks.hsn.dev/helm.toolkit.fluxcd.io/helmrelease_v2beta2.json
# kube-prometheus-stack release: Alertmanager and Prometheus (each with two
# replicas), control-plane scraping, node-exporter and kube-state-metrics.
# The chart's bundled Grafana is disabled; only its dashboards are deployed.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
spec:
  interval: 30m
  timeout: 15m
  chart:
    spec:
      chart: kube-prometheus-stack
      version: 56.13.0
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        namespace: flux-system
  install:
    crds: CreateReplace
    remediation:
      retries: 3
  upgrade:
    cleanupOnFail: true
    crds: CreateReplace
    remediation:
      retries: 3
  uninstall:
    keepHistory: false
  dependsOn:
    - name: openebs
      namespace: openebs-system
    - name: thanos
      namespace: observability
  values:
    crds:
      enabled: true
    cleanPrometheusOperatorObjectNames: true

    alertmanager:
      ingress:
        enabled: true
        pathType: Prefix
        ingressClassName: internal-nginx
        hosts:
          - &host alertmanager.jahanson.tech
        tls:
          - hosts:
              - *host
      alertmanagerSpec:
        replicas: 2
        # Configuration comes from the Secret rendered by the alertmanager
        # ExternalSecret rather than from chart values.
        useExistingSecret: true
        configSecret: alertmanager-secret
        storage:
          volumeClaimTemplate:
            spec:
              storageClassName: openebs-hostpath
              resources:
                requests:
                  storage: 1Gi

    kubelet:
      enabled: true
      serviceMonitor:
        metricRelabelings:
          # Drop high cardinality labels
          - action: labeldrop
            regex: (uid)
          - action: labeldrop
            regex: (id|name)
          - action: drop
            sourceLabels: ["__name__"]
            regex: (rest_client_request_duration_seconds_bucket|rest_client_request_duration_seconds_sum|rest_client_request_duration_seconds_count)

    kubeApiServer:
      enabled: true
      serviceMonitor:
        metricRelabelings:
          # Drop high cardinality labels
          - action: drop
            sourceLabels: ["__name__"]
            regex: (apiserver|etcd|rest_client)_request(|_sli|_slo)_duration_seconds_bucket
          - action: drop
            sourceLabels: ["__name__"]
            regex: (apiserver_response_sizes_bucket|apiserver_watch_events_sizes_bucket)

    # The same endpoint list is shared by the controller manager, etcd and
    # scheduler scrapes through the &cp anchor.
    kubeControllerManager:
      enabled: true
      endpoints: &cp
        - 192.168.1.61
        - 192.168.1.62
        - 192.168.1.63
    kubeEtcd:
      enabled: true
      endpoints: *cp
    kubeScheduler:
      enabled: true
      endpoints: *cp
    kubeProxy:
      enabled: false

    prometheus:
      ingress:
        enabled: true
        ingressClassName: internal-nginx
        pathType: Prefix
        hosts:
          # Re-defines &host; aliases below this point resolve to this value.
          - &host prometheus.jahanson.tech
        tls:
          - hosts:
              - *host
      thanosService:
        enabled: true
      thanosServiceMonitor:
        enabled: true
      thanosServiceExternal:
        enabled: true
        type: LoadBalancer
        annotations:
          external-dns.alpha.kubernetes.io/hostname: thanos.jahanson.tech
          io.cilium/lb-ipam-ips: 10.45.0.6
        externalTrafficPolicy: Cluster
      prometheusSpec:
        replicas: 2
        replicaExternalLabelName: __replica__
        # Selectors are not restricted to objects labelled by this release.
        ruleSelectorNilUsesHelmValues: false
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        probeSelectorNilUsesHelmValues: false
        scrapeConfigSelectorNilUsesHelmValues: false
        enableAdminAPI: true
        walCompression: true
        enableFeatures:
          - auto-gomaxprocs
          - memory-snapshot-on-shutdown
          - new-service-discovery-manager
        thanos:
          image: quay.io/thanos/thanos:${THANOS_VERSION}
          # Same version as the image tag with the leading "v" stripped.
          version: "${THANOS_VERSION#v}"
          objectStorageConfig:
            existingSecret:
              # Must name the Secret rendered by the thanos ExternalSecret
              # (thanos-s3-secret, key objstore.yml). The previous value,
              # thanos-objstore-secret, is not created by anything in this
              # change: the thanos HelmRelease sets existingObjstoreSecret,
              # so the chart does not generate its own objstore secret.
              name: thanos-s3-secret
              key: objstore.yml
        retention: 2d
        retentionSize: 15GB
        externalLabels:
          cluster: main
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: openebs-hostpath
              resources:
                requests:
                  storage: 20Gi

    nodeExporter:
      enabled: true
    prometheus-node-exporter:
      fullnameOverride: node-exporter
      prometheus:
        monitor:
          enabled: true
          relabelings:
            # Copy the pod's node name into the kubernetes_node label.
            - action: replace
              regex: (.*)
              replacement: $1
              sourceLabels:
                - __meta_kubernetes_pod_node_name
              targetLabel: kubernetes_node

    kubeStateMetrics:
      enabled: true
    kube-state-metrics:
      fullnameOverride: kube-state-metrics
      metricLabelsAllowlist:
        - pods=[*]
        - deployments=[*]
        - persistentvolumeclaims=[*]
      prometheus:
        monitor:
          enabled: true
          relabelings:
            # Copy the pod's node name into the kubernetes_node label.
            - action: replace
              regex: (.*)
              replacement: $1
              sourceLabels:
                - __meta_kubernetes_pod_node_name
              targetLabel: kubernetes_node

    grafana:
      enabled: false
      forceDeployDashboards: true
      sidecar:
        dashboards:
          multicluster:
            etcd:
              enabled: true

  # Adds a grafana_folder annotation to every rendered dashboard ConfigMap
  # (those matching the label selector grafana_dashboard in (1)).
  postRenderers:
    - kustomize:
        patches:
          - target:
              version: v1
              kind: ConfigMap
              labelSelector: grafana_dashboard in (1)
            patch: |-
              apiVersion: v1
              kind: ConfigMap
              metadata:
                name: not-used
                namespace: not-used
                annotations:
                  grafana_folder: Kubernetes
|
|
@ -0,0 +1,15 @@
|
||||||
|
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# App-level kustomization for kube-prometheus-stack; it also generates the
# ConfigMap that holds the Alertmanager configuration template.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ./externalsecret.yaml
  - ./helmrelease.yaml
  - ./prometheusrules
  - ./scrapeconfigs
configMapGenerator:
  - name: alertmanager-config-tpl
    files:
      - alertmanager.yaml=./resources/alertmanager.yaml
generatorOptions:
  # Keep the generated name stable so it can be referenced by its plain name.
  disableNameSuffixHash: true
|
|
@ -0,0 +1,6 @@
|
||||||
|
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Lists the PrometheusRule manifests in this directory.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ./prometheusrule.yaml
|
|
@ -0,0 +1,37 @@
|
||||||
|
---
# yaml-language-server: $schema=https://ks.hsn.dev/monitoring.coreos.com/prometheusrule_v1.json
# Miscellaneous critical alerts: Docker Hub rate-limit risk, OOM-killed
# containers and ZFS pools that are not online.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: miscellaneous-rules
  labels:
    prometheus: k8s
    role: alert-rules
spec:
  groups:
    - name: dockerhub
      rules:
        # Fires when more than 100 docker.io containers were seen within the
        # last 30 seconds, sustained for 15 minutes.
        - alert: BootstrapRateLimitRisk
          annotations:
            summary: Kubernetes cluster at risk of being rate limited by dockerhub on bootstrap
          expr: count(time() - container_last_seen{image=~"(docker.io).*",container!=""} < 30) > 100
          for: 15m
          labels:
            severity: critical

    - name: oom
      rules:
        # Fires when a container restarted in the last 10 minutes and its
        # last termination reason over that window was OOMKilled.
        - alert: OOMKilled
          annotations:
            summary: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
          expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
          labels:
            severity: critical

    - name: zfs
      rules:
        # Fires when a pool reports any state other than "online" for 15m.
        - alert: ZfsUnexpectedPoolState
          annotations:
            summary: ZFS pool {{$labels.zpool}} on {{$labels.instance}} is in a unexpected state {{$labels.state}}
          expr: node_zfs_zpool_state{state!="online"} > 0
          for: 15m
          labels:
            severity: critical
|
|
@ -0,0 +1,68 @@
|
||||||
|
---
# Alertmanager configuration template. Single-brace placeholders such as
# {{ .alertmanager_token }} are filled in when the template is rendered;
# sequences written as {{ "{{" }} ... {{ "}}" }} emit literal braces so the
# inner expressions survive rendering and reach Alertmanager.
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'job']
  group_interval: 10m
  group_wait: 1m
  receiver: pushover
  repeat_interval: 12h
  routes:
    # Watchdog alerts go to the heartbeat webhook every 5 minutes.
    - receiver: heartbeat
      group_interval: 5m
      group_wait: 0s
      matchers:
        - alertname =~ "Watchdog"
      repeat_interval: 5m
    # InfoInhibitor alerts go to the receiver that has no integrations.
    - receiver: 'null'
      matchers:
        - alertname =~ "InfoInhibitor"
    - receiver: pushover
      continue: true
      matchers:
        - severity = "critical"

inhibit_rules:
  # A critical alert mutes warnings with the same alertname and namespace.
  - equal: ['alertname', 'namespace']
    source_matchers:
      - severity = "critical"
    target_matchers:
      - severity = "warning"

receivers:
  - name: heartbeat
    webhook_configs:
      - send_resolved: true
        url: '{{ .alertmanager_heartbeat_url }}'
  - name: 'null'
  - name: pushover
    pushover_configs:
      - html: true
        # Compooters are hard
        message: |-
          {{ "{{-" }} range .Alerts {{ "}}" }}
          {{ "{{-" }} if ne .Annotations.description "" {{ "}}" }}
          {{ "{{" }} .Annotations.description {{ "}}" }}
          {{ "{{-" }} else if ne .Annotations.summary "" {{ "}}" }}
          {{ "{{" }} .Annotations.summary {{ "}}" }}
          {{ "{{-" }} else if ne .Annotations.message "" {{ "}}" }}
          {{ "{{" }} .Annotations.message {{ "}}" }}
          {{ "{{-" }} else {{ "}}" }}
          Alert description not available
          {{ "{{-" }} end {{ "}}" }}
          {{ "{{-" }} if gt (len .Labels.SortedPairs) 0 {{ "}}" }}
          <small>
          {{ "{{-" }} range .Labels.SortedPairs {{ "}}" }}
          <b>{{ "{{" }} .Name {{ "}}" }}:</b> {{ "{{" }} .Value {{ "}}" }}
          {{ "{{-" }} end {{ "}}" }}
          </small>
          {{ "{{-" }} end {{ "}}" }}
          {{ "{{-" }} end {{ "}}" }}
        # Priority 1 while firing, 0 otherwise.
        priority: |-
          {{ "{{" }} if eq .Status "firing" {{ "}}" }}1{{ "{{" }} else {{ "}}" }}0{{ "{{" }} end {{ "}}" }}
        send_resolved: true
        sound: gamelan
        title: >-
          {{ "{{" }} .CommonLabels.alertname {{ "}}" }}
          [{{ "{{" }} .Status | toUpper {{ "}}" }}{{ "{{" }} if eq .Status "firing" {{ "}}" }}:{{ "{{" }} .Alerts.Firing | len {{ "}}" }}{{ "{{" }} end {{ "}}" }}]
        token: '{{ .alertmanager_token }}'
        url_title: View in Alertmanager
        user_key: '{{ .userkey_jahanson }}'
|
|
@ -0,0 +1,6 @@
|
||||||
|
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Lists the ScrapeConfig manifests in this directory.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ./node-exporter.yaml
|
|
@ -0,0 +1,11 @@
|
||||||
|
---
# yaml-language-server: $schema=https://ks.hsn.dev/monitoring.coreos.com/scrapeconfig_v1alpha1.json
# Static scrape target for a node-exporter endpoint at 10.1.1.1:9100.
apiVersion: monitoring.coreos.com/v1alpha1
kind: ScrapeConfig
metadata:
  name: node-exporter
spec:
  staticConfigs:
    - targets:
        - '10.1.1.1:9100'
  metricsPath: /metrics
|
27
kubernetes/apps/observability/kube-prometheus-stack/ks.yaml
Normal file
27
kubernetes/apps/observability/kube-prometheus-stack/ks.yaml
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
---
# yaml-language-server: $schema=https://ks.hsn.dev/kustomize.toolkit.fluxcd.io/kustomization_v1.json
# Flux Kustomization that applies the kube-prometheus-stack app manifests
# into the observability namespace.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: &app kube-prometheus-stack
  namespace: flux-system
spec:
  targetNamespace: observability
  commonMetadata:
    labels:
      app.kubernetes.io/name: *app
  dependsOn:
    - name: external-secrets-stores
  path: ./kubernetes/apps/observability/kube-prometheus-stack/app
  prune: true
  sourceRef:
    kind: GitRepository
    name: homelab
  wait: false
  interval: 30m
  retryInterval: 1m
  timeout: 15m
  postBuild:
    substitute:
      # Consumed as ${THANOS_VERSION} by the HelmRelease in the app directory.
      # renovate: datasource=docker depName=quay.io/thanos/thanos
      THANOS_VERSION: v0.34.1
|
17
kubernetes/apps/observability/kustomization.yaml
Normal file
17
kubernetes/apps/observability/kustomization.yaml
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Entry point for the observability namespace. Commented-out entries are apps
# that are not enabled yet.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  # Pre Flux-Kustomizations
  - ./namespace.yaml
  # Flux-Kustomizations
  # - ./gatus/ks.yaml
  # - ./grafana/ks.yaml
  - ./kube-prometheus-stack/ks.yaml
  # - ./loki/ks.yaml
  # - ./smartctl-exporter/ks.yaml
  # - ./snmp-exporter/ks.yaml
  # NOTE(review): thanos/ks.yaml is shown as a 0-line file in this change;
  # confirm it defines the Flux Kustomization for the thanos app.
  - ./thanos/ks.yaml
  # - ./unpoller/ks.yaml
  # - ./vector/ks.yaml
|
8
kubernetes/apps/observability/namespace.yaml
Normal file
8
kubernetes/apps/observability/namespace.yaml
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
---
# Namespace that holds the monitoring stack.
apiVersion: v1
kind: Namespace
metadata:
  name: observability
  labels:
    # Excludes the namespace from Flux garbage collection.
    kustomize.toolkit.fluxcd.io/prune: disabled
    pgo-enabled-hsn.dev: 'true'
|
31
kubernetes/apps/observability/thanos/app/externalsecret.yaml
Normal file
31
kubernetes/apps/observability/thanos/app/externalsecret.yaml
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
---
# yaml-language-server: $schema=https://ks.hsn.dev/external-secrets.io/externalsecret_v1beta1.json
# Renders `thanos-s3-secret`, whose `objstore.yml` key holds the Thanos
# object-storage configuration built from the `Minio` store entry.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: thanos
  # Was `monitoring`. Every other resource in the stack lives in, or
  # references, the observability namespace, and no `monitoring` namespace is
  # created alongside it.
  namespace: observability
spec:
  secretStoreRef:
    kind: ClusterSecretStore
    name: onepassword-connect
  target:
    name: thanos-s3-secret
    creationPolicy: Owner
    template:
      engineVersion: v2
      data:
        # Values are passed through `quote` so that a value with YAML-special
        # characters, or an empty value, still renders as a string.
        objstore.yml: |-
          type: s3
          config:
            access_key: {{ .s3_thanos_access_key | quote }}
            bucket: {{ .s3_thanos_bucket_name | quote }}
            endpoint: {{ .s3_homelab_endpoint | quote }}
            secret_key: {{ .s3_thanos_secret_key | quote }}
  dataFrom:
    - extract:
        key: Minio
      # Prefix every extracted field with `s3_`; the template above relies on
      # these prefixed names.
      rewrite:
        - regexp:
            source: "(.*)"
            target: "s3_$1"
|
132
kubernetes/apps/observability/thanos/app/helmrelease.yaml
Normal file
132
kubernetes/apps/observability/thanos/app/helmrelease.yaml
Normal file
|
@ -0,0 +1,132 @@
|
||||||
|
---
# yaml-language-server: $schema=https://ks.hsn.dev/helm.toolkit.fluxcd.io/helmrelease_v2beta2.json
# Thanos release (bitnami chart): query, query-frontend, bucketweb, compactor,
# store gateway and ruler. Receive is disabled and all network policies are
# turned off.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: thanos
spec:
  interval: 30m
  timeout: 15m
  chart:
    spec:
      chart: thanos
      version: 13.2.2
      sourceRef:
        kind: HelmRepository
        name: bitnami
        namespace: flux-system
  install:
    remediation:
      retries: 3
  upgrade:
    cleanupOnFail: true
    remediation:
      retries: 3
  uninstall:
    keepHistory: false
  dependsOn:
    - name: openebs
      namespace: openebs-system
    - name: dragonfly-operator
      namespace: dragonfly-operator-system
    - name: rook-ceph-cluster
      namespace: rook-ceph
  values:
    # Object-storage configuration comes from the Secret rendered by the
    # thanos ExternalSecret.
    existingObjstoreSecret: thanos-s3-secret
    image:
      registry: quay.io
      repository: thanos/thanos
      tag: v0.34.1
    # NOTE(review): presumably ignored by the chart while
    # existingObjstoreSecret is set, in which case `insecure: true` never
    # reaches Thanos. Verify, and move it into the ExternalSecret template
    # if it is needed.
    objstoreConfig:
      type: s3
      config:
        insecure: true

    receive:
      enabled: false
      networkPolicy:
        enabled: false

    queryFrontend:
      enabled: true
      replicaCount: 2
      # Redis cache configuration, shared with the store gateway through the
      # &config anchor. The address is a single host:port. The previous value
      # ended in a comma, which produces an empty second address when the
      # list is split on commas.
      config: &config |-
        type: REDIS
        config:
          addr: dragonfly.database.svc.cluster.local:6379
          db: 13
      ingress:
        enabled: true
        ingressClassName: internal-nginx
        hostname: &host thanos-query-frontend.jahanson.tech
        tls: true
        extraTls:
          - hosts:
              - *host
      networkPolicy:
        enabled: false

    query:
      enabled: true
      replicaCount: 2
      replicaLabel: ["__replica__"]
      dnsDiscovery:
        sidecarsService: kube-prometheus-stack-thanos-discovery
        sidecarsNamespace: observability
      stores: ["thanos.jahanson.tech:10901"]
      networkPolicy:
        enabled: false

    bucketweb:
      enabled: true
      replicaCount: 2
      networkPolicy:
        enabled: false

    compactor:
      enabled: true
      extraFlags:
        - --compact.concurrency=4
        - --delete-delay=30m
      retentionResolutionRaw: 14d
      retentionResolution5m: 30d
      retentionResolution1h: 60d
      persistence:
        enabled: true
        storageClass: openebs-hostpath
        size: 10Gi
      networkPolicy:
        enabled: false

    storegateway:
      enabled: true
      replicaCount: 2
      config: *config
      persistence:
        enabled: true
        storageClass: openebs-hostpath
        size: 10Gi
      networkPolicy:
        enabled: false

    ruler:
      enabled: true
      replicaCount: 2
      replicaLabel: __replica__
      alertmanagers: ["http://alertmanager-operated.observability.svc.cluster.local:9093"]
      extraFlags: ["--web.prefix-header=X-Forwarded-Prefix"]
      # Rule evaluated by the ruler: alert when no `up` series exists for the
      # kube-prometheus-stack-prometheus job for 5 minutes.
      config: |-
        groups:
          - name: PrometheusWatcher
            rules:
              - alert: PrometheusDown
                annotations:
                  summary: A Prometheus has disappeared from Prometheus target discovery
                expr: absent(up{job="kube-prometheus-stack-prometheus"})
                for: 5m
                labels:
                  severity: critical
      persistence:
        enabled: true
        storageClass: openebs-hostpath
        size: 10Gi
      networkPolicy:
        enabled: false

    metrics:
      enabled: true
      serviceMonitor:
        enabled: true
|
|
@ -0,0 +1,7 @@
|
||||||
|
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# App-level kustomization for thanos.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ./externalsecret.yaml
  - ./helmrelease.yaml
|
0
kubernetes/apps/observability/thanos/ks.yaml
Normal file
0
kubernetes/apps/observability/thanos/ks.yaml
Normal file
|
@ -4,12 +4,7 @@ metadata:
|
||||||
name: cluster-secrets
|
name: cluster-secrets
|
||||||
namespace: flux-system
|
namespace: flux-system
|
||||||
stringData:
|
stringData:
|
||||||
SECRET_PUSHOVER_USERKEY: ENC[AES256_GCM,data:HknjiEQXIa1zntN4yOlTQ/buKx2xppiQV7faAxIe,iv:A9sMptT1QcgQvuP8jqPUZDjqTa56kbsLBjITQvPQyF8=,tag:Sa5PIweT7OYuoq5YG43rpA==,type:str]
|
CLUSTER_SECRET_CLOUDFLARE_ACCOUNT_ID: ENC[AES256_GCM,data:bQvXy9wHJcVKCa9xb89Ji2VSBmsxPKuEXIG/+KiclmM=,iv:63JdSorOBh2uz98ajzdtydSbJH3wKEaX5fRP3LX8g9Q=,tag:NH7Y6EoWaEGVal7E0XHg0w==,type:str]
|
||||||
SECRET_PUSHOVER_ALERT_MANAGER_APIKEY: ENC[AES256_GCM,data:n0cFsAwCX1/y5HhsNxr/c2KT/5dzt55Ygi17rX+OV7cwKPKMImmLinb6GhD9fDIz1AINGBijXuXvD8TL,iv:4nwdHlSJEUSyMEDvh+5mhONXCGTJ3qyTITwG6CxeG3A=,tag:kurCrF2rGQFBF2u7Hhinuw==,type:str]
|
|
||||||
SECRET_HEALTHCHECKS_WEBHOOK: ENC[AES256_GCM,data:YG8/g4i8inIQnCIsQyEkPdNyVmbFYU4bhixacOEEEcuJMl8ax8TH1yBRl5ziQmBggp/CETorWCmNiC3jkUXYYta/znlo76T5,iv:SGdg9htpyFP38jbAJDg+zq4Rs+axgM5m3SsgBG38Bu8=,tag:TTIVFki9e03rqVvNmtsFuw==,type:str]
|
|
||||||
SECRET_CLOUDFLARE_ACCOUNT_ID: ENC[AES256_GCM,data:bKGSKh/TxNtCMRa83/i44fX7XC5mRxBLVeZ94UltjOo=,iv:Ji0tUnrvDywxMeCvNwBrG/a8JVudfK4sXYL8q0i/cz8=,tag:j4Bwvcz73RdIInsiz0F0JA==,type:str]
|
|
||||||
SECRET_CLUSTER_CLOUDFLARE_TUNNEL_ID: ENC[AES256_GCM,data:bl9psiIxkDTchopNuPNxaGy7fQWJLdZwfnqTi8AOSl5cFMAZ,iv:CKYrQHv8fiHU4312Wfo6XlMofiR6uWP+AafO1n1y970=,tag:iyceSr/VUtE2cNbndkmV1g==,type:str]
|
|
||||||
K8S_SERVICE_ENDPOINT: ENC[AES256_GCM,data:3s9EeJwFzDQ=,iv:a4oU9bf7ESscw6o9YqhBx8kRm/rL1l2ydjjd1ngn/P0=,tag:TAwJ2UmFuEHeHsEhfiVH9g==,type:str]
|
|
||||||
sops:
|
sops:
|
||||||
kms: []
|
kms: []
|
||||||
gcp_kms: []
|
gcp_kms: []
|
||||||
|
@ -19,14 +14,14 @@ sops:
|
||||||
- recipient: age1eqlaq205y5jre9hu5hvulywa7w3d4qyxwmafneamxcn7nejesedsf4q9g6
|
- recipient: age1eqlaq205y5jre9hu5hvulywa7w3d4qyxwmafneamxcn7nejesedsf4q9g6
|
||||||
enc: |
|
enc: |
|
||||||
-----BEGIN AGE ENCRYPTED FILE-----
|
-----BEGIN AGE ENCRYPTED FILE-----
|
||||||
YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBwSC9CNFkwMHVLd0dWb0Jq
|
YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBUdHVVdXUyMUlYc01Va25F
|
||||||
cnN0OUJzYVlYV2VRS3p2ek5UcHl4TXNQckhjCnlHQTVNNmdyZFF6RXhETlBzSW9v
|
aXg2YWVDdnQwQnRGMWE4SEJtUnNka216YkVRCks5SUJBMzIxY25PWXQzSlBybkdL
|
||||||
S00ra2k2Y0VyWnJjcU9oWG5XVGJDQkkKLS0tIHB2bGxDOWhWci81aGViVFlsL0JE
|
Smwxc1hscTlNdzkzUWVPaXBYNkg5RWsKLS0tIGg0UHU3NGlpR1I5RjAvK1NvS3hl
|
||||||
ZGRUUFpKTXpjWW9HQ0R1VDk2RmVmQ2MKJwHW3q0vCZClJFfDrWSLw6C43vWVfyLr
|
K3J3NTZHQlhIOEt6YnZ6QU5QZ0JLT3MKYyy736Q4oXmaryf+JLlgEoK64iGDlUDg
|
||||||
1ACvmNWml+xv/MOQwoRRMx6OVF74X83UyTFdVrXXk7SkzRcwQr4j+A==
|
JbdxbEfCPh3xbuTAff5oU0LxX9XVsoKBO/8+ew6+P/8bcjeb9sNCEg==
|
||||||
-----END AGE ENCRYPTED FILE-----
|
-----END AGE ENCRYPTED FILE-----
|
||||||
lastmodified: "2024-01-12T19:24:10Z"
|
lastmodified: "2024-02-27T17:14:28Z"
|
||||||
mac: ENC[AES256_GCM,data:EdmF3LFSmBFe6Vn5LzVmOb6tyOYto4iwIfJlUL50pjIobvw073oTwd99NkZ9m6aXB2no6ghgPc2RU8jOAtK9gg71kvLOGP45VZ07zLbcxsM8iEkSp2UX2k07/WavdXXGY4yBswGCZgnuPKah6uVNs1s8zEQNCkQQu0D1Ukf3SJY=,iv:7+sUShSrv6iwBJUgT03l38Wg9yX4G1LeXpGgHlOuMnE=,tag:rgXF0E/BIfeyYwnAYYJBsQ==,type:str]
|
mac: ENC[AES256_GCM,data:0OKbP4/zLiMI7KU0WNXfZ62uVKTKBsJJux36ULEI2nd4AEpp57r7hH4DdAcUW9lCB6ZSvXMNytOM2T5GPHDOvEjrne0tv+jMbrp1daBCM08FUDsbjt0tl2veU43wz9KYWe2AlvmwOZPna614fQVFGtaeu79TRu938p2Gz/BnElc=,iv://gX/mf4C/TtTgUKOg6M7m1y6b2mDTk8PjR9Zwusl9c=,tag:uYapGpdIRJfL26kjw3a8Vw==,type:str]
|
||||||
pgp: []
|
pgp: []
|
||||||
encrypted_regex: ^(data|stringData)$
|
encrypted_regex: ^(data|stringData)$
|
||||||
version: 3.8.1
|
version: 3.8.1
|
||||||
|
|
Loading…
Reference in a new issue