From d25d93469dbbb17541aa658185178174b3161d1e Mon Sep 17 00:00:00 2001 From: Joseph Hanson Date: Sun, 13 Aug 2023 15:10:15 +0000 Subject: [PATCH] k3s bootstrap ready --- .ansible-lint | 8 + .editorconfig | 23 +++ .envrc | 3 + .gitattributes | 4 + .github/linters/.flake8 | 2 + .github/linters/.markdownlint.yaml | 23 +++ .github/linters/.yamllint.yaml | 29 +++ .gitignore | 13 ++ .pre-commit-config.yaml | 53 +++++ .taskfiles/PreCommit/Tasks.yaml | 16 ++ .taskfiles/VolSync/ListJob.tmpl.yaml | 19 ++ .../VolSync/ReplicationDestination.tmpl.yaml | 25 +++ .taskfiles/VolSync/Tasks.yaml | 158 +++++++++++++++ .taskfiles/VolSync/UnlockJob.tmpl.yaml | 19 ++ .taskfiles/VolSync/WipeJob.tmpl.yaml | 25 +++ .taskfiles/VolSync/wait-for-job.sh | 14 ++ .vscode/settings.json | 49 +++++ README.md | 1 + Taskfile.yaml | 117 +++++++++++ ansible/kubernetes/.envrc | 8 + .../inventory/group_vars/all/main.yaml | 24 +++ .../group_vars/all/supplemental.yaml | 3 + .../inventory/group_vars/master/main.yaml | 29 +++ .../inventory/group_vars/worker/main.yaml | 4 + ansible/kubernetes/inventory/hosts.yaml | 28 +++ ansible/kubernetes/playbooks/ceph-reset.yaml | 39 ++++ .../playbooks/cluster-installation.yaml | 69 +++++++ .../kubernetes/playbooks/cluster-nuke.yaml | 30 +++ .../kubernetes/playbooks/cluster-prepare.yaml | 184 ++++++++++++++++++ .../playbooks/cluster-update-rollout.yaml | 75 +++++++ .../playbooks/files/stale-containers.service | 6 + .../playbooks/files/stale-containers.timer | 11 ++ .../kubernetes/playbooks/tasks/cilium.yaml | 56 ++++++ .../kubernetes/playbooks/tasks/coredns.yaml | 56 ++++++ ansible/kubernetes/playbooks/tasks/cruft.yaml | 32 +++ .../playbooks/tasks/stale_containers.yaml | 36 ++++ .../templates/custom-cilium-helmchart.yaml.j2 | 52 +++++ .../templates/custom-cilium-l2.yaml.j2 | 21 ++ .../custom-coredns-helmchart.yaml.j2 | 77 ++++++++ requirements.txt | 7 + requirements.yaml | 17 ++ 41 files changed, 1465 insertions(+) create mode 100644 .ansible-lint create mode 100644 .editorconfig create mode 100644 .envrc create mode 100644 .gitattributes create mode 100644 .github/linters/.flake8 create mode 100644 .github/linters/.markdownlint.yaml create mode 100644 .github/linters/.yamllint.yaml create mode 100644 .gitignore create mode 100644 .pre-commit-config.yaml create mode 100644 .taskfiles/PreCommit/Tasks.yaml create mode 100644 .taskfiles/VolSync/ListJob.tmpl.yaml create mode 100644 .taskfiles/VolSync/ReplicationDestination.tmpl.yaml create mode 100644 .taskfiles/VolSync/Tasks.yaml create mode 100644 .taskfiles/VolSync/UnlockJob.tmpl.yaml create mode 100644 .taskfiles/VolSync/WipeJob.tmpl.yaml create mode 100644 .taskfiles/VolSync/wait-for-job.sh create mode 100644 .vscode/settings.json create mode 100644 README.md create mode 100644 Taskfile.yaml create mode 100644 ansible/kubernetes/.envrc create mode 100644 ansible/kubernetes/inventory/group_vars/all/main.yaml create mode 100644 ansible/kubernetes/inventory/group_vars/all/supplemental.yaml create mode 100644 ansible/kubernetes/inventory/group_vars/master/main.yaml create mode 100644 ansible/kubernetes/inventory/group_vars/worker/main.yaml create mode 100644 ansible/kubernetes/inventory/hosts.yaml create mode 100644 ansible/kubernetes/playbooks/ceph-reset.yaml create mode 100644 ansible/kubernetes/playbooks/cluster-installation.yaml create mode 100644 ansible/kubernetes/playbooks/cluster-nuke.yaml create mode 100644 ansible/kubernetes/playbooks/cluster-prepare.yaml create mode 100644 ansible/kubernetes/playbooks/cluster-update-rollout.yaml create mode 
100644 ansible/kubernetes/playbooks/files/stale-containers.service create mode 100644 ansible/kubernetes/playbooks/files/stale-containers.timer create mode 100644 ansible/kubernetes/playbooks/tasks/cilium.yaml create mode 100644 ansible/kubernetes/playbooks/tasks/coredns.yaml create mode 100644 ansible/kubernetes/playbooks/tasks/cruft.yaml create mode 100644 ansible/kubernetes/playbooks/tasks/stale_containers.yaml create mode 100644 ansible/kubernetes/playbooks/templates/custom-cilium-helmchart.yaml.j2 create mode 100644 ansible/kubernetes/playbooks/templates/custom-cilium-l2.yaml.j2 create mode 100644 ansible/kubernetes/playbooks/templates/custom-coredns-helmchart.yaml.j2 create mode 100644 requirements.txt create mode 100644 requirements.yaml diff --git a/.ansible-lint b/.ansible-lint new file mode 100644 index 0000000..59c41dc --- /dev/null +++ b/.ansible-lint @@ -0,0 +1,8 @@ +skip_list: + - yaml[line-length] + - var-naming +warn_list: + - command-instead-of-shell + - deprecated-command-syntax + - experimental + - no-changed-when diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..6e40cb6 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,23 @@ +; https://editorconfig.org/ + +root = true + +[*] +indent_style = space +indent_size = 2 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[{Makefile,go.mod,go.sum,*.go,.gitmodules}] +indent_style = tab +indent_size = 4 + +[*.md] +indent_size = 4 +trim_trailing_whitespace = false + +[{Dockerfile,*.bash,*.sh}] +indent_style = space +indent_size = 4 diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..49bcb30 --- /dev/null +++ b/.envrc @@ -0,0 +1,3 @@ +#shellcheck disable=SC2148,SC2155 +export KUBECONFIG="$(expand_path ./kubeconfig)" +export SOPS_AGE_KEY_FILE="$(expand_path ./age.key)" diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..8d1628f --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +* text=auto eol=lf +*.sops.* diff=sopsdiffer +*.sops.toml linguist-language=JSON +*.yaml.j2 linguist-language=YAML diff --git a/.github/linters/.flake8 b/.github/linters/.flake8 new file mode 100644 index 0000000..6deafc2 --- /dev/null +++ b/.github/linters/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 120 diff --git a/.github/linters/.markdownlint.yaml b/.github/linters/.markdownlint.yaml new file mode 100644 index 0000000..3443fa7 --- /dev/null +++ b/.github/linters/.markdownlint.yaml @@ -0,0 +1,23 @@ +--- +default: true + +# MD013/line-length - Line length +MD013: + # Number of characters + line_length: 240 + # Number of characters for headings + heading_line_length: 80 + # Number of characters for code blocks + code_block_line_length: 80 + # Include code blocks + code_blocks: true + # Include tables + tables: true + # Include headings + headings: true + # Include headings + headers: true + # Strict length checking + strict: false + # Stern length checking + stern: false diff --git a/.github/linters/.yamllint.yaml b/.github/linters/.yamllint.yaml new file mode 100644 index 0000000..bb7b058 --- /dev/null +++ b/.github/linters/.yamllint.yaml @@ -0,0 +1,29 @@ +--- +ignore: | + .ansible/ + .direnv/ + .private/ + .vscode/ + *.sops.* + ansible/roles/xanmanning.k3s/ + +extends: default + +rules: + truthy: + allowed-values: ["true", "false", "on"] + + comments: + min-spaces-from-content: 1 + + line-length: disable + + braces: + min-spaces-inside: 0 + max-spaces-inside: 1 + + brackets: + min-spaces-inside: 0 + max-spaces-inside: 0 + + 
indentation: enable diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b5d766a --- /dev/null +++ b/.gitignore @@ -0,0 +1,13 @@ +.DS_Store +Thumbs.db +.private/ +.venv/ +.terraform +*.tfvars +.decrypted~* +*.agekey +*.pub +*.key +*.pem +kubeconfig* +config.xml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..dadd36a --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,53 @@ +--- +fail_fast: false + +exclude: | + (?x)^( + docs/_assets/.* + | .*\.sops\.toml + )$ + +repos: + - repo: https://github.com/adrienverge/yamllint + rev: v1.32.0 + hooks: + - id: yamllint + args: + - -c + - ".github/linters/.yamllint.yaml" + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: fix-byte-order-marker + - id: mixed-line-ending + - id: check-added-large-files + args: [--maxkb=2048] + - id: check-merge-conflict + - id: check-executables-have-shebangs + + - repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.5.3 + hooks: + - id: forbid-crlf + - id: forbid-tabs + + - repo: https://github.com/sirosen/fix-smartquotes + rev: 0.2.0 + hooks: + - id: fix-smartquotes + + - repo: https://github.com/jumanjihouse/pre-commit-hooks + rev: 3.0.0 + hooks: + - id: shellcheck + language: script + args: [--severity=error] + additional_dependencies: [] + + - repo: https://github.com/k8s-at-home/sops-pre-commit + rev: v2.1.1 + hooks: + - id: forbid-secrets diff --git a/.taskfiles/PreCommit/Tasks.yaml b/.taskfiles/PreCommit/Tasks.yaml new file mode 100644 index 0000000..e708a01 --- /dev/null +++ b/.taskfiles/PreCommit/Tasks.yaml @@ -0,0 +1,16 @@ +--- +version: "3" + +tasks: + init: + desc: Initialize pre-commit hooks + cmds: + - pre-commit install --install-hooks + run: + desc: Run pre-commit + cmds: + - pre-commit run --all-files + update: + desc: Update pre-commit hooks + cmds: + - pre-commit autoupdate diff --git a/.taskfiles/VolSync/ListJob.tmpl.yaml b/.taskfiles/VolSync/ListJob.tmpl.yaml new file mode 100644 index 0000000..0d63998 --- /dev/null +++ b/.taskfiles/VolSync/ListJob.tmpl.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: "list-${rsrc}-${ts}" + namespace: "${namespace}" +spec: + ttlSecondsAfterFinished: 3600 + template: + spec: + automountServiceAccountToken: false + restartPolicy: OnFailure + containers: + - name: list + image: docker.io/restic/restic:0.16.0 + args: ["snapshots"] + envFrom: + - secretRef: + name: "${rsrc}-restic-secret" diff --git a/.taskfiles/VolSync/ReplicationDestination.tmpl.yaml b/.taskfiles/VolSync/ReplicationDestination.tmpl.yaml new file mode 100644 index 0000000..46be699 --- /dev/null +++ b/.taskfiles/VolSync/ReplicationDestination.tmpl.yaml @@ -0,0 +1,25 @@ +--- +apiVersion: volsync.backube/v1alpha1 +kind: ReplicationDestination +metadata: + name: "${rsrc}-${claim}-${ts}" + namespace: "${namespace}" +spec: + trigger: + manual: restore-once + restic: + repository: "${rsrc}-restic-secret" + destinationPVC: "${claim}" + copyMethod: Direct + storageClassName: ceph-block + # IMPORTANT NOTE: + # Set to the last X number of snapshots to restore from + previous: ${previous} + # OR; + # IMPORTANT NOTE: + # On bootstrap set `restoreAsOf` to the time the old cluster was destroyed. + # This will essentially prevent volsync from trying to restore a backup + # from a application that started with default data in the PVC. + # Do not restore snapshots made after the following RFC3339 Timestamp. 
+ # date --rfc-3339=seconds (--utc) + # restoreAsOf: "2022-12-10T16:00:00-05:00" diff --git a/.taskfiles/VolSync/Tasks.yaml b/.taskfiles/VolSync/Tasks.yaml new file mode 100644 index 0000000..a27cc6d --- /dev/null +++ b/.taskfiles/VolSync/Tasks.yaml @@ -0,0 +1,158 @@ +--- +version: "3" + +x-task-vars: &task-vars + rsrc: '{{.rsrc}}' + controller: '{{.controller}}' + namespace: '{{.namespace}}' + claim: '{{.claim}}' + ts: '{{.ts}}' + kustomization: '{{.kustomization}}' + previous: '{{.previous}}' + +vars: + destinationTemplate: "{{.ROOT_DIR}}/.taskfiles/VolSync/ReplicationDestination.tmpl.yaml" + wipeJobTemplate: "{{.ROOT_DIR}}/.taskfiles/VolSync/WipeJob.tmpl.yaml" + waitForJobScript: "{{.ROOT_DIR}}/.taskfiles/VolSync/wait-for-job.sh" + listJobTemplate: "{{.ROOT_DIR}}/.taskfiles/VolSync/ListJob.tmpl.yaml" + unlockJobTemplate: "{{.ROOT_DIR}}/.taskfiles/VolSync/UnlockJob.tmpl.yaml" + ts: '{{now | date "150405"}}' + +tasks: + + list: + desc: List all snapshots taken by restic for a given ReplicationSource (ex. task volsync:list rsrc=plex [namespace=default]) + silent: true + cmds: + - envsubst < <(cat {{.listJobTemplate}}) | kubectl apply -f - + - bash {{.waitForJobScript}} list-{{.rsrc}}-{{.ts}} {{.namespace}} + - kubectl -n {{.namespace}} wait job/list-{{.rsrc}}-{{.ts}} --for condition=complete --timeout=1m + - kubectl -n {{.namespace}} logs job/list-{{.rsrc}}-{{.ts}} --container list + - kubectl -n {{.namespace}} delete job list-{{.rsrc}}-{{.ts}} + vars: + rsrc: '{{ or .rsrc (fail "ReplicationSource `rsrc` is required") }}' + namespace: '{{.namespace | default "default"}}' + env: *task-vars + preconditions: + - sh: test -f {{.waitForJobScript}} + - sh: test -f {{.listJobTemplate}} + + unlock: + desc: Unlocks restic repository for a given ReplicationSource (ex. task volsync:unlock rsrc=plex [namespace=default]) + silent: true + cmds: + - envsubst < <(cat {{.unlockJobTemplate}}) | kubectl apply -f - + - bash {{.waitForJobScript}} unlock-{{.rsrc}}-{{.ts}} {{.namespace}} + - kubectl -n {{.namespace}} wait job/unlock-{{.rsrc}}-{{.ts}} --for condition=complete --timeout=1m + - kubectl -n {{.namespace}} logs job/unlock-{{.rsrc}}-{{.ts}} --container unlock + - kubectl -n {{.namespace}} delete job unlock-{{.rsrc}}-{{.ts}} + vars: + rsrc: '{{ or .rsrc (fail "ReplicationSource `rsrc` is required") }}' + namespace: '{{.namespace | default "default"}}' + env: *task-vars + preconditions: + - sh: test -f {{.waitForJobScript}} + - sh: test -f {{.unlockJobTemplate}} + + # To run backup jobs in parallel for all replicationsources: + # - kubectl get replicationsources --all-namespaces --no-headers | awk '{print $2, $1}' | xargs --max-procs=4 -l bash -c 'task volsync:snapshot rsrc=$0 namespace=$1' + # + snapshot: + desc: Trigger a Restic ReplicationSource snapshot (ex. 
task volsync:snapshot rsrc=plex [namespace=default]) + cmds: + - kubectl -n {{.namespace}} patch replicationsources {{.rsrc}} --type merge -p '{"spec":{"trigger":{"manual":"{{.ts}}"}}}' + - bash {{.waitForJobScript}} volsync-src-{{.rsrc}} {{.namespace}} + - kubectl -n {{.namespace}} wait job/volsync-src-{{.rsrc}} --for condition=complete --timeout=120m + # TODO: Find a way to output logs + # Error from server (NotFound): jobs.batch "volsync-src-zzztest" not found + # - kubectl -n {{.namespace}} logs job/volsync-src-{{.rsrc}} + vars: + rsrc: '{{ or .rsrc (fail "ReplicationSource `rsrc` is required") }}' + namespace: '{{.namespace | default "default"}}' + env: *task-vars + preconditions: + - sh: test -f {{.waitForJobScript}} + - sh: kubectl -n {{.namespace}} get replicationsources {{.rsrc}} + msg: "ReplicationSource '{{.rsrc}}' not found in namespace '{{.namespace}}'" + + # To run restore jobs in parallel for all replicationdestinations: + # - kubectl get replicationsources --all-namespaces --no-headers | awk '{print $2, $1}' | xargs --max-procs=2 -l bash -c 'task volsync:restore rsrc=$0 namespace=$1' + # + restore: + desc: Trigger a Restic ReplicationSource restore (ex. task volsync:restore rsrc=plex [namespace=default]) + cmds: + - task: restore-suspend-app + vars: *task-vars + - task: restore-wipe-job + vars: *task-vars + - task: restore-volsync-job + vars: *task-vars + - task: restore-resume-app + vars: *task-vars + vars: + rsrc: '{{ or .rsrc (fail "Variable `rsrc` is required") }}' + namespace: '{{.namespace | default "default"}}' + # 1) Query to find the Flux Kustomization associated with the ReplicationSource (rsrc) + kustomization: + sh: | + kubectl -n {{.namespace}} get replicationsource {{.rsrc}} \ + -o jsonpath="{.metadata.labels.kustomize\.toolkit\.fluxcd\.io/name}" + # 2) Query to find the Claim associated with the ReplicationSource (rsrc) + claim: + sh: | + kubectl -n {{.namespace}} get replicationsource {{.rsrc}} \ + -o jsonpath="{.spec.sourcePVC}" + # 3) Query to find the controller associated with the PersistentVolumeClaim (claim) + controller: + sh: | + app=$(kubectl -n {{.namespace}} get persistentvolumeclaim {{.claim}} -o jsonpath="{.metadata.labels.app\.kubernetes\.io/name}") + if kubectl -n {{ .namespace }} get deployment.apps/$app >/dev/null 2>&1 ; then + echo "deployment.apps/$app" + else + echo "statefulset.apps/$app" + fi + previous: "{{.previous | default 2}}" + env: *task-vars + preconditions: + - sh: test -f {{.wipeJobTemplate}} + - sh: test -f {{.destinationTemplate}} + - sh: test -f {{.waitForJobScript}} + + # Suspend the Flux ks and hr + restore-suspend-app: + internal: true + cmds: + - flux -n flux-system suspend kustomization {{.kustomization}} + - flux -n {{.namespace}} suspend helmrelease {{.rsrc}} + - kubectl -n {{.namespace}} scale {{.controller}} --replicas 0 + - kubectl -n {{.namespace}} wait pod --for delete --selector="app.kubernetes.io/name={{.rsrc}}" --timeout=2m + env: *task-vars + + # Wipe the PVC of all data + restore-wipe-job: + internal: true + cmds: + - envsubst < <(cat {{.wipeJobTemplate}}) | kubectl apply -f - + - bash {{.waitForJobScript}} wipe-{{.rsrc}}-{{.claim}}-{{.ts}} {{.namespace}} + - kubectl -n {{.namespace}} wait job/wipe-{{.rsrc}}-{{.claim}}-{{.ts}} --for condition=complete --timeout=120m + - kubectl -n {{.namespace}} logs job/wipe-{{.rsrc}}-{{.claim}}-{{.ts}} --container wipe + - kubectl -n {{.namespace}} delete job wipe-{{.rsrc}}-{{.claim}}-{{.ts}} + env: *task-vars + + # Create VolSync replicationdestination CR to restore data 
+ restore-volsync-job: + internal: true + cmds: + - envsubst < <(cat {{.destinationTemplate}}) | kubectl apply -f - + - bash {{.waitForJobScript}} volsync-dst-{{.rsrc}}-{{.claim}}-{{.ts}} {{.namespace}} + - kubectl -n {{.namespace}} wait job/volsync-dst-{{.rsrc}}-{{.claim}}-{{.ts}} --for condition=complete --timeout=120m + - kubectl -n {{.namespace}} delete replicationdestination {{.rsrc}}-{{.claim}}-{{.ts}} + env: *task-vars + + # Resume Flux ks and hr + restore-resume-app: + internal: true + cmds: + - flux -n {{.namespace}} resume helmrelease {{.rsrc}} + - flux -n flux-system resume kustomization {{.kustomization}} + env: *task-vars diff --git a/.taskfiles/VolSync/UnlockJob.tmpl.yaml b/.taskfiles/VolSync/UnlockJob.tmpl.yaml new file mode 100644 index 0000000..310f1b7 --- /dev/null +++ b/.taskfiles/VolSync/UnlockJob.tmpl.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: "unlock-${rsrc}-${ts}" + namespace: "${namespace}" +spec: + ttlSecondsAfterFinished: 3600 + template: + spec: + automountServiceAccountToken: false + restartPolicy: OnFailure + containers: + - name: unlock + image: docker.io/restic/restic:0.16.0 + args: ["unlock", "--remove-all"] + envFrom: + - secretRef: + name: "${rsrc}-restic-secret" diff --git a/.taskfiles/VolSync/WipeJob.tmpl.yaml b/.taskfiles/VolSync/WipeJob.tmpl.yaml new file mode 100644 index 0000000..eb878b0 --- /dev/null +++ b/.taskfiles/VolSync/WipeJob.tmpl.yaml @@ -0,0 +1,25 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: "wipe-${rsrc}-${claim}-${ts}" + namespace: "${namespace}" +spec: + ttlSecondsAfterFinished: 3600 + template: + spec: + automountServiceAccountToken: false + restartPolicy: OnFailure + containers: + - name: wipe + image: public.ecr.aws/docker/library/busybox:latest + command: ["/bin/sh", "-c", "cd /config; find . 
-delete"] + volumeMounts: + - name: config + mountPath: /config + securityContext: + privileged: true + volumes: + - name: config + persistentVolumeClaim: + claimName: "${claim}" diff --git a/.taskfiles/VolSync/wait-for-job.sh b/.taskfiles/VolSync/wait-for-job.sh new file mode 100644 index 0000000..32feadd --- /dev/null +++ b/.taskfiles/VolSync/wait-for-job.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +JOB_NAME=$1 +NAMESPACE="${2:-default}" + +[[ -z "${JOB_NAME}" ]] && echo "Job name not specified" && exit 1 + +while true; do + STATUS="$(kubectl -n "${NAMESPACE}" get pod -l job-name="${JOB_NAME}" -o jsonpath='{.items[*].status.phase}')" + if [ "${STATUS}" == "Pending" ]; then + break + fi + sleep 1 +done diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..cc74893 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,49 @@ +{ + "ansible.validation.lint.arguments": "-c .ansible-lint", + "files.associations": { + "*.json5": "jsonc", + "**/ansible/**/*.yaml": "ansible", + "**/ansible/**/*.sops.yaml": "yaml", + "**/ansible/**/inventory/**/*.yaml": "yaml", + "**/terraform/**/*.tf": "terraform", + "**/kubernetes/**/*.sops.toml": "plaintext" + }, + "material-icon-theme.folders.associations": { + ".taskfiles": "utils", + "bootstrap": "import", + "charts": "kubernetes", + "hack": "scripts", + "repositories": "database", + "terraforms": "terraform", + "vars": "other", + // namespaces + "cert-manager": "guard", + "external-secrets": "keys", + "kube-system": "kubernetes", + "monitoring": "event", + "networking": "connection", + "rook-ceph": "dump", + }, + "yaml.schemaStore.enable": true, + "yaml.schemas": { + "ansible": "ansible/**/*.yaml", + "kubernetes": "kubernetes/**/*.yaml", + "schemaservice://combinedschema/ansible": "file:///home/jahanson/projects/k3s-ops/ansible/kubernetes/inventory/hosts.yaml" + }, + "editor.fontFamily": "FiraCode Nerd Font", + "editor.fontLigatures": true, + "editor.bracketPairColorization.enabled": true, + "editor.guides.bracketPairs": true, + "editor.guides.bracketPairsHorizontal": true, + "editor.guides.highlightActiveBracketPair": true, + "editor.hover.delay": 1500, + "editor.stickyScroll.enabled": false, + "editor.rulers": [ + 100 + ], + "explorer.autoReveal": false, + "files.trimTrailingWhitespace": true, + "ansible.python.interpreterPath": "/usr/bin/python3", + "sops.defaults.ageKeyFile": "age.key", + "ansible.validation.lint.path": "~/projects/k3s-ops/.venv/bin/ansible-lint" +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..f7fe79a --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +k3s version of my kubernetes lab on hetzner diff --git a/Taskfile.yaml b/Taskfile.yaml new file mode 100644 index 0000000..c3b7bca --- /dev/null +++ b/Taskfile.yaml @@ -0,0 +1,117 @@ +--- +version: "3" + +vars: + PYTHON_BIN: python3 + CLUSTER_DIR: "{{.ROOT_DIR}}/kubernetes" + ANSIBLE_DIR: "{{.ROOT_DIR}}/ansible" + CLUSTER_SECRETS_FILE: "{{.CLUSTER_DIR}}/flux/vars/cluster-secrets.sops.env" + CLUSTER_SETTINGS_FILE: "{{.CLUSTER_DIR}}/flux/vars/cluster-settings.env" + +env: + KUBECONFIG: "{{.ROOT_DIR}}/kubeconfig" + SOPS_AGE_KEY_FILE: "{{.ROOT_DIR}}/age.key" + PATH: "{{.ROOT_DIR}}/.venv/bin:$PATH" + VIRTUAL_ENV: "{{.ROOT_DIR}}/.venv" + ANSIBLE_COLLECTIONS_PATH: "{{.ROOT_DIR}}/.venv/galaxy" + ANSIBLE_ROLES_PATH: "{{.ROOT_DIR}}/.venv/galaxy/ansible_roles" + ANSIBLE_VARS_ENABLED: "host_group_vars,community.sops.sops" + K8S_AUTH_KUBECONFIG: "{{.ROOT_DIR}}/kubeconfig" + +includes: + volsync: .taskfiles/VolSync/Tasks.yaml + precommit: 
.taskfiles/PreCommit/Tasks.yaml + +tasks: + + default: + silent: true + cmds: ["task -l"] + + configure-venv: + desc: Install or upgrade the Python virtual env + cmds: + - "{{.PYTHON_BIN}} -m venv {{.ROOT_DIR}}/.venv" + - .venv/bin/python3 -m pip install --upgrade pip setuptools wheel + - .venv/bin/python3 -m pip install --upgrade --requirement "{{.ROOT_DIR}}/requirements.txt" + - .venv/bin/ansible-galaxy install --role-file "{{.ROOT_DIR}}/requirements.yaml" --force + + flux-apply: + desc: Apply a resource path that contains Flux substitution variables + dotenv: ['{{.CLUSTER_SETTINGS_FILE}}'] + vars: + ks: '{{ or .ks (fail "Missing path (`ks` var)") }}' + cmd: | + sops exec-env {{.CLUSTER_SECRETS_FILE}} \ + "kustomize build --load-restrictor=LoadRestrictionsNone {{.ks}} | \ + envsubst | kubectl apply --server-side --field-manager=kustomize-controller -f -" + preconditions: + - sh: test -f {{.CLUSTER_SECRETS_FILE}} + - sh: test -f {{.CLUSTER_SETTINGS_FILE}} + + sync-secrets: + desc: Sync ExternalSecret resources + vars: + secret: '{{ .secret | default ""}}' + namespace: '{{.namespace | default "default"}}' + cmd: | + {{if eq .secret ""}} + kubectl get externalsecret.external-secrets.io --all-namespaces --no-headers -A | awk '{print $1, $2}' \ + | xargs --max-procs=4 -l bash -c 'kubectl -n $0 annotate externalsecret.external-secrets.io $1 force-sync=$(date +%s) --overwrite' + {{else}} + kubectl -n {{.namespace}} annotate externalsecret.external-secrets.io {{.secret}} force-sync=$(date +%s) --overwrite + {{end}} + preconditions: + - kubectl -n {{.namespace}} get externalsecret {{.secret}} + + mount-volume: + desc: Mount a PersistentVolumeClaim to a temporary pod + interactive: true + vars: + claim: '{{ or .claim (fail "PersistentVolumeClaim `claim` is required") }}' + namespace: '{{.namespace | default "default"}}' + cmd: | + kubectl run -n {{.namespace}} debug-{{.claim}} -i --tty --rm --image=null --privileged --overrides=' + { + "apiVersion": "v1", + "spec": { + "containers": [ + { + "name": "debug", + "image": "ghcr.io/onedr0p/alpine:rolling", + "command": ["/bin/bash"], + "stdin": true, + "stdinOnce": true, + "tty": true, + "volumeMounts": [ + { + "name": "config", + "mountPath": "/config" + } + ] + } + ], + "volumes": [ + { + "name": "config", + "persistentVolumeClaim": { + "claimName": "{{.claim}}" + } + } + ], + "restartPolicy": "Never" + } + }' + preconditions: + - kubectl -n {{.namespace}} get pvc {{.claim}} + + # https://github.com/fluxcd/helm-controller/issues/644 + "644": + cmds: + - kubectl -n {{.namespace}} delete secret -l owner=helm,name={{.release}},status=pending-upgrade + - flux -n {{.namespace}} reconcile hr {{.release}} + vars: + release: '{{ or .release (fail "HelmRelease `release` is required") }}' + namespace: '{{.namespace | default "default"}}' + preconditions: + - flux -n {{.namespace}} get hr {{.release}} diff --git a/ansible/kubernetes/.envrc b/ansible/kubernetes/.envrc new file mode 100644 index 0000000..a3eca56 --- /dev/null +++ b/ansible/kubernetes/.envrc @@ -0,0 +1,8 @@ +#shellcheck disable=SC2148,SC2155 +export SOPS_AGE_KEY_FILE="$(expand_path ../../age.key)" +export VIRTUAL_ENV="$(expand_path ../../.venv)" +export ANSIBLE_COLLECTIONS_PATH=$(expand_path ../../.venv/galaxy) +export ANSIBLE_ROLES_PATH=$(expand_path ../../.venv/galaxy/ansible_roles) +export ANSIBLE_VARS_ENABLED="host_group_vars,community.sops.sops" +export ANSIBLE_INVENTORY=$(expand_path ./inventory/hosts.yaml) +PATH_add "$(expand_path ../../.venv/bin)" diff --git 
a/ansible/kubernetes/inventory/group_vars/all/main.yaml b/ansible/kubernetes/inventory/group_vars/all/main.yaml new file mode 100644 index 0000000..aef58b2 --- /dev/null +++ b/ansible/kubernetes/inventory/group_vars/all/main.yaml @@ -0,0 +1,24 @@ +--- +# renovate: datasource=github-releases depName=k3s-io/k3s +k3s_release_version: "v1.27.4+k3s1" +k3s_install_hard_links: true +k3s_become: true +k3s_etcd_datastore: true +k3s_registration_address: 10.2.0.3 +# /var/lib/rancher/k3s/server/manifests +k3s_server_manifests_urls: + # Essential Prometheus Operator CRDs (the rest are installed with the kube-prometheus-stack helm release) + - url: https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.66.0/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml + filename: custom-prometheus-podmonitors.yaml + - url: https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.66.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml + filename: custom-prometheus-prometheusrules.yaml + - url: https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.66.0/example/prometheus-operator-crd/monitoring.coreos.com_scrapeconfigs.yaml + filename: custom-prometheus-scrapeconfigs.yaml + - url: https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.66.0/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml + filename: custom-prometheus-servicemonitors.yaml +# /var/lib/rancher/k3s/server/manifests +k3s_server_manifests_templates: + - custom-cilium-helmchart.yaml.j2 + - custom-cilium-l2.yaml.j2 + - custom-coredns-helmchart.yaml.j2 +# /var/lib/rancher/k3s/agent/pod-manifests diff --git a/ansible/kubernetes/inventory/group_vars/all/supplemental.yaml b/ansible/kubernetes/inventory/group_vars/all/supplemental.yaml new file mode 100644 index 0000000..6aba05e --- /dev/null +++ b/ansible/kubernetes/inventory/group_vars/all/supplemental.yaml @@ -0,0 +1,3 @@ +--- +github_username: jahanson +timezone: America/Chicago diff --git a/ansible/kubernetes/inventory/group_vars/master/main.yaml b/ansible/kubernetes/inventory/group_vars/master/main.yaml new file mode 100644 index 0000000..1849c7c --- /dev/null +++ b/ansible/kubernetes/inventory/group_vars/master/main.yaml @@ -0,0 +1,29 @@ +--- +k3s_control_node: true +k3s_server: + node-ip: "{{ ansible_host }}" + tls-san: + - "{{ k3s_registration_address }}" + https-listen-port: 6443 + docker: false + flannel-backend: "none" # quote + disable: + - coredns + - flannel + - local-storage + - metrics-server + - servicelb + - traefik + disable-network-policy: true + disable-cloud-controller: true + disable-kube-proxy: true + cluster-cidr: 10.32.0.0/16 + service-cidr: 10.33.0.0/16 + write-kubeconfig-mode: "0644" + etcd-expose-metrics: true + kube-controller-manager-arg: + - bind-address=0.0.0.0 + kube-scheduler-arg: + - bind-address=0.0.0.0 + kube-apiserver-arg: + - anonymous-auth=true diff --git a/ansible/kubernetes/inventory/group_vars/worker/main.yaml b/ansible/kubernetes/inventory/group_vars/worker/main.yaml new file mode 100644 index 0000000..5fdb463 --- /dev/null +++ b/ansible/kubernetes/inventory/group_vars/worker/main.yaml @@ -0,0 +1,4 @@ +--- +k3s_control_node: false +k3s_agent: + node-ip: "{{ ansible_host }}" diff --git a/ansible/kubernetes/inventory/hosts.yaml b/ansible/kubernetes/inventory/hosts.yaml new file mode 100644 index 0000000..7cc29d4 --- /dev/null +++ b/ansible/kubernetes/inventory/hosts.yaml @@ -0,0 +1,28 @@ +--- +kubernetes: + vars: 
+ ansible_user: jahanson + ansible_ssh_port: 22 + children: + master: + hosts: + eonwe: + ansible_host: 10.2.0.4 + nienna: + ansible_host: 10.2.0.5 + arlen: + ansible_host: 10.2.0.6 + worker: + hosts: + aule: + ansible_host: 10.2.1.10 + ceph_drives: + - /dev/disk/by-id/nvme-SAMSUNG_MZQL2960HCJR-00A07_S64FNE0RA01210 + manwe: + ansible_host: 10.2.1.11 + ceph_drives: + - /dev/disk/by-id/nvme-SAMSUNG_MZQL2960HCJR-00A07_S64FNE0R801843 + varda: + ansible_host: 10.2.1.12 + ceph_drives: + - /dev/disk/by-id/nvme-SAMSUNG_MZQL2960HCJR-00A07_S64FNE0R801309 diff --git a/ansible/kubernetes/playbooks/ceph-reset.yaml b/ansible/kubernetes/playbooks/ceph-reset.yaml new file mode 100644 index 0000000..7acd285 --- /dev/null +++ b/ansible/kubernetes/playbooks/ceph-reset.yaml @@ -0,0 +1,39 @@ +--- +- name: Reset Ceph Drives + hosts: all + become: true + gather_facts: true + any_errors_fatal: true + pre_tasks: + - name: Pausing for 2 seconds... + ansible.builtin.pause: + seconds: 2 + tasks: + - name: Reset Ceph Drives # noqa: ignore-errors + ignore_errors: true + when: ceph_drives | default([]) | length > 0 + block: + - name: Delete (/var/lib/rook) + ansible.builtin.file: + state: absent + path: /var/lib/rook + - name: Delete (/dev/mapper/ceph-*) # noqa: no-changed-when + ansible.builtin.shell: | + set -o pipefail + ls /dev/mapper/ceph-* | xargs -I% -- dmsetup remove_all --force % || true + - name: Delete (/dev/ceph-*) # noqa: no-changed-when + ansible.builtin.command: rm -rf /dev/ceph-* + - name: Delete (/dev/mapper/ceph--*) # noqa: no-changed-when + ansible.builtin.command: rm -rf /dev/mapper/ceph--* + - name: Wipe (sgdisk) # noqa: no-changed-when + ansible.builtin.command: "sgdisk --zap-all {{ item }}" + loop: "{{ ceph_drives }}" + - name: Wipe (dd) # noqa: no-changed-when + ansible.builtin.command: "dd if=/dev/zero of={{ item }} bs=1M count=100 oflag=direct,dsync" + loop: "{{ ceph_drives }}" + - name: Wipe (blkdiscard) # noqa: no-changed-when + ansible.builtin.command: "blkdiscard {{ item }}" + loop: "{{ ceph_drives }}" + - name: Wipe (partprobe) # noqa: no-changed-when + ansible.builtin.command: "partprobe {{ item }}" + loop: "{{ ceph_drives }}" diff --git a/ansible/kubernetes/playbooks/cluster-installation.yaml b/ansible/kubernetes/playbooks/cluster-installation.yaml new file mode 100644 index 0000000..682eb1d --- /dev/null +++ b/ansible/kubernetes/playbooks/cluster-installation.yaml @@ -0,0 +1,69 @@ +--- +- name: Cluster Installation + hosts: all + become: true + gather_facts: true + any_errors_fatal: true + pre_tasks: + - name: Pausing for 2 seconds... 
+ ansible.builtin.pause: + seconds: 2 + tasks: + - name: Check if cluster is installed + check_mode: false + ansible.builtin.stat: + path: /etc/rancher/k3s/config.yaml + register: k3s_installed + + - name: Ignore manifests templates and urls if the cluster is already installed + when: k3s_installed.stat.exists + ansible.builtin.set_fact: + k3s_server_manifests_templates: [] + k3s_server_manifests_urls: [] + + - name: Install Kubernetes + ansible.builtin.include_role: + name: xanmanning.k3s + public: true + vars: + k3s_state: installed + + - name: Wait for custom manifests to rollout + when: + - k3s_primary_control_node + - (k3s_server_manifests_templates | length > 0 + or k3s_server_manifests_urls | length > 0) + kubernetes.core.k8s_info: + kubeconfig: /etc/rancher/k3s/k3s.yaml + kind: "{{ item.kind }}" + name: "{{ item.name }}" + namespace: "{{ item.namespace | default('') }}" + wait: true + wait_sleep: 10 + wait_timeout: 360 + loop: + - { name: cilium, kind: HelmChart, namespace: kube-system } + - { name: coredns, kind: HelmChart, namespace: kube-system } + - { name: policy, kind: CiliumL2AnnouncementPolicy } + - { name: pool, kind: CiliumLoadBalancerIPPool } + - { name: podmonitors.monitoring.coreos.com, kind: CustomResourceDefinition } + - { name: prometheusrules.monitoring.coreos.com, kind: CustomResourceDefinition } + - { name: scrapeconfigs.monitoring.coreos.com, kind: CustomResourceDefinition } + - { name: servicemonitors.monitoring.coreos.com, kind: CustomResourceDefinition } + + - name: Coredns + when: k3s_primary_control_node + ansible.builtin.include_tasks: tasks/coredns.yaml + + - name: Cilium + when: k3s_primary_control_node + ansible.builtin.include_tasks: tasks/cilium.yaml + + - name: Cruft + when: k3s_primary_control_node + ansible.builtin.include_tasks: tasks/cruft.yaml + + - name: Stale Containers + ansible.builtin.include_tasks: tasks/stale_containers.yaml + vars: + stale_containers_state: enabled diff --git a/ansible/kubernetes/playbooks/cluster-nuke.yaml b/ansible/kubernetes/playbooks/cluster-nuke.yaml new file mode 100644 index 0000000..2db6f28 --- /dev/null +++ b/ansible/kubernetes/playbooks/cluster-nuke.yaml @@ -0,0 +1,30 @@ +--- +- name: Cluster Nuke + hosts: + - master + - worker + become: true + gather_facts: true + any_errors_fatal: true + pre_tasks: + - name: Pausing for 2 seconds... + ansible.builtin.pause: + seconds: 2 + tasks: + - name: Uninstall k3s + ansible.builtin.include_role: + name: xanmanning.k3s + public: true + vars: + k3s_state: uninstalled + - name: Gather list of CNI files + ansible.builtin.find: + paths: /etc/cni/net.d + patterns: "*" + hidden: true + register: directory_contents + - name: Delete CNI files + ansible.builtin.file: + path: "{{ item.path }}" + state: absent + loop: "{{ directory_contents.files }}" diff --git a/ansible/kubernetes/playbooks/cluster-prepare.yaml b/ansible/kubernetes/playbooks/cluster-prepare.yaml new file mode 100644 index 0000000..5d62076 --- /dev/null +++ b/ansible/kubernetes/playbooks/cluster-prepare.yaml @@ -0,0 +1,184 @@ +--- +- name: Prepare System + hosts: all + become: true + gather_facts: true + any_errors_fatal: true + pre_tasks: + - name: Pausing for 2 seconds... 
+ ansible.builtin.pause: + seconds: 2 + tasks: + - name: Locale + block: + - name: Locale | Set timezone + community.general.timezone: + name: "{{ timezone | default('Etc/UTC') }}" + + - name: Packages + block: + - name: Packages | Add fish key + ansible.builtin.get_url: + url: https://download.opensuse.org/repositories/shells:fish:release:3/Debian_12/Release.key + dest: /etc/apt/trusted.gpg.d/fish.asc + owner: root + group: root + mode: "0644" + - name: Packages | Add fish repository + ansible.builtin.apt_repository: + repo: deb [signed-by=/etc/apt/trusted.gpg.d/fish.asc] http://download.opensuse.org/repositories/shells:/fish:/release:/3/Debian_12/ / + filename: fish + update_cache: true + - name: Packages | Add non-free repository + ansible.builtin.apt_repository: + repo: deb http://deb.debian.org/debian/ stable main contrib non-free + filename: non-free + update_cache: true + - name: Packages | Install + ansible.builtin.apt: + name: apt-transport-https,ca-certificates,conntrack,curl,dirmngr,fish,gdisk, + gnupg,hdparm,htop,iperf3,iptables,iputils-ping,ipvsadm, + libseccomp2,lm-sensors,neofetch,net-tools,nfs-common,nvme-cli,open-iscsi,parted,psmisc,python3, + python3-apt,python3-kubernetes,python3-yaml,smartmontools,socat,software-properties-common, + unzip,util-linux + install_recommends: false + + - name: User Configuration + block: + - name: User Configuration | SSH keys + ansible.posix.authorized_key: + user: "{{ ansible_user }}" + key: "https://github.com/{{ github_username }}.keys" + - name: User Configuration | Silence login + ansible.builtin.file: + dest: "{{ '/home/' + ansible_user if ansible_user != 'root' else '/root' }}/.hushlogin" + state: touch + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + mode: "0644" + modification_time: preserve + access_time: preserve + - name: User Configuration | Add user to sudoers + when: ansible_user != 'root' + ansible.builtin.copy: + content: "{{ ansible_user }} ALL=(ALL:ALL) NOPASSWD:ALL" + dest: "/etc/sudoers.d/{{ ansible_user }}" + owner: root + group: root + mode: "0440" + - name: User Configuration | Fish shell (1) + ansible.builtin.user: + name: "{{ ansible_user }}" + shell: /usr/bin/fish + - name: User Configuration | Fish shell (2) + ansible.builtin.file: + path: "{{ '/home/' + ansible_user if ansible_user != 'root' else '/root' }}/.config/fish/functions" + state: directory + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + recurse: true + - name: User Configuration | Fish shell (3) + ansible.builtin.copy: + dest: "{{ '/home/' + ansible_user if ansible_user != 'root' else '/root' }}/.config/fish/functions/fish_greeting.fish" + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + mode: "0755" + content: neofetch --config none + - name: User Configuration | Fish shell (3) + ansible.builtin.copy: + dest: "{{ '/home/' + ansible_user if ansible_user != 'root' else '/root' }}/.config/fish/functions/k.fish" + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + mode: "0755" + content: | + function k --wraps=kubectl --description 'kubectl shorthand' + kubectl $argv + end + + - name: Network Configuration + notify: Reboot + block: + - name: Network Configuration | Set hostname + ansible.builtin.hostname: + name: "{{ inventory_hostname }}" + - name: Network Configuration | Update hosts + ansible.builtin.copy: + dest: /etc/hosts + content: | + 127.0.0.1 localhost + 127.0.1.1 {{ inventory_hostname }} + + # The following lines are desirable for IPv6 capable hosts + ::1 localhost ip6-localhost ip6-loopback + 
ff02::1 ip6-allnodes + ff02::2 ip6-allrouters + mode: preserve + # https://github.com/cilium/cilium/issues/18706 + - name: Network Configuration | Cilium (1) + ansible.builtin.lineinfile: + dest: /etc/systemd/networkd.conf + regexp: ManageForeignRoutingPolicyRules + line: ManageForeignRoutingPolicyRules=no + - name: Network Configuration | Cilium (2) + ansible.builtin.lineinfile: + dest: /etc/systemd/networkd.conf + regexp: ManageForeignRoutes + line: ManageForeignRoutes=no + + - name: System Configuration + notify: Reboot + block: + - name: System Configuration | Neofetch + ansible.builtin.copy: + dest: /etc/profile.d/neofetch.sh + mode: "0755" + content: neofetch --config none + - name: System Configuration | Disable swap + ansible.posix.mount: + name: "{{ item }}" + fstype: swap + state: absent + loop: ["none", "swap"] + - name: System Configuration | Kernel modules (1) + community.general.modprobe: + name: "{{ item }}" + state: present + loop: ["br_netfilter", "ceph", "ip_vs", "ip_vs_rr", "nbd", "overlay", "rbd"] + - name: System Configuration | Kernel modules (2) + ansible.builtin.copy: + dest: "/etc/modules-load.d/{{ item }}.conf" + mode: "0644" + content: "{{ item }}" + loop: ["br_netfilter", "ceph", "ip_vs", "ip_vs_rr", "nbd", "overlay", "rbd"] + - name: System Configuration | Sysctl + ansible.posix.sysctl: + name: "{{ item.key }}" + value: "{{ item.value }}" + sysctl_file: /etc/sysctl.d/99-kubernetes.conf + reload: true + with_dict: "{{ sysctl_config }}" + vars: + sysctl_config: + fs.inotify.max_queued_events: 65536 + fs.inotify.max_user_watches: 524288 + fs.inotify.max_user_instances: 8192 + - name: System Configuration | Grub (1) + ansible.builtin.replace: + path: /etc/default/grub + regexp: '^(GRUB_CMDLINE_LINUX=(?:(?![" ]{{ item.key | regex_escape }}=).)*)(?:[" ]{{ item.key | regex_escape }}=\S+)?(.*")$' + replace: '\1 {{ item.key }}={{ item.value }}\2' + with_dict: "{{ grub_config }}" + vars: + grub_config: + apparmor: "0" + mitigations: "off" + register: grub_status + - name: System Configuration | Grub (2) # noqa: no-changed-when no-handler + ansible.builtin.command: update-grub + when: grub_status.changed + + handlers: + - name: Reboot + ansible.builtin.reboot: + msg: Rebooting nodes + reboot_timeout: 3600 diff --git a/ansible/kubernetes/playbooks/cluster-update-rollout.yaml b/ansible/kubernetes/playbooks/cluster-update-rollout.yaml new file mode 100644 index 0000000..b30fa3b --- /dev/null +++ b/ansible/kubernetes/playbooks/cluster-update-rollout.yaml @@ -0,0 +1,75 @@ +--- +# https://github.com/kevincoakley/ansible-role-k8s-rolling-update +- name: Cluster update rollout + hosts: all + become: true + gather_facts: true + any_errors_fatal: true + serial: 1 + pre_tasks: + - name: Pausing for 2 seconds... 
+ ansible.builtin.pause: + seconds: 2 + tasks: + - name: Details + ansible.builtin.command: "kubectl get node {{ inventory_hostname }} -o json" + register: kubectl_get_node + delegate_to: "{{ groups['master'][0] }}" + failed_when: false + changed_when: false + + - name: Update + when: + # When status.conditions[x].type == Ready then check stats.conditions[x].status for True|False + - kubectl_get_node['stdout'] | from_json | json_query("status.conditions[?type == 'Ready'].status") + # If spec.unschedulable is defined then the node is cordoned + - not (kubectl_get_node['stdout'] | from_json).spec.unschedulable is defined + block: + - name: Cordon + ansible.builtin.command: "kubectl cordon {{ inventory_hostname }}" + delegate_to: "{{ groups['master'][0] }}" + changed_when: false + + - name: Wait to cordon + ansible.builtin.command: "kubectl get node {{ inventory_hostname }} -o json" + register: wait_for_cordon + retries: 10 + delay: 10 + delegate_to: "{{ groups['master'][0] }}" + changed_when: false + until: (wait_for_cordon['stdout'] | from_json).spec.unschedulable + + - name: Drain + ansible.builtin.command: "kubectl drain --ignore-daemonsets --delete-emptydir-data --force {{ inventory_hostname }}" + delegate_to: "{{ groups['master'][0] }}" + changed_when: false + + - name: Update + ansible.builtin.apt: + upgrade: dist + update_cache: true + + - name: Check if reboot is required + ansible.builtin.stat: + path: /var/run/reboot-required + register: reboot_required + + - name: Reboot + when: reboot_required.stat.exists + ansible.builtin.reboot: + msg: Rebooting node + post_reboot_delay: 120 + reboot_timeout: 3600 + + - name: Uncordon + ansible.builtin.command: "kubectl uncordon {{ inventory_hostname }}" + delegate_to: "{{ groups['master'][0] }}" + changed_when: false + + - name: Wait to uncordon + ansible.builtin.command: "kubectl get node {{ inventory_hostname }} -o json" + retries: 10 + delay: 10 + delegate_to: "{{ groups['master'][0] }}" + changed_when: false + until: not (kubectl_get_node['stdout'] | from_json).spec.unschedulable is defined diff --git a/ansible/kubernetes/playbooks/files/stale-containers.service b/ansible/kubernetes/playbooks/files/stale-containers.service new file mode 100644 index 0000000..5136df2 --- /dev/null +++ b/ansible/kubernetes/playbooks/files/stale-containers.service @@ -0,0 +1,6 @@ +[Unit] +Description=Stale containers + +[Service] +Type=oneshot +ExecStart=/usr/local/bin/k3s crictl rmi --prune diff --git a/ansible/kubernetes/playbooks/files/stale-containers.timer b/ansible/kubernetes/playbooks/files/stale-containers.timer new file mode 100644 index 0000000..731885a --- /dev/null +++ b/ansible/kubernetes/playbooks/files/stale-containers.timer @@ -0,0 +1,11 @@ +[Unit] +Description=Stale containers + +[Timer] +OnCalendar=weekly +AccuracySec=1h +Persistent=true +RandomizedDelaySec=6000 + +[Install] +WantedBy=timers.target diff --git a/ansible/kubernetes/playbooks/tasks/cilium.yaml b/ansible/kubernetes/playbooks/tasks/cilium.yaml new file mode 100644 index 0000000..ca242bb --- /dev/null +++ b/ansible/kubernetes/playbooks/tasks/cilium.yaml @@ -0,0 +1,56 @@ +--- +- name: Cilium + block: + - name: Cilium | Check if Cilium HelmChart exists + kubernetes.core.k8s_info: + kubeconfig: /etc/rancher/k3s/k3s.yaml + name: cilium + kind: HelmChart + namespace: kube-system + register: cilium_helmchart + + - name: Cilium | Wait for Cilium to rollout + when: cilium_helmchart.resources | count > 0 + kubernetes.core.k8s_info: + kubeconfig: /etc/rancher/k3s/k3s.yaml + name: 
helm-install-cilium + kind: Job + namespace: kube-system + wait: true + wait_condition: + type: Complete + status: true + wait_timeout: 360 + + - name: Cilium | Patch the Cilium HelmChart to unmanage it + when: cilium_helmchart.resources | count > 0 + kubernetes.core.k8s_json_patch: + kubeconfig: /etc/rancher/k3s/k3s.yaml + name: cilium + kind: HelmChart + namespace: kube-system + patch: + - op: add + path: /metadata/annotations/helmcharts.helm.cattle.io~1unmanaged + value: "true" + + - name: Cilium | Delete the Cilium HelmChart CR + when: cilium_helmchart.resources | count > 0 + kubernetes.core.k8s: + kubeconfig: /etc/rancher/k3s/k3s.yaml + name: cilium + kind: HelmChart + namespace: kube-system + state: absent + + - name: Cilium | Force delete the Cilium HelmChart + when: cilium_helmchart.resources | count > 0 + kubernetes.core.k8s: + kubeconfig: /etc/rancher/k3s/k3s.yaml + name: cilium + kind: HelmChart + namespace: kube-system + state: patched + definition: + metadata: + finalizers: [] diff --git a/ansible/kubernetes/playbooks/tasks/coredns.yaml b/ansible/kubernetes/playbooks/tasks/coredns.yaml new file mode 100644 index 0000000..d18383a --- /dev/null +++ b/ansible/kubernetes/playbooks/tasks/coredns.yaml @@ -0,0 +1,56 @@ +--- +- name: Coredns + block: + - name: Coredns | Check if Coredns HelmChart exists + kubernetes.core.k8s_info: + kubeconfig: /etc/rancher/k3s/k3s.yaml + name: coredns + kind: HelmChart + namespace: kube-system + register: coredns_helmchart + + - name: Coredns | Wait for Coredns to rollout + when: coredns_helmchart.resources | count > 0 + kubernetes.core.k8s_info: + kubeconfig: /etc/rancher/k3s/k3s.yaml + name: helm-install-coredns + kind: Job + namespace: kube-system + wait: true + wait_condition: + type: Complete + status: true + wait_timeout: 360 + + - name: Coredns | Patch the Coredns HelmChart to unmanage it + when: coredns_helmchart.resources | count > 0 + kubernetes.core.k8s_json_patch: + kubeconfig: /etc/rancher/k3s/k3s.yaml + name: coredns + kind: HelmChart + namespace: kube-system + patch: + - op: add + path: /metadata/annotations/helmcharts.helm.cattle.io~1unmanaged + value: "true" + + - name: Coredns | Delete the Coredns HelmChart CR + when: coredns_helmchart.resources | count > 0 + kubernetes.core.k8s: + kubeconfig: /etc/rancher/k3s/k3s.yaml + name: coredns + kind: HelmChart + namespace: kube-system + state: absent + + - name: Coredns | Force delete the Coredns HelmChart + when: coredns_helmchart.resources | count > 0 + kubernetes.core.k8s: + kubeconfig: /etc/rancher/k3s/k3s.yaml + name: coredns + kind: HelmChart + namespace: kube-system + state: patched + definition: + metadata: + finalizers: [] diff --git a/ansible/kubernetes/playbooks/tasks/cruft.yaml b/ansible/kubernetes/playbooks/tasks/cruft.yaml new file mode 100644 index 0000000..66ae984 --- /dev/null +++ b/ansible/kubernetes/playbooks/tasks/cruft.yaml @@ -0,0 +1,32 @@ +--- +# https://github.com/k3s-io/k3s/issues/1971 +- name: Cruft + block: + - name: Cruft | Get list of custom manifests + ansible.builtin.find: + paths: "{{ k3s_server_manifests_dir }}" + file_type: file + use_regex: true + patterns: ["^custom-.*"] + register: custom_manifest + + - name: Cruft | Delete custom manifests + ansible.builtin.file: + path: "{{ item.path }}" + state: absent + loop: "{{ custom_manifest.files }}" + + - name: Cruft | Get list of custom addons + kubernetes.core.k8s_info: + kubeconfig: /etc/rancher/k3s/k3s.yaml + kind: Addon + register: addons_list + + - name: Cruft | Delete addons + kubernetes.core.k8s: + 
kubeconfig: /etc/rancher/k3s/k3s.yaml + name: "{{ item.metadata.name }}" + kind: Addon + namespace: kube-system + state: absent + loop: "{{ addons_list.resources | selectattr('metadata.name', 'match', '^custom-.*') | list }}" diff --git a/ansible/kubernetes/playbooks/tasks/stale_containers.yaml b/ansible/kubernetes/playbooks/tasks/stale_containers.yaml new file mode 100644 index 0000000..9857d6b --- /dev/null +++ b/ansible/kubernetes/playbooks/tasks/stale_containers.yaml @@ -0,0 +1,36 @@ +--- +# https://github.com/k3s-io/k3s/issues/1900 +- name: Enabled Stale containers + when: stale_containers_state == "enabled" + block: + - name: Stale containers | Create systemd unit + ansible.builtin.copy: + src: files/stale-containers.service + dest: /etc/systemd/system/stale-containers.service + owner: root + group: root + mode: "0644" + + - name: Stale containers | Create systemd timer + ansible.builtin.copy: + src: files/stale-containers.timer + dest: /etc/systemd/system/stale-containers.timer + owner: root + group: root + mode: "0644" + + - name: Stale containers | Start the systemd timer + ansible.builtin.systemd: + name: stale-containers.timer + enabled: true + daemon_reload: true + masked: false + state: started + +- name: Disable Stale containers + when: stale_containers_state == "disabled" + block: + - name: Stale containers | Mask the systemd timer + ansible.builtin.systemd: + name: stale-containers.timer + masked: true diff --git a/ansible/kubernetes/playbooks/templates/custom-cilium-helmchart.yaml.j2 b/ansible/kubernetes/playbooks/templates/custom-cilium-helmchart.yaml.j2 new file mode 100644 index 0000000..de6546d --- /dev/null +++ b/ansible/kubernetes/playbooks/templates/custom-cilium-helmchart.yaml.j2 @@ -0,0 +1,52 @@ +--- +# https://docs.k3s.io/helm +apiVersion: helm.cattle.io/v1 +kind: HelmChart +metadata: + name: cilium + namespace: kube-system +spec: + # renovate: datasource=helm + repo: https://helm.cilium.io/ + chart: cilium + version: 1.14.0 + targetNamespace: kube-system + bootstrap: true + valuesContent: |- + autoDirectNodeRoutes: true + bpf: + masquerade: true + bgp: + enabled: false + cluster: + name: kubernetes + id: 1 + containerRuntime: + integration: containerd + socketPath: /var/run/k3s/containerd/containerd.sock + endpointRoutes: + enabled: true + hubble: + enabled: false + ipam: + mode: kubernetes + ipv4NativeRoutingCIDR: "{{ k3s_server['cluster-cidr'] }}" + k8sServiceHost: "{{ k3s_registration_address }}" + k8sServicePort: 6443 + kubeProxyReplacement: strict + kubeProxyReplacementHealthzBindAddr: 0.0.0.0:10256 + l2announcements: + enabled: true + leaseDuration: 120s + leaseRenewDeadline: 60s + leaseRetryPeriod: 1s + loadBalancer: + algorithm: maglev + mode: dsr + localRedirectPolicy: true + operator: + rollOutPods: true + rollOutCiliumPods: true + securityContext: + privileged: true + tunnel: disabled diff --git a/ansible/kubernetes/playbooks/templates/custom-cilium-l2.yaml.j2 b/ansible/kubernetes/playbooks/templates/custom-cilium-l2.yaml.j2 new file mode 100644 index 0000000..7b96246 --- /dev/null +++ b/ansible/kubernetes/playbooks/templates/custom-cilium-l2.yaml.j2 @@ -0,0 +1,21 @@ +--- +# https://docs.cilium.io/en/latest/network/l2-announcements +apiVersion: cilium.io/v2alpha1 +kind: CiliumL2AnnouncementPolicy +metadata: + name: policy +spec: + loadBalancerIPs: true + interfaces: + - ^enp.* + nodeSelector: + matchLabels: + kubernetes.io/os: linux +--- +apiVersion: cilium.io/v2alpha1 +kind: CiliumLoadBalancerIPPool +metadata: + name: pool +spec: + cidrs: + - cidr: 
"{{ (ansible_default_ipv4.network + '/' + ansible_default_ipv4.netmask) | ansible.utils.ipaddr('network/prefix') }}" diff --git a/ansible/kubernetes/playbooks/templates/custom-coredns-helmchart.yaml.j2 b/ansible/kubernetes/playbooks/templates/custom-coredns-helmchart.yaml.j2 new file mode 100644 index 0000000..d0b3ce1 --- /dev/null +++ b/ansible/kubernetes/playbooks/templates/custom-coredns-helmchart.yaml.j2 @@ -0,0 +1,77 @@ +--- +# https://docs.k3s.io/helm +apiVersion: helm.cattle.io/v1 +kind: HelmChart +metadata: + name: coredns + namespace: kube-system +spec: + # renovate: datasource=helm + repo: https://coredns.github.io/helm + chart: coredns + version: 1.24.5 + targetNamespace: kube-system + bootstrap: true + valuesContent: |- + fullnameOverride: coredns + replicaCount: 2 + k8sAppLabelOverride: kube-dns + service: + name: kube-dns + clusterIP: {{ k3s_server['service-cidr'] | ansible.utils.nthhost(10) }} + serviceAccount: + create: true + deployment: + annotations: + reloader.stakater.com/auto: "true" + servers: + - zones: + - zone: . + scheme: dns:// + use_tcp: true + port: 53 + plugins: + - name: log + - name: errors + - name: health + configBlock: |- + lameduck 5s + - name: ready + - name: kubernetes + parameters: cluster.local in-addr.arpa ip6.arpa + configBlock: |- + pods insecure + fallthrough in-addr.arpa ip6.arpa + ttl 30 + - name: prometheus + parameters: 0.0.0.0:9153 + - name: forward + parameters: . /etc/resolv.conf + - name: cache + parameters: 30 + - name: loop + - name: reload + - name: loadbalance + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists + tolerations: + - key: CriticalAddonsOnly + operator: Exists + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: DoNotSchedule + labelSelector: + matchLabels: + app.kubernetes.io/instance: coredns diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2bafe8b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +ansible==8.2.0 +ansible-lint==6.17.2 +bcrypt==4.0.1 +jmespath==1.0.1 +netaddr==0.8.0 +openshift==0.13.2 +passlib==1.7.4 diff --git a/requirements.yaml b/requirements.yaml new file mode 100644 index 0000000..91640b2 --- /dev/null +++ b/requirements.yaml @@ -0,0 +1,17 @@ +--- +collections: + - name: ansible.posix + version: 1.5.4 + - name: ansible.utils + version: 2.10.3 + - name: community.general + version: 7.2.1 + - name: community.sops + version: 1.6.4 + - name: kubernetes.core + version: 2.4.0 + - name: onepassword.connect + version: 2.2.1 +roles: + - name: xanmanning.k3s + version: v3.4.2