First edition that installs the cluster, but the cluster does not get healthy

This commit is contained in:
Marcel Straub
2025-09-03 16:42:08 +02:00
parent c83ca025ae
commit c3d6312cc5
25 changed files with 1088 additions and 2 deletions

9
.gitignore vendored
View File

@@ -1,2 +1,9 @@
# EnvRC files contain secrets that must not be checked in
.envrc
# OpenTofu working files
.terraform
output/
*.tfstate
*.tfstate.*
**/*.tfplan

18
.yamllint.yaml Normal file
View File

@@ -0,0 +1,18 @@
extends: default

yaml-files:
  - '*.yaml'
  - '*.yml'

ignore: |
  README.md
  LICENSE
  secret**.yaml

rules:
  document-start: { present: false }
  brackets:
    min-spaces-inside: 0
    max-spaces-inside: 10
  line-length:
    allow-non-breakable-inline-mappings: true

View File

@@ -0,0 +1,14 @@
#!/usr/bin/bash
# Quick overview of Talos cluster state: members, etcd status and overall health.
NODES="-n 10.51.10.101 -n 10.51.10.102 -n 10.51.10.103"
ENDPOINTS="--endpoints 10.51.10.100"
TALOSCONFIG="--talosconfig=output/talos-config.yaml"
PARAMETERS="${NODES} ${ENDPOINTS} ${TALOSCONFIG}"

echo " === Talos Cluster Members ==="
talosctl ${PARAMETERS} get members
echo
echo " === etcd status ==="
talosctl ${PARAMETERS} etcd status
echo
echo " === Cluster Health ==="
# Health checks are run against a single control plane node.
talosctl ${ENDPOINTS} ${TALOSCONFIG} -n 10.51.10.101 health

View File

@@ -0,0 +1,13 @@
# 00-infrastructure/01-talos-vms/main.tf
module "talos" {
source = "./talos"
providers = {
proxmox = proxmox
}
image = var.talos_image
cluster = var.talos_cluster_config
nodes = var.talos_nodes
}

View File

@@ -0,0 +1,58 @@
resource "local_file" "talos_machine_secrets" {
content = yamlencode({
cluster = module.talos.machine_secrets.cluster
secrets = module.talos.machine_secrets.secrets
trustdinfo = module.talos.machine_secrets.trustdinfo
certs = {
etcd = {
crt = module.talos.machine_secrets.certs.etcd.cert
key = module.talos.machine_secrets.certs.etcd.key
}
k8s = {
crt = module.talos.machine_secrets.certs.k8s.cert
key = module.talos.machine_secrets.certs.k8s.key
}
k8saggregator = {
crt = module.talos.machine_secrets.certs.k8s_aggregator.cert
key = module.talos.machine_secrets.certs.k8s_aggregator.key
}
k8sserviceaccount = {
key = module.talos.machine_secrets.certs.k8s_serviceaccount.key
}
os = {
crt = module.talos.machine_secrets.certs.os.cert
key = module.talos.machine_secrets.certs.os.key
}
}
})
filename = "output/talos-machine-secrets.yaml"
}
resource "local_file" "talos_machine_configs" {
for_each = module.talos.machine_config
content = each.value.machine_configuration
filename = "output/talos-machine-config-${each.key}.yaml"
file_permission = "0600"
}
resource "local_file" "talos_config" {
content = module.talos.client_configuration.talos_config
filename = "output/talos-config.yaml"
file_permission = "0600"
}
# resource "local_file" "kube_config" {
# content = module.talos.kube_config.kubeconfig_raw
# filename = "output/kube-config.yaml"
# file_permission = "0600"
# }
# output "kube_config" {
# value = module.talos.kube_config.kubeconfig_raw
# sensitive = true
# }
output "talos_config" {
value = module.talos.client_configuration.talos_config
sensitive = true
}

View File

@@ -0,0 +1,30 @@
terraform {
required_providers {
kubernetes = {
source = "hashicorp/kubernetes"
version = "2.38.0"
}
proxmox = {
source = "bpg/proxmox"
version = "0.81.0"
}
talos = {
source = "siderolabs/talos"
version = "0.8.1"
}
restapi = {
source = "Mastercard/restapi"
version = "2.0.1"
}
}
}
provider "proxmox" {
endpoint = var.proxmox.endpoint
insecure = var.proxmox.insecure
ssh {
agent = true
username = var.proxmox.username_ssh
}
}

View File

@@ -0,0 +1,6 @@
proxmox = {
name = "pve01"
cluster_name = "homelab"
endpoint = "https://pve01.straubintra.net:8006"
insecure = true
}

View File

@@ -0,0 +1,127 @@
locals {
first_control_plane_node_ip = [for k, v in var.nodes : v.ip if v.machine_type == "controlplane"][0]
kubernetes_endpoint = coalesce(var.cluster.vip, local.first_control_plane_node_ip)
extra_manifests = concat(var.cluster.extra_manifests, [
"https://github.com/kubernetes-sigs/gateway-api/releases/download/${var.cluster.gateway_api_version}/standard-install.yaml",
"https://raw.githubusercontent.com/kubernetes-sigs/gateway-api/${var.cluster.gateway_api_version}/config/crd/experimental/gateway.networking.k8s.io_tlsroutes.yaml",
"https://raw.githubusercontent.com/prometheus-community/helm-charts/refs/heads/main/charts/kube-prometheus-stack/charts/crds/crds/crd-servicemonitors.yaml"
])
}
resource "talos_machine_secrets" "this" {
// Changing talos_version causes trouble as new certs are created
}
data "talos_client_configuration" "this" {
cluster_name = var.cluster.name
client_configuration = talos_machine_secrets.this.client_configuration
nodes = [for k, v in var.nodes : v.ip]
endpoints = [for k, v in var.nodes : v.ip if v.machine_type == "controlplane"]
}
resource "terraform_data" "cilium_bootstrap_inline_manifests" {
input = [
{
name = "cilium-bootstrap"
contents = file("${path.root}/${var.cluster.cilium.bootstrap_manifest_path}")
},
{
name = "cilium-values"
contents = yamlencode({
apiVersion = "v1"
kind = "ConfigMap"
metadata = {
name = "cilium-values"
namespace = "kube-system"
}
data = {
"values.yaml" = file("${path.root}/${var.cluster.cilium.values_file_path}")
}
})
}
]
}
data "talos_machine_configuration" "this" {
for_each = var.nodes
cluster_name = var.cluster.name
# This is the Kubernetes API Server endpoint.
# ref - https://www.talos.dev/latest/introduction/prodnotes/#decide-the-kubernetes-endpoint
cluster_endpoint = "https://${local.kubernetes_endpoint}:6443"
# @formatter:off
talos_version = var.cluster.talos_machine_config_version != null ? var.cluster.talos_machine_config_version : (each.value.update == true ? var.image.update_version : var.image.version)
# @formatter:on
machine_type = each.value.machine_type
machine_secrets = talos_machine_secrets.this.machine_secrets
config_patches = [
templatefile("${path.module}/machine-config/common.yaml.tftpl", {
node_name = each.value.host_node
cluster_name = var.cluster.proxmox_cluster
kubernetes_version = var.cluster.kubernetes_version
http_proxy = var.cluster.http_proxy
no_proxy = var.cluster.no_proxy
ntp_servers = var.cluster.ntp_servers
hostname = each.key
kubelet = var.cluster.kubelet
}), each.value.machine_type == "controlplane" ?
templatefile("${path.module}/machine-config/control_plane.yaml.tftpl", {
mac_address = lower(each.value.mac_address)
vip = var.cluster.vip
extra_manifests = jsonencode(local.extra_manifests)
api_server = var.cluster.api_server
inline_manifests = jsonencode(terraform_data.cilium_bootstrap_inline_manifests.output)
}) :
templatefile("${path.module}/machine-config/worker.yaml.tftpl", {
mac_address = lower(each.value.mac_address)
})
]
}
resource "talos_machine_configuration_apply" "this" {
depends_on = [proxmox_virtual_environment_vm.this]
for_each = var.nodes
node = each.value.ip
client_configuration = talos_machine_secrets.this.client_configuration
machine_configuration_input = data.talos_machine_configuration.this[each.key].machine_configuration
lifecycle {
# re-run config apply if vm changes
replace_triggered_by = [proxmox_virtual_environment_vm.this[each.key]]
}
}
resource "talos_machine_bootstrap" "this" {
depends_on = [talos_machine_configuration_apply.this]
# Bootstrap with the first control plane node.
# VIP not yet available at this stage, so can't use var.cluster.vip
# ref - https://www.talos.dev/v1.9/talos-guides/network/vip/#caveats
node = local.first_control_plane_node_ip
client_configuration = talos_machine_secrets.this.client_configuration
}
data "talos_cluster_health" "this" {
depends_on = [
talos_machine_configuration_apply.this,
talos_machine_bootstrap.this
]
skip_kubernetes_checks = false
client_configuration = data.talos_client_configuration.this.client_configuration
control_plane_nodes = [for k, v in var.nodes : v.ip if v.machine_type == "controlplane"]
worker_nodes = [for k, v in var.nodes : v.ip if v.machine_type == "worker"]
endpoints = data.talos_client_configuration.this.endpoints
timeouts = {
read = "10m"
}
}
resource "talos_cluster_kubeconfig" "this" {
depends_on = [
talos_machine_bootstrap.this,
data.talos_cluster_health.this
]
# The kubeconfig endpoint will be populated from the talos_machine_configuration cluster_endpoint
node = local.first_control_plane_node_ip
client_configuration = talos_machine_secrets.this.client_configuration
timeouts = {
read = "1m"
}
}

View File

@@ -0,0 +1,63 @@
locals {
version = var.image.version
schematic = file("${path.root}/${var.image.schematic_path}")
schematic_id = jsondecode(data.http.schematic_id.response_body)["id"]
update_version = coalesce(var.image.update_version, var.image.version)
update_schematic_path = coalesce(var.image.update_schematic_path, var.image.schematic_path)
update_schematic = file("${path.root}/${local.update_schematic_path}")
update_schematic_id = jsondecode(data.http.updated_schematic_id.response_body)["id"]
image_id = "${local.schematic_id}_${local.version}"
update_image_id = "${local.update_schematic_id}_${local.update_version}"
# Comment the above 2 lines and un-comment the below 2 lines to use the provider schematic ID instead of the HTTP one
# ref - https://github.com/vehagn/homelab/issues/106
# image_id = "${talos_image_factory_schematic.this.id}_${local.version}"
# update_image_id = "${talos_image_factory_schematic.updated.id}_${local.update_version}"
}
data "http" "schematic_id" {
url = "${var.image.factory_url}/schematics"
method = "POST"
request_body = local.schematic
}
data "http" "updated_schematic_id" {
url = "${var.image.factory_url}/schematics"
method = "POST"
request_body = local.update_schematic
}
resource "talos_image_factory_schematic" "this" {
schematic = local.schematic
}
resource "talos_image_factory_schematic" "updated" {
schematic = local.update_schematic
}
# Note the ellipsis (...) after the for expression: it groups values that share the same key into a list.
# The key is deliberately built from the values themselves (image_id contains both the schematic ID and the version),
# so all values under a key are identical and we can simply take the first element of the list.
# Improvements are welcome!
resource "proxmox_virtual_environment_download_file" "this" {
for_each = {
for k, v in var.nodes :
"${v.host_node}_${v.update == true ? local.update_image_id : local.image_id}" => {
host_node = v.host_node
schematic = v.update == true ? talos_image_factory_schematic.updated.id : talos_image_factory_schematic.this.id
version = v.update == true ? local.update_version : local.version
}...
}
node_name = each.value[0].host_node
content_type = "iso"
datastore_id = var.image.proxmox_datastore
file_name = "talos-${each.value[0].schematic}-${each.value[0].version}-${var.image.platform}-${var.image.arch}.img"
url = "${var.image.factory_url}/image/${each.value[0].schematic}/${each.value[0].version}/${var.image.platform}-${var.image.arch}.raw.gz"
decompression_algorithm = "gz"
overwrite = false
overwrite_unmanaged = true
}

View File

@@ -0,0 +1,6 @@
customization:
  systemExtensions:
    officialExtensions:
      - siderolabs/amd-ucode
      - siderolabs/intel-ucode
      - siderolabs/qemu-guest-agent

View File

@@ -0,0 +1,86 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: cilium-install
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
  - kind: ServiceAccount
    name: cilium-install
    namespace: kube-system
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: cilium-install
  namespace: kube-system
---
apiVersion: batch/v1
kind: Job
metadata:
  name: cilium-install
  namespace: kube-system
spec:
  backoffLimit: 10
  template:
    metadata:
      labels:
        app: cilium-install
    spec:
      restartPolicy: OnFailure
      tolerations:
        - operator: Exists
        - effect: NoSchedule
          operator: Exists
        - effect: NoExecute
          operator: Exists
        - effect: PreferNoSchedule
          operator: Exists
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoExecute
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: PreferNoSchedule
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node-role.kubernetes.io/control-plane
                    operator: Exists
      serviceAccountName: cilium-install
      hostNetwork: true
      containers:
        - name: cilium-install
          image: quay.io/cilium/cilium-cli:v0.18.6 # renovate: github-releases=cilium/cilium-cli
          env:
            - name: KUBERNETES_SERVICE_HOST
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: status.podIP
            - name: KUBERNETES_SERVICE_PORT
              value: "6443"
          volumeMounts:
            - name: values
              mountPath: /root/app/values.yaml
              subPath: values.yaml
          command:
            - cilium
            - install
            - --version=v1.18.0 # renovate: github-releases=cilium/cilium
            - --set
            - kubeProxyReplacement=true
            - --values
            - values.yaml
      volumes:
        - name: values
          configMap:
            name: cilium-values

View File

@@ -0,0 +1,67 @@
machine:
  registries:
    mirrors:
      docker.io:
        endpoints:
          - http://harbor.prod.eis-mk8.de.s5b.org/v2/proxy-docker.io
        overridePath: true
      ghcr.io:
        endpoints:
          - http://harbor.prod.eis-mk8.de.s5b.org/v2/proxy-ghcr.io
        overridePath: true
      gcr.io:
        endpoints:
          - http://harbor.prod.eis-mk8.de.s5b.org/v2/proxy-gcr.io
        overridePath: true
      registry.k8s.io:
        endpoints:
          - http://harbor.prod.eis-mk8.de.s5b.org/v2/proxy-registry.k8s.io
        overridePath: true
  kubelet:
    image: ghcr.io/siderolabs/kubelet:${kubernetes_version}
    %{if kubelet != ""}
    ${indent(4, kubelet)}
    %{endif}
    nodeLabels:
      topology.kubernetes.io/region: ${cluster_name}
      topology.kubernetes.io/zone: ${node_name}
  network:
    hostname: ${hostname}
  %{if length(ntp_servers) > 0 }
  time:
    servers:
    %{ for addr in ntp_servers ~}
      - ${addr}
    %{ endfor ~}
  %{endif}
  sysctls:
    fs.inotify.max_user_watches: 1048576    # Watchdog
    fs.inotify.max_user_instances: 8192     # Watchdog
    net.core.default_qdisc: fq              # 10Gb/s
    net.core.rmem_max: 67108864             # 10Gb/s | Cloudflared / QUIC
    net.core.wmem_max: 67108864             # 10Gb/s | Cloudflared / QUIC
    net.ipv4.tcp_congestion_control: bbr    # 10Gb/s
    net.ipv4.tcp_fastopen: 3                # Send and accept data in the opening SYN packet
    net.ipv4.tcp_mtu_probing: 1             # 10Gb/s | Jumbo frames
    net.ipv4.tcp_rmem: 4096 87380 33554432  # 10Gb/s
    net.ipv4.tcp_wmem: 4096 65536 33554432  # 10Gb/s
    net.ipv4.tcp_window_scaling: 1          # 10Gb/s
    vm.nr_hugepages: 1024                   # PostgreSQL
  %{if http_proxy != ""}
  env:
    http_proxy: ${http_proxy}
    https_proxy: ${http_proxy}
    %{if no_proxy != ""}
    no_proxy: ${no_proxy}
    %{endif}
  %{endif}
cluster:
  apiServer:
    image: registry.k8s.io/kube-apiserver:${kubernetes_version}
  controllerManager:
    image: registry.k8s.io/kube-controller-manager:${kubernetes_version}
  proxy:
    image: registry.k8s.io/kube-proxy:${kubernetes_version}
  scheduler:
    image: registry.k8s.io/kube-scheduler:${kubernetes_version}

View File

@@ -0,0 +1,39 @@
# https://www.talos.dev/v1.10/reference/configuration/v1alpha1/config/
machine:
network:
interfaces:
- deviceSelector:
hardwareAddr: ${mac_address}
dhcp: true
%{ if vip != null }
vip:
ip: ${vip}
%{ endif }
cluster:
allowSchedulingOnControlPlanes: true
%{if api_server != ""}
apiServer:
${indent(4, api_server)}
%{endif}
controllerManager:
extraArgs:
bind-address: 0.0.0.0
etcd:
extraArgs:
listen-metrics-urls: http://0.0.0.0:2381
scheduler:
extraArgs:
bind-address: 0.0.0.0
network:
cni:
name: none
proxy:
disabled: true
discovery:
enabled: true
registries:
service:
disabled: false
extraManifests: ${extra_manifests}
inlineManifests: ${inline_manifests}

View File

@@ -0,0 +1,6 @@
machine:
  network:
    interfaces:
      - deviceSelector:
          hardwareAddr: ${mac_address}
        dhcp: true

View File

@@ -0,0 +1,18 @@
output "machine_secrets" {
value = talos_machine_secrets.this.machine_secrets
sensitive = true
}
output "machine_config" {
value = data.talos_machine_configuration.this
}
output "client_configuration" {
value = data.talos_client_configuration.this
sensitive = true
}
# output "kube_config" {
# value = talos_cluster_kubeconfig.this
# sensitive = true
# }

View File

@@ -0,0 +1,16 @@
terraform {
required_providers {
proxmox = {
source = "bpg/proxmox"
version = ">=0.81.0"
}
talos = {
source = "siderolabs/talos"
version = ">=0.8.1"
}
http = {
source = "hashicorp/http"
version = ">=3.4.5"
}
}
}

View File

@@ -0,0 +1,55 @@
variable "image" {
description = "Talos image configuration"
type = object({
factory_url = optional(string, "https://factory.talos.dev")
schematic_path = string
version = string
update_schematic_path = optional(string)
update_version = optional(string)
arch = optional(string, "amd64")
platform = optional(string, "nocloud")
proxmox_datastore = optional(string, "local")
})
}
variable "cluster" {
description = "Cluster configuration"
type = object({
name = string
vip = optional(string)
subnet_mask = optional(string, "24")
talos_machine_config_version = optional(string)
proxmox_cluster = string
kubernetes_version = string
gateway_api_version = string
node_network_vlan = optional(number)
ntp_servers = optional(list(string), [])
http_proxy = optional(string, "")
no_proxy = optional(string, "")
extra_manifests = optional(list(string))
kubelet = optional(string)
api_server = optional(string, "")
cilium = object({
bootstrap_manifest_path = string
values_file_path = string
})
})
}
variable "nodes" {
description = "Configuration for cluster nodes"
type = map(object({
host_node = string
machine_type = string
datastore_id = optional(string, "ZFS")
ip = string
dns = optional(list(string))
mac_address = string
vm_id = number
cpu = number
ram_dedicated = number
system_disk_size = optional(number, 60)
update = optional(bool, false)
igpu = optional(bool, false)
}))
}

View File

@@ -0,0 +1,84 @@
resource "proxmox_virtual_environment_vm" "this" {
for_each = var.nodes
node_name = each.value.host_node
name = each.key
description = each.value.machine_type == "controlplane" ? "Talos Control Plane" : "Talos Worker"
tags = each.value.machine_type == "controlplane" ? ["k8s", "control-plane"] : ["k8s", "worker"]
on_boot = true
vm_id = each.value.vm_id
machine = "q35"
scsi_hardware = "virtio-scsi-single"
bios = "seabios"
agent {
enabled = true
}
cpu {
cores = each.value.cpu
type = "host"
}
memory {
dedicated = each.value.ram_dedicated
}
network_device {
bridge = "vmbr0"
mac_address = each.value.mac_address
vlan_id = var.cluster.node_network_vlan
}
disk {
datastore_id = each.value.datastore_id
interface = "scsi0"
iothread = true
cache = "writethrough"
discard = "on"
ssd = true
file_format = "raw"
size = each.value.system_disk_size
file_id = proxmox_virtual_environment_download_file.this["${each.value.host_node}_${each.value.update == true ? local.update_image_id : local.image_id}"].id
}
boot_order = ["scsi0"]
operating_system {
type = "l26" # Linux Kernel 2.6 - 6.X.
}
# We use DHCP with static mappings --> Not needed
# initialization {
# datastore_id = each.value.datastore_id
# # Optional DNS Block. Update Nodes with a list value to use.
# dynamic "dns" {
# for_each = try(each.value.dns, null) != null ? { "enabled" = each.value.dns } : {}
# content {
# servers = each.value.dns
# }
# }
# ip_config {
# ipv4 {
# address = "${each.value.ip}/${var.cluster.subnet_mask}"
# gateway = var.cluster.gateway
# }
# }
# }
dynamic "hostpci" {
for_each = each.value.igpu ? [1] : []
content {
# Passthrough iGPU
device = "hostpci0"
mapping = "iGPU"
pcie = true
rombar = true
xvga = false
}
}
}

View File

@@ -0,0 +1,19 @@
talos_cluster_config = {
name = "eismk8-prod"
proxmox_cluster = "homelab"
node_network_vlan = 210
ntp_servers = [
"2a13:fc80:1:f000::1"
]
# http_proxy = "http://100.64.0.1:3128"
# no_proxy = "10.0.0.0/8"
vip = "10.51.10.100"
kubernetes_version = "v1.33.3"
gateway_api_version = "v1.3.0"
cilium = {
bootstrap_manifest_path = "talos/inline-manifests/cilium_install.yaml"
values_file_path = "../../02-k8s/infra/network/cilium/values.yaml"
}
}

View File

@@ -0,0 +1,6 @@
talos_image = {
version = "v1.10.7"
update_version = "v1.10.7"
schematic_path = "talos/image/schematic.yaml"
#update_schematic_path = "talos/image/schematic.yaml"
}

View File

@@ -0,0 +1,58 @@
talos_nodes = {
# Controller Nodes
"ctrl-01" = {
host_node = "pve01"
machine_type = "controlplane"
ip = "10.51.10.101"
mac_address = "BC:24:11:7B:76:3E"
vm_id = 301
cpu = 1
ram_dedicated = 4096
}
"ctrl-02" = {
host_node = "pve02"
machine_type = "controlplane"
ip = "10.51.10.102"
mac_address = "BC:24:11:16:85:7D"
vm_id = 302
cpu = 1
ram_dedicated = 4096
}
"ctrl-03" = {
host_node = "pve-oberon"
machine_type = "controlplane"
ip = "10.51.10.103"
mac_address = "BC:24:11:B8:B6:6F"
vm_id = 303
cpu = 1
ram_dedicated = 4096
}
# Worker Nodes
"worker-01" = {
host_node = "pve01"
machine_type = "worker"
ip = "10.51.11.1"
mac_address = "BC:24:11:E1:E9:AE"
vm_id = 311
cpu = 4
ram_dedicated = 8192
}
"worker-02" = {
host_node = "pve02"
machine_type = "worker"
ip = "10.51.11.2"
mac_address = "BC:24:11:63:3A:85"
vm_id = 312
cpu = 4
ram_dedicated = 8192
}
"worker-03" = {
host_node = "pve-oberon"
machine_type = "worker"
ip = "10.51.11.3"
mac_address = "BC:24:11:8E:75:0E"
vm_id = 313
cpu = 4
ram_dedicated = 8192
}
}

View File

@@ -0,0 +1,78 @@
variable "proxmox" {
description = "Proxmox provider configuration"
type = object({
name = string
cluster_name = string
endpoint = string
insecure = bool
username_ssh = optional(string, "root")
})
}
# variable "proxmox_api_token" {
# description = "API token for Proxmox"
# type = string
# sensitive = true
# }
variable "talos_image" {
description = "Talos image configuration"
type = object({
factory_url = optional(string, "https://factory.talos.dev")
version = string
schematic_path = string
update_version = optional(string)
update_schematic_path = optional(string)
arch = optional(string, "amd64")
platform = optional(string, "nocloud")
proxmox_datastore = optional(string, "local")
})
}
variable "talos_cluster_config" {
description = "Talos cluster configuration"
type = object({
name = string
vip = optional(string)
subnet_mask = optional(string, "24")
talos_machine_config_version = optional(string)
proxmox_cluster = string
kubernetes_version = string
gateway_api_version = string
node_network_vlan = optional(number)
http_proxy = optional(string)
no_proxy = optional(string)
extra_manifests = optional(list(string), [])
kubelet = optional(string, "")
api_server = optional(string)
cilium = object({
bootstrap_manifest_path = string
values_file_path = string
})
})
}
variable "talos_nodes" {
type = map(
object({
host_node = string
machine_type = string
ip = string
dns = optional(list(string))
mac_address = string
vm_id = number
cpu = number
ram_dedicated = number
system_disk_size = optional(number, 60)
update = optional(bool, false)
igpu = optional(bool, false)
})
)
validation {
// @formatter:off
condition = length([for n in var.talos_nodes : n if contains(["controlplane", "worker"], n.machine_type)]) == length(var.talos_nodes)
error_message = "Node machine_type must be either 'controlplane' or 'worker'."
// @formatter:on
}
}

View File

@@ -6,6 +6,42 @@ Here you find everything to set up the VM infrastructure for the Talos cluster.
1. [Ansible Notebooks for managing Proxmox hosts](./00-ansible-pve-hosts/README.md)
## Get started
### Set up Proxmox Cluster
TBD. For now, this is expected to have been done already; an example of the corresponding ``pveum`` commands follows the list below:
- Create a Terraform user
- Assign a role to it
- Create an API token
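The exact commands depend on your Proxmox setup. The role name ``TerraformProv``, the user ``terraform@pve``, the token id ``tofu`` and the privilege list below are illustrative assumptions, not values taken from this repository:
```shell
# Illustrative only - adjust role name, privileges, user and token id to your environment.
pveum role add TerraformProv -privs "Datastore.Allocate Datastore.AllocateSpace Datastore.Audit Sys.Audit Sys.Console Sys.Modify VM.Allocate VM.Audit VM.Clone VM.Config.CDROM VM.Config.CPU VM.Config.Cloudinit VM.Config.Disk VM.Config.HWType VM.Config.Memory VM.Config.Network VM.Config.Options VM.Migrate VM.Monitor VM.PowerMgmt SDN.Use"
pveum user add terraform@pve
pveum aclmod / -user terraform@pve -role TerraformProv
pveum user token add terraform@pve tofu --privsep=0  # prints the token secret once; store it safely
```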
### Install OpenTofu
```shell
sudo apt-get update
sudo apt-get install -y apt-transport-https ca-certificates curl gnupg
sudo install -m 0755 -d /etc/apt/keyrings
curl -fsSL https://get.opentofu.org/opentofu.gpg | sudo tee /etc/apt/keyrings/opentofu.gpg >/dev/null
curl -fsSL https://packages.opentofu.org/opentofu/tofu/gpgkey | sudo gpg --no-tty --batch --dearmor -o /etc/apt/keyrings/opentofu-repo.gpg >/dev/null
sudo chmod a+r /etc/apt/keyrings/opentofu.gpg
echo \
"deb [signed-by=/etc/apt/keyrings/opentofu.gpg,/etc/apt/keyrings/opentofu-repo.gpg] https://packages.opentofu.org/opentofu/tofu/any/ any main
deb-src [signed-by=/etc/apt/keyrings/opentofu.gpg,/etc/apt/keyrings/opentofu-repo.gpg] https://packages.opentofu.org/opentofu/tofu/any/ any main" | \
sudo tee /etc/apt/sources.list.d/opentofu.list > /dev/null
```
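With the repository configured as above, the package index still needs a refresh before the ``tofu`` package can be installed. A short completion of that path (assuming the repository-based route):
```shell
sudo apt-get update
sudo apt-get install -y tofu
tofu version   # sanity check
```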
Alternatively, install a specific release directly from the GitHub ``.deb`` package:
```shell
wget -O /tmp/tofu.deb https://github.com/opentofu/opentofu/releases/download/v1.10.5/tofu_1.10.5_amd64.deb
sudo dpkg -i /tmp/tofu.deb
rm /tmp/tofu.deb
sudo apt install -f
```
## Literature
- [Talos Kubernetes on Proxmox using OpenTofu](https://blog.stonegarden.dev/articles/2024/08/talos-proxmox-tofu/)
- [Talos on Proxmox with Terraform (multiple node pools)](https://github.com/sergelogvinov/terraform-talos/tree/main/proxmox)
- [Infrastructure Automation: Provisioning VMs on Proxmox with Packer, OpenTOFU, GitLab, Vault, and Minio.](https://medium.com/@avishkumar27/infrastructure-automation-provisioning-vms-on-proxmox-with-packer-opentofu-gitlab-vault-and-27fda7d73771)

View File

@@ -0,0 +1,152 @@
# https://github.com/cilium/cilium/blob/main/install/kubernetes/cilium/values.yaml
cluster:
  name: talos
  id: 1

kubeProxyReplacement: true

# Talos specific
k8sServiceHost: localhost
k8sServicePort: 7445

securityContext:
  capabilities:
    ciliumAgent: [ CHOWN, KILL, NET_ADMIN, NET_RAW, IPC_LOCK, SYS_ADMIN, SYS_RESOURCE, DAC_OVERRIDE, FOWNER, SETGID, SETUID ]
    cleanCiliumState: [ NET_ADMIN, SYS_ADMIN, SYS_RESOURCE ]

cgroup:
  autoMount:
    enabled: false
  hostRoot: /sys/fs/cgroup

# https://www.talos.dev/latest/talos-guides/network/host-dns/#forwarding-kube-dns-to-host-dns
# https://docs.cilium.io/en/stable/operations/performance/tuning/#ebpf-host-routing
bpf:
  hostLegacyRouting: true

# https://docs.cilium.io/en/stable/network/concepts/ipam/
ipam:
  mode: kubernetes
  multiPoolPreAllocation: ""

operator:
  rollOutPods: true
  prometheus:
    metricsService: true
    enabled: true
    port: 9963
    serviceMonitor:
      enabled: true
  dashboards:
    enabled: true
  resources:
    limits:
      cpu: 500m
      memory: 256Mi
    requests:
      cpu: 50m
      memory: 128Mi

# Roll out cilium agent pods automatically when ConfigMap is updated.
rollOutCiliumPods: true

resources:
  limits:
    cpu: 1000m
    memory: 1Gi
  requests:
    cpu: 200m
    memory: 512Mi

#debug:
#  enabled: true

# Increase rate limit when doing L2 announcements
k8sClientRateLimit:
  qps: 20
  burst: 100

l2announcements:
  enabled: true

externalIPs:
  enabled: true

loadBalancer:
  # https://docs.cilium.io/en/stable/network/kubernetes/kubeproxy-free/#maglev-consistent-hashing
  algorithm: maglev

gatewayAPI:
  enabled: true
  # enableAlpn: true
  # enableAppProtocol: true

envoy:
  prometheus:
    enabled: true
    port: "9964"
    serviceMonitor:
      enabled: true
  securityContext:
    capabilities:
      keepCapNetBindService: true
      envoy: [ NET_ADMIN, PERFMON, BPF ]

hubble:
  enabled: true
  metrics:
    enabled:
      - dns
      - drop
      - tcp
      - flow
      - port-distribution
      - icmp
      - "httpV2:exemplars=true;labelsContext=source_ip,source_namespace,source_workload,destination_ip,destination_namespace,destination_workload,traffic_direction;sourceContext=workload-name|reserved-identity;destinationContext=workload-name|reserved-identity"
    enableOpenMetrics: true
    port: 9965
    serviceMonitor:
      enabled: true
    dashboards:
      enabled: true
  relay:
    enabled: true
    rollOutPods: true
    prometheus:
      enabled: true
      port: 9966
      serviceMonitor:
        enabled: true
  ui:
    enabled: true
    rollOutPods: true

ingressController: { enabled: false }

clustermesh:
  apiserver:
    metrics:
      enabled: true
      port: 9962
      serviceMonitor:
        enabled: true

# mTLS
authentication:
  enabled: false
  mutual:
    spire:
      enabled: false
      install:
        server:
          dataStorage:
            storageClass: cilium-spire-sc

prometheus:
  metricsService: true
  enabled: true
  port: 9962
  serviceMonitor:
    enabled: true
    trustCRDsExist: true

dashboards:
  enabled: true

View File

@@ -4,4 +4,28 @@
1. [Infrastructure](./00-infrastructure/README.md)
## Architecture
### Network
- IPv4 configuration uses DHCP with static leases bound to each VM's MAC address for easy bring-up (an illustrative reservation is sketched after the table)
- IPv6 addresses are manually assigned
- DNS Zone: prod.k8.eis-mk8.de.s5b.org

|VLAN|IPv4|IPv6|
|--|--|--|
|210|10.51.10.0/23|2a13:fc80:1:a::/64|
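For illustration, a static reservation for ``ctrl-01`` (MAC and IP taken from ``talos_nodes``) could look like the following; dnsmasq is only an assumption here, the DHCP server actually used in this network is not part of this repository:
```shell
# Hypothetical dnsmasq reservation for ctrl-01; adapt to whatever DHCP server you run.
echo 'dhcp-host=BC:24:11:7B:76:3E,10.51.10.101,ctrl-01' | sudo tee /etc/dnsmasq.d/talos-ctrl-01.conf
sudo systemctl restart dnsmasq
```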
## How to use
### Prerequisites
#### Secrets handling
Use the ``direnv`` package to automatically load per-directory environment variables and keep the secrets in ``.envrc`` files (an illustrative ``.envrc`` is sketched below). For Zsh, add
```shell
eval "$(direnv hook zsh)"
```
to your ``.zshrc`` for automatic loading on directory change.
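As an illustration, an ``.envrc`` for this setup could export the bpg/proxmox provider's environment variables; the token value below is a placeholder and the token id ``tofu`` is an assumption:
```shell
# .envrc - never commit this file (it is covered by .gitignore)
export PROXMOX_VE_ENDPOINT="https://pve01.straubintra.net:8006"
export PROXMOX_VE_API_TOKEN='terraform@pve!tofu=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx'
```
After creating or editing the file, run ``direnv allow`` once to authorize it.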