refactor!: replace Longhorn with Rook Ceph

Longhorn has proven too unreliable in this setup and the root cause has not been identified, so cluster storage is switched to Rook Ceph.
2025-07-04 15:27:52 +07:00 · 2024-03-03 10:16:49 +07:00
parent fd8b0113ca
commit b4ba7ea6e2
6 changed files with 28 additions and 34 deletions
--- a/metal/roles/pxe_server/templates/kickstart.ks.j2
+++ b/metal/roles/pxe_server/templates/kickstart.ks.j2
@ -13,7 +13,7 @@ clearpart --all --drives={{ hostvars[item]['disk'] }}
 # Partitioning
 ignoredisk --only-use={{ hostvars[item]['disk'] }}
 partition /boot/efi --fstype=vfat --size=512
-partition / --fstype=ext4 --grow
+partition / --fstype=ext4 --size=32768

 # Network information
 network --bootproto=static --device={{ hostvars[item]['network_interface'] }} --ip={{ hostvars[item]['ansible_host'] }} --gateway={{ ansible_default_ipv4.gateway }} --nameserver={{ dns_server }} --netmask={{ ansible_default_ipv4.netmask }} --ipv6=auto --hostname={{ hostvars[item]['inventory_hostname'] }} --activate
@ -46,10 +46,22 @@ firewall --disabled
 %packages
@^custom-environment
 openssh-server
-iscsi-initiator-utils
 %end

-# Enable iSCSI for Kubernetes storage
-services --enable=iscsid
+# Create a raw partition for Ceph using the remaining space
+# This is done in a %post script because Kickstart has no built-in command for it
+# The three empty lines in the fdisk input accept the default value (like pressing Enter) for:
+# - Partition number
+# - First sector
+# - Last sector
+%post
+fdisk /dev/{{ hostvars[item]['disk'] }} << EOF
+new
+
+
+
+write
+EOF
+%end

 reboot
--- a/system/longhorn-system/Chart.yaml
+++ b/system/longhorn-system/Chart.yaml
@ -1,7 +0,0 @@
-apiVersion: v2
-name: longhorn
-version: 0.0.0
-dependencies:
-  - name: longhorn
-    version: 1.6.0
-    repository: https://charts.longhorn.io
--- a/system/longhorn-system/templates/servicemonitor.yaml
+++ b/system/longhorn-system/templates/servicemonitor.yaml
@ -1,17 +0,0 @@
-# TODO alert rules following https://longhorn.io/docs/1.1.0/monitoring/prometheus_and_grafana_setup/
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: {{ .Release.Name }}
-  namespace: {{ .Release.Namespace }}
-  annotations:
-    argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
-spec:
-  selector:
-    matchLabels:
-      app: longhorn-manager
-  namespaceSelector:
-    matchNames:
-      - {{ .Release.Namespace }}
-  endpoints:
-    - port: manager
--- a/system/longhorn-system/values.yaml
+++ b/system/longhorn-system/values.yaml
@ -1,6 +0,0 @@
-longhorn:
-  defaultSettings:
-    nodeDownPodDeletionPolicy: delete-both-statefulset-and-deployment-pod
-  persistence:
-    # If you have three or more nodes for storage, use 3; otherwise use 2
-    defaultClassReplicaCount: 2  # TODO run DR test to see if we actually need 3
--- a/system/rook-ceph/Chart.yaml
+++ b/system/rook-ceph/Chart.yaml
@ -0,0 +1,10 @@
+apiVersion: v2
+name: rook-ceph
+version: 0.0.0
+dependencies:
+  - name: rook-ceph
+    version: 1.13.5
+    repository: https://charts.rook.io/release
+  - name: rook-ceph-cluster
+    version: 1.13.5
+    repository: https://charts.rook.io/release
--- a/system/rook-ceph/values.yaml
+++ b/system/rook-ceph/values.yaml
@ -0,0 +1,2 @@
+rook-ceph: {}
+rook-ceph-cluster: {}