From 18d51382ef6fb55cbf80e53adab8b096a591917a Mon Sep 17 00:00:00 2001 From: Khue Doan Date: Fri, 22 Nov 2024 15:26:43 +0700 Subject: [PATCH] feat: backup and restore script --- Makefile | 11 ++ README.md | 2 +- docs/how-to-guides/backup-and-restore.md | 105 ++++++++++ docs/installation/post-installation.md | 5 + .../production/external-resources.md | 15 +- external/terraform.tfvars.example | 7 +- mkdocs.yml | 1 + scripts/backup | 185 ++++++++++++++++++ 8 files changed, 320 insertions(+), 11 deletions(-) create mode 100644 docs/how-to-guides/backup-and-restore.md create mode 100755 scripts/backup diff --git a/Makefile b/Makefile index 536672c4..3fb034a4 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,17 @@ tools: --workdir $(shell pwd) \ docker.io/nixos/nix nix --experimental-features 'nix-command flakes' develop +# TODO maybe there's a better way to manage backup with GitOps? +backup: + ./scripts/backup --action setup --namespace=actualbudget --pvc=actualbudget-data + ./scripts/backup --action setup --namespace=wireguard --pvc=wireguard-data + ./scripts/backup --action setup --namespace=jellyfin --pvc=jellyfin-data + +restore: + ./scripts/backup --action restore --namespace=actualbudget --pvc=actualbudget-data + ./scripts/backup --action restore --namespace=wireguard --pvc=wireguard-data + ./scripts/backup --action restore --namespace=jellyfin --pvc=jellyfin-data + test: make -C test diff --git a/README.md b/README.md index 762cd205..b180edb8 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ More information can be found in [the roadmap](#roadmap) below. 
- [x] Distributed storage - [x] Support multiple environments (dev, prod) - [x] Monitoring and alerting -- [ ] Automated offsite backups 🚧 +- [x] Automated backup and restore - [x] Single sign-on - [x] Infrastructure testing diff --git a/docs/how-to-guides/backup-and-restore.md b/docs/how-to-guides/backup-and-restore.md new file mode 100644 index 00000000..60188104 --- /dev/null +++ b/docs/how-to-guides/backup-and-restore.md @@ -0,0 +1,105 @@ +# Backup and restore + +## Prerequisites + +Create an S3 bucket to store backups. You can use AWS S3, MinIO, or +any other S3-compatible provider. + +- For AWS S3, your bucket URL might look something like this: + `https://s3.amazonaws.com/my-homelab-backup`. +- For MinIO, your bucket URL might look something like this: + `https://my-s3-host.example.com/homelab-backup`. + +Follow your provider's documentation to create a service account with the +following policy (replace `my-homelab-backup` with your actual bucket name): + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket" + ], + "Resource": [ + "arn:aws:s3:::my-homelab-backup", + "arn:aws:s3:::my-homelab-backup/*" + ] + } + ] +} +``` + +Save the access key and secret key to a secure location, such as a password +manager. While you're at it, generate a new password for Restic encryption and +save it there as well. + +!!! example + + I use MinIO for my homelab backups. Here's how I set it up: + + - Create a bucket named `homelab-backup`. + - Create a service account under Identity -> Service Accounts -> Create + Service Account: + - Enable Restrict beyond user policy. + - Paste the policy above. + - Click Create and copy the access key and secret key. + - I also set up MinIO replication to store backups in two locations: one in + my house and one remotely. 
+ +## Add backup credentials to global secrets + +Add the following to `external/terraform.tfvars`: + +```hcl +extra_secrets = { + restic-password = "xxxxxxxxxxxxxxxxxxxxxxxx" + restic-s3-bucket = "https://s3.amazonaws.com/my-homelab-backup-xxxxxxxxxx" + restic-s3-access-key = "xxxxxxxxxxxxxxxx" + restic-s3-secret-key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +} +``` + +Then apply the changes: + +```sh +make external +``` + +You may want to back up the `external/terraform.tfvars` file to a secure location as well. + +## Add backup configuration for volumes + +!!! warning + Do not run the backup command when building a new cluster where you intend + to restore backups, as it may overwrite existing backup data. To restore + data on a new cluster, refer to the [restore from + backup](#restore-from-backup) section. + +For now, you need to run a command to opt in volumes until we have a better +GitOps solution: + +```sh +make backup +``` + +This command will set up Restic repositories and back up the volumes configured +in `./Makefile`. You can adjust the list there to add or remove volumes from the +backup. You only need to run this command once; the backup configuration will +be stored in the cluster and run on a schedule. + +## Restore from backup + +The restore process is ad hoc: you need to run a command to restore application volumes: + +```sh +make restore +``` + +The command above will restore the latest backup of recommended volumes. Like +with backups, you can modify `./Makefile` to adjust the list of volumes you +want to restore. 
diff --git a/docs/installation/post-installation.md b/docs/installation/post-installation.md index 115fc84c..02b7104b 100644 --- a/docs/installation/post-installation.md +++ b/docs/installation/post-installation.md @@ -26,6 +26,11 @@ Save the following files to a safe location like a password manager (if you're u - Username: `admin` - Password: get from `global-secrets` namespace +## Backup + +Now is a good time to set up backups for your homelab. +Follow the [backup and restore guide](../how-to-guides/backup-and-restore.md) to get started. + ## Run the full test suite After the homelab has been stabilized, you can run the full test suite to ensure that everything is working properly: diff --git a/docs/installation/production/external-resources.md b/docs/installation/production/external-resources.md index 161c1688..0fd6aff4 100644 --- a/docs/installation/production/external-resources.md +++ b/docs/installation/production/external-resources.md @@ -7,14 +7,12 @@ Although I try to keep the amount of external resources to the minimum, there's still need for a few of them. Below is a list of external resources and why we need them (also see some [alternatives](#alternatives) below). 
-| Provider | Resource | Purpose | -| -------- | -------- | ------- | -| Terraform Cloud | Workspace | Terraform state backend | -| Cloudflare | DNS | DNS and [DNS-01 challenge](https://letsencrypt.org/docs/challenge-types/#dns-01-challenge) for certificates | -| Cloudflare | Tunnel | Public services to the internet without port forwarding | -| ntfy | Topic | External notification service to receive alerts | - - +| Provider | Resource | Purpose | +| -------- | -------- | ------- | +| Terraform Cloud | Workspace | Terraform state backend | +| Cloudflare | DNS | DNS and [DNS-01 challenge](https://letsencrypt.org/docs/challenge-types/#dns-01-challenge) for certificates | +| Cloudflare | Tunnel | Public services to the internet without port forwarding | +| ntfy | Topic | External notification service to receive alerts | ## Create credentials @@ -79,4 +77,3 @@ To avoid vendor lock-in, each external provider must have an equivalent alternat - ntfy: - [Self-host your own ntfy server](https://docs.ntfy.sh/install) - Any other [integration supported by Grafana Alerting](https://grafana.com/docs/grafana/latest/alerting/alerting-rules/manage-contact-points/integrations/#list-of-supported-integrations) - diff --git a/external/terraform.tfvars.example b/external/terraform.tfvars.example index aabb2101..7cacd3f9 100644 --- a/external/terraform.tfvars.example +++ b/external/terraform.tfvars.example @@ -16,6 +16,11 @@ extra_secrets = { # Try to keep this to a minimum with third-party secrets # Consider using the secret generator if possible # ../platform/global-secrets/files/secret-generator/config.yaml + # Here are some examples of what you might want to add: # - # key = "value" + # tailscale-auth-key = "tskey-auth-xxxxxxxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + # restic-password = "xxxxxxxxxxxxxxxxxxxxxxxx" + # restic-s3-bucket = "https://s3.amazonaws.com/my-homelab-backup-xxxxxxxxxx" + # restic-s3-access-key = "xxxxxxxxxxxxxxxx" + # restic-s3-secret-key = 
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" } diff --git a/mkdocs.yml b/mkdocs.yml index 276411e3..64cbeaee 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -59,6 +59,7 @@ nav: - How-to guides: - how-to-guides/alternate-dns-setup.md - how-to-guides/expose-services-to-the-internet.md + - how-to-guides/backup-and-restore.md - how-to-guides/use-both-github-and-gitea.md - how-to-guides/add-or-remove-nodes.md - how-to-guides/run-commands-on-multiple-nodes.md diff --git a/scripts/backup b/scripts/backup new file mode 100755 index 00000000..b57fe884 --- /dev/null +++ b/scripts/backup @@ -0,0 +1,219 @@
+#!/usr/bin/env python
+
+"""Set up scheduled backups for a PVC, or restore it, with VolSync and Restic.
+
+Usage:
+    ./scripts/backup --action {setup,restore} --namespace NAMESPACE --pvc PVC
+
+Every run first applies an ExternalSecret named "<pvc>-backup-repository".
+"setup" then applies a ReplicationSource and "restore" applies a
+ReplicationDestination, both named after the PVC, in the given namespace.
+"""
+
+import argparse
+from kubernetes import client, config
+from kubernetes.client.rest import ApiException
+
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument("--namespace", required=True)
+arg_parser.add_argument("--pvc", required=True)
+# Reject unknown actions at parse time, before anything is applied to the
+# cluster.
+arg_parser.add_argument("--action", required=True, choices=("setup", "restore"))
+args = arg_parser.parse_args()
+
+# Load the kubeconfig only after the arguments are validated, so --help and
+# usage errors do not require a kubeconfig.
+config.load_kube_config()
+
+namespace = args.namespace
+pvc = args.pvc
+secret = f"{pvc}-backup-repository"
+
+
+def apply_custom_resource(api, group, version, plural, name, namespace, body):
+    """Patch the namespaced custom resource if it exists, otherwise create it.
+
+    Args:
+        api: Client exposing get/patch/create_namespaced_custom_object.
+        group: API group of the resource.
+        version: API version of the resource.
+        plural: Plural resource name used by the API.
+        name: Name of the object to look up and patch.
+        namespace: Namespace of the object.
+        body: Manifest as a dict; body["kind"] is used in the log output.
+
+    Raises:
+        ApiException: If the lookup or patch fails with anything other than
+            a 404, or if the create call fails.
+    """
+    try:
+        # Check if the resource exists
+        api.get_namespaced_custom_object(
+            group=group,
+            version=version,
+            namespace=namespace,
+            plural=plural,
+            name=name,
+        )
+
+        print(f"Patching {body['kind']} {name}")
+        api.patch_namespaced_custom_object(
+            group=group,
+            version=version,
+            namespace=namespace,
+            plural=plural,
+            name=name,
+            body=body,
+        )
+    except ApiException as e:
+        # A 404 means the resource does not exist yet; anything else is a
+        # genuine failure and is re-raised with its original traceback.
+        if e.status != 404:
+            raise
+        print(f"Creating {body['kind']} {name}")
+        api.create_namespaced_custom_object(
+            group=group,
+            version=version,
+            namespace=namespace,
+            plural=plural,
+            body=body,
+        )
+
+
+api = client.CustomObjectsApi()
+
+# Both actions reference the Restic repository secret, so it is applied
+# unconditionally before the action-specific resource.
+apply_custom_resource(
+    api=api,
+    group="external-secrets.io",
+    version="v1beta1",
+    plural="externalsecrets",
+    name=secret,
+    namespace=namespace,
+    body={
+        "apiVersion": "external-secrets.io/v1beta1",
+        "kind": "ExternalSecret",
+        "metadata": {
+            "name": secret,
+            "namespace": namespace,
+            "annotations": {
+                "app.kubernetes.io/managed-by": "scripts/backup",
+            },
+        },
+        "spec": {
+            "secretStoreRef": {
+                "kind": "ClusterSecretStore",
+                "name": "global-secrets",
+            },
+            "data": [
+                {
+                    "remoteRef": {
+                        "key": "external",
+                        "property": "restic-s3-bucket",
+                    },
+                    "secretKey": "restic_s3_bucket",
+                },
+                {
+                    "remoteRef": {
+                        "key": "external",
+                        "property": "restic-s3-access-key",
+                    },
+                    "secretKey": "restic_s3_access_key",
+                },
+                {
+                    "remoteRef": {
+                        "key": "external",
+                        "property": "restic-s3-secret-key",
+                    },
+                    "secretKey": "restic_s3_secret_key",
+                },
+                {
+                    "remoteRef": {
+                        "key": "external",
+                        "property": "restic-password",
+                    },
+                    "secretKey": "restic_password",
+                },
+            ],
+            "target": {
+                "template": {
+                    "data": {
+                        # The quadruple braces render as literal "{{ ... }}" in
+                        # the f-string, leaving the template expression intact
+                        # while namespace and pvc are filled in here.
+                        "RESTIC_REPOSITORY": f"s3:{{{{ .restic_s3_bucket }}}}/{namespace}/{pvc}",
+                        "RESTIC_PASSWORD": "{{ .restic_password }}",
+                        "AWS_ACCESS_KEY_ID": "{{ .restic_s3_access_key }}",
+                        "AWS_SECRET_ACCESS_KEY": "{{ .restic_s3_secret_key }}",
+                    }
+                }
+            },
+        },
+    },
+)
+
+if args.action == "setup":
+    apply_custom_resource(
+        api=api,
+        group="volsync.backube",
+        version="v1alpha1",
+        plural="replicationsources",
+        name=pvc,
+        namespace=namespace,
+        body={
+            "apiVersion": "volsync.backube/v1alpha1",
+            "kind": "ReplicationSource",
+            "metadata": {
+                "name": pvc,
+                "namespace": namespace,
+                "annotations": {
+                    "app.kubernetes.io/managed-by": "scripts/backup",
+                },
+            },
+            "spec": {
+                "sourcePVC": pvc,
+                "trigger": {"schedule": "*/30 * * * *"},
+                "restic": {
+                    "pruneIntervalDays": 14,
+                    "repository": secret,
+                    "retain": {
+                        "hourly": 6,
+                        "daily": 5,
+                        "weekly": 4,
+                        "monthly": 2,
+                        "yearly": 1,
+                    },
+                    "copyMethod": "Snapshot",
+                },
+            },
+        },
+    )
+elif args.action == "restore":
+    apply_custom_resource(
+        api=api,
+        group="volsync.backube",
+        version="v1alpha1",
+        plural="replicationdestinations",
+        name=pvc,
+        namespace=namespace,
+        body={
+            "apiVersion": "volsync.backube/v1alpha1",
+            "kind": "ReplicationDestination",
+            "metadata": {
+                "name": pvc,
+                "namespace": namespace,
+                "annotations": {
+                    "app.kubernetes.io/managed-by": "scripts/backup",
+                },
+            },
+            "spec": {
+                "trigger": {"manual": "restore-once"},
+                "restic": {
+                    "repository": secret,
+                    "destinationPVC": pvc,
+                    "copyMethod": "Direct",
+                },
+            },
+        },
+    )