diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 83dc6512..a6fbc738 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -115,3 +115,77 @@ jobs: if: always() continue-on-error: true run: make -C dev down + + nomad: + name: nomad ${{ matrix.nomad }} + runs-on: ubuntu-latest + + permissions: + id-token: write + + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.nomad }} + cancel-in-progress: true + + strategy: + fail-fast: false + matrix: + include: + - nomad: 1.9.3 # renovate: datasource=github-releases depName=hashicorp/nomad + consul: 1.20.1 # renovate: datasource=github-releases depName=hashicorp/consul + + env: + TF_VAR_nomad_version: ${{ matrix.nomad }} + TF_VAR_consul_version: ${{ matrix.consul }} + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version: "1.23" + + - uses: opentofu/setup-opentofu@v1 + with: + tofu_version: v1.8.6 # renovate: datasource=github-releases depName=opentofu/opentofu + tofu_wrapper: false + + - uses: docker/setup-buildx-action@v3 + + - uses: yokawasa/action-setup-kube-tools@v0.11.2 + with: + setup-tools: | + skaffold + skaffold: v2.13.2 # renovate: datasource=github-releases depName=GoogleContainerTools/skaffold + + - uses: hashicorp/setup-nomad@main + id: setup + with: + version: ${{ matrix.nomad }} + + # used for generating the certificates + - name: setup consul binary + run: | + curl -o consul.zip https://releases.hashicorp.com/consul/${{ matrix.consul }}/consul_${{ matrix.consul }}_linux_amd64.zip + unzip consul.zip + mv consul /usr/local/bin/ + + - uses: hetznercloud/tps-action@main + + - name: Setup environment + run: make -C nomad/dev up + + - name: Run skaffold + run: | + source nomad/dev/files/env.sh + skaffold -f nomad/skaffold.yaml build + + - name: Run tests + run: | + source nomad/dev/files/env.sh + go test -v -tags e2e ./test/e2e/nomad/... + + - name: Cleanup + if: always() + continue-on-error: true + run: make -C nomad/dev down diff --git a/docs/nomad/README.md b/docs/nomad/README.md index f7201f7e..68264610 100644 --- a/docs/nomad/README.md +++ b/docs/nomad/README.md @@ -2,7 +2,7 @@ ## Preconditions -- Nomad >= 1.4.x cluster installed following the [Nomad Reference Architecture for production deployments](https://developer.hashicorp.com/nomad/tutorials/enterprise/production-reference-architecture-vm-with-consul). The setup was tested on Nomad Community, version 1.5.x. +- Nomad >= 1.4.x cluster installed following the [Nomad Reference Architecture for production deployments](https://developer.hashicorp.com/nomad/tutorials/enterprise/production-reference-architecture-vm-with-consul). The setup was tested on Nomad Community, version 1.9.3. - The cluster nodes need to have the `docker` driver installed & configured with [`allow_privileged = true`](https://developer.hashicorp.com/nomad/docs/drivers/docker#allow_privileged). - The HCL resources are meant to be executed on a machine having nomad installed (with access to the Nomad API). 
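The `allow_privileged = true` precondition above corresponds to a small stanza in the Nomad client configuration. A minimal sketch (the file name is an example; any HCL file loaded by the Nomad client agent works):

```hcl
# e.g. /etc/nomad.d/docker.hcl -- Nomad client agent configuration
plugin "docker" {
  config {
    allow_privileged = true
  }
}
```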
diff --git a/go.mod b/go.mod index 855edab8..5eb7d928 100644 --- a/go.mod +++ b/go.mod @@ -31,8 +31,17 @@ require ( github.com/google/go-cmp v0.6.0 // indirect github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af // indirect github.com/google/uuid v1.6.0 // indirect + github.com/gorilla/websocket v1.5.0 // indirect github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 // indirect + github.com/hashicorp/cronexpr v1.1.2 // indirect + github.com/hashicorp/errwrap v1.1.0 // indirect + github.com/hashicorp/go-cleanhttp v0.5.2 // indirect + github.com/hashicorp/go-multierror v1.1.1 // indirect + github.com/hashicorp/go-rootcerts v1.0.2 // indirect + github.com/hashicorp/nomad/api v0.0.0-20241125123754-1f29a95c2413 // indirect github.com/klauspost/compress v1.17.11 // indirect + github.com/mitchellh/go-homedir v1.1.0 // indirect + github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/moby/sys/mountinfo v0.7.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/onsi/ginkgo/v2 v2.19.0 // indirect diff --git a/go.sum b/go.sum index 8af8615c..0617bbf0 100644 --- a/go.sum +++ b/go.sum @@ -29,10 +29,25 @@ github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af h1:kmjWCqn2qkEml422C2 github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af/go.mod h1:K1liHPHnj73Fdn/EKuT8nrFqBihUSKXoLYU0BuatOYo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= +github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1 h1:qnpSQwGEnkcRpTqNOIR6bJbR0gAorgP9CSALpRcKoAA= github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1/go.mod h1:lXGCsh6c22WGtjr+qGHj1otzZpV/1kwTMAqkwZsnWRU= github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 h1:pRhl55Yx1eC7BZ1N+BBWwnKaMyD8uC+34TLdndZMAKk= github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0/go.mod h1:XKMd7iuf/RGPSMJ/U4HP0zS2Z9Fh8Ps9a+6X26m/tmI= +github.com/hashicorp/cronexpr v1.1.2 h1:wG/ZYIKT+RT3QkOdgYc+xsKWVRgnxJ1OJtjjy84fJ9A= +github.com/hashicorp/cronexpr v1.1.2/go.mod h1:P4wA0KBl9C5q2hABiMO7cp6jcIg96CDh1Efb3g1PWA4= +github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= +github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= +github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= +github.com/hashicorp/go-rootcerts v1.0.2 h1:jzhAVGtqPKbwpyCPELlgNWhE1znq+qwJtW5Oi2viEzc= +github.com/hashicorp/go-rootcerts v1.0.2/go.mod h1:pqUvnprVnM5bf7AOirdbb01K4ccR319Vf4pU3K5EGc8= +github.com/hashicorp/nomad/api v0.0.0-20241125123754-1f29a95c2413 h1:k0Z3HkPTwMY51/P6gRe20+oCQq6mszc7xJnpM1A+T4w= +github.com/hashicorp/nomad/api v0.0.0-20241125123754-1f29a95c2413/go.mod h1:svtxn6QnrQ69P23VvIWMR34tg3vmwLz4UdUzm1dSCgE= github.com/hetznercloud/hcloud-go/v2 v2.14.0 h1:WQW72DuOGqT486F0eNp92lDH5cwDTmyn9Mhin93m1To= github.com/hetznercloud/hcloud-go/v2 
v2.14.0/go.mod h1:h8sHav+27Xa+48cVMAvAUMELov5h298Ilg2vflyTHgg=
github.com/hetznercloud/hcloud-go/v2 v2.15.0 h1:6mpMJ/RuX1woZj+MCJdyKNEX9129KDkEIDeeyfr4GD4=
@@ -53,6 +68,10 @@ github.com/kubernetes-csi/csi-test/v5 v5.3.1 h1:Wiukp1In+kif+BFo6q2ExjgB+MbrAz4j
 github.com/kubernetes-csi/csi-test/v5 v5.3.1/go.mod h1:7hA2cSYJ6T8CraEZPA6zqkLZwemjBD54XAnPsPC3VpA=
 github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
+github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
+github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
+github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
+github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
 github.com/moby/buildkit v0.16.0 h1:wOVBj1o5YNVad/txPQNXUXdelm7Hs/i0PUFjzbK0VKE=
 github.com/moby/buildkit v0.16.0/go.mod h1:Xqx/5GlrqE1yIRORk0NSCVDFpQAU1WjlT6KHYZdisIQ=
 github.com/moby/buildkit v0.17.0 h1:ZA/4AxwBbve1f3ZaNNJQiCBtTV62R6YweWNwq4A+sTc=
diff --git a/nomad/README.md b/nomad/README.md
new file mode 100644
index 00000000..801eb3be
--- /dev/null
+++ b/nomad/README.md
@@ -0,0 +1,56 @@
+# Nomad
+
+Hetzner Cloud does not provide official support for running the CSI driver in Nomad. Nonetheless, we would still like to offer a satisfying developer experience and have automated e2e tests to avoid breaking Nomad accidentally.
+
+## Nomad Development Environment
+
+As a prerequisite for developing against Nomad, a setup of the [`nomad-dev-env`](https://github.com/hetznercloud/nomad-dev-env) is necessary; it is located in `nomad/dev`.
+
+1. Set the `HCLOUD_TOKEN` environment variable.
+2. Deploy the development cluster:
+
+```bash
+make -C nomad/dev up
+```
+
+3. Load the generated configuration to access the development cluster:
+
+```bash
+source nomad/dev/files/env.sh
+```
+
+4. Check that the cluster is healthy:
+
+```bash
+nomad node status
+```
+
+## Skaffold
+
+Skaffold commands should be executed from the `csi-driver` root directory, using the `-f` flag to point to the Nomad-specific `skaffold.yaml`:
+
+```bash
+skaffold -f nomad/skaffold.yaml build
+```
+
+Skaffold does not offer native support for Nomad. For this reason, we use Skaffold's post-build hooks to deploy/redeploy the CSI plugin. To delete the CSI plugin, run `stop_nomad.sh` manually:
+
+```bash
+bash ./nomad/stop_nomad.sh
+```
+
+## E2E Tests
+
+The Nomad e2e tests are located in `test/e2e/nomad` and require a working development environment.
+
+1. Deploy the csi-driver:
+
+```bash
+skaffold -f nomad/skaffold.yaml build
+```
+
+2. Run the e2e tests:
+
+```bash
+go test -v -tags e2e ./test/e2e/nomad/...
+```
diff --git a/nomad/dev/.gitignore b/nomad/dev/.gitignore
new file mode 100644
index 00000000..3aa31043
--- /dev/null
+++ b/nomad/dev/.gitignore
@@ -0,0 +1,6 @@
+.terraform/
+terraform.tfstate*
+.terraform.tfstate*
+*.auto.tfvars
+
+files/
diff --git a/nomad/dev/.terraform.lock.hcl b/nomad/dev/.terraform.lock.hcl
new file mode 100644
index 00000000..90045e04
--- /dev/null
+++ b/nomad/dev/.terraform.lock.hcl
@@ -0,0 +1,76 @@
+# This file is maintained automatically by "tofu init".
+# Manual edits may be lost in future updates.
+ +provider "registry.opentofu.org/hashicorp/external" { + version = "2.3.4" + constraints = "2.3.4" + hashes = [ + "h1:saRbzhRhYh4urj+ARe8vIB0mlRspy6E/fPSyvwqjUW8=", + "zh:0e5eb3513d6ad5cc3196799a6e413c6a9c0b642ba6d8f84fc11efa48f58358a4", + "zh:1658beae42b4614d4009a3191710c86571ccf1dc526c4bac520a87ab701dd2e9", + "zh:28d937c13e90c170fc1e4b726a9bcf113aade53b95b3eccd335cd9eaba8acff5", + "zh:2ac19917bb83233f24391d4cbaf301bb6ec14013d3b7f93bdf64823280622daa", + "zh:3263d1808fc5252d586a9aa98d89086c912f53e1a3dc43bc5306364b358f04fa", + "zh:463469836637342495e22a936ef9ab3c8ab2fb47eb0fae09c346d63f3331af59", + "zh:53398a27492cd11f61b2f24c2601c12f50c39da32b90fd25aed7011a1e25a225", + "zh:5a44cfbcef52fd3c970144a69a934cab320bd3bb57939ae4682fc516783e2996", + "zh:65edb579d9d0dac42e77728d81da6e85ea30d3fe8f2cb6e5db82602ee8afa17e", + "zh:f2edd3027b7ae0d31a690fd5dcdcd22b467b4f1e045f84f2bc88289353ef9a5b", + ] +} + +provider "registry.opentofu.org/hashicorp/local" { + version = "2.5.2" + hashes = [ + "h1:6lS+5A/4WFAqY3/RHWFRBSiFVLPRjvLaUgxPQvjXLHU=", + "zh:25b95b76ceaa62b5c95f6de2fa6e6242edbf51e7fc6c057b7f7101aa4081f64f", + "zh:3c974fdf6b42ca6f93309cf50951f345bfc5726ec6013b8832bcd3be0eb3429e", + "zh:5de843bf6d903f5cca97ce1061e2e06b6441985c68d013eabd738a9e4b828278", + "zh:86beead37c7b4f149a54d2ae633c99ff92159c748acea93ff0f3603d6b4c9f4f", + "zh:8e52e81d3dc50c3f79305d257da7fde7af634fed65e6ab5b8e214166784a720e", + "zh:9882f444c087c69559873b2d72eec406a40ede21acb5ac334d6563bf3a2387df", + "zh:a4484193d110da4a06c7bffc44cc6b61d3b5e881cd51df2a83fdda1a36ea25d2", + "zh:a53342426d173e29d8ee3106cb68abecdf4be301a3f6589e4e8d42015befa7da", + "zh:d25ef2aef6a9004363fc6db80305d30673fc1f7dd0b980d41d863b12dacd382a", + "zh:fa2d522fb323e2121f65b79709fd596514b293d816a1d969af8f72d108888e4c", + ] +} + +provider "registry.opentofu.org/hashicorp/tls" { + version = "4.0.6" + hashes = [ + "h1:EJoUGDo7L52Iu22cA1KCndJ9B1Rrfd75wyZzsScEnc0=", + "zh:4b53b372767e5068d9bbfc89199201c1ae4283dde2f0c301974f8abb4215791f", + "zh:5b4c308bd074c6d0bd560220e6ee10a9859ca9a1f29a59367b0477a740ff265e", + "zh:674dd6bc85597677e160ee601d88b21c5a974759a658769812d2904bd94bc042", + "zh:6ccc1c448349b56677ba66112aec7e0a58eb827f66209ca5f4077b81cce240fb", + "zh:8aa6e13a5d722b74230937ea21e8b4994e53340d95b5691cf6cf3518b9f38e6e", + "zh:8b27e55e4c7fa887774860113b95c8f7f68804b002fa47f0eb8e3a485997287e", + "zh:a430b5a3e8753d8f61784de49e538ac4abed19fb665fccd8a10b55402fe9f076", + "zh:b07c978c335ae9fc12f9c221629610775e4ae36691ed4e7ba258d275dd58a243", + "zh:bbec8cb1efc84ee3026c793956a4a4cd0ece20b89d2d4f7d954c68e7f6d596d0", + "zh:e684e247424188dc3b500a543b1a8046d1c0ec08c2a90aedca0c4f6bb56bedbd", + ] +} + +provider "registry.opentofu.org/hetznercloud/hcloud" { + version = "1.49.1" + constraints = "~> 1.45" + hashes = [ + "h1:FKGRNHVbcfQJd8EWrb8Ze5QHkaGr8zI+ZKxBMjvOwPk=", + "zh:3d5f9773da4f8203cf625d04a5a0e4ff7e202684c010a801a945756140c61cde", + "zh:446305d492017cda91e5c15122ec16ff15bfe3ef4d3fd6bcea0cdf7742ab1b86", + "zh:44d4f9156ed8b4f0444bd4dc456825940be49048828565964a192286d28c9f20", + "zh:492ad893d2f89bb17c9beb877c8ceb4a16caf39db1a79030fefeada6c7aa217f", + "zh:68dc552c19ad9d209ec6018445df6e06fb77a637513a53cc66ddce1b024082be", + "zh:7492495ffda6f6c49ab38b539bd2eb965b1150a63fb6b191a27dec07d17601cb", + "zh:850fe92005981ea00db86c3e49ba5b49732fdf1f7bd5530a68f6e272847059fc", + "zh:8cb67f744c233acfb1d68a6c27686315439d944edf733b95f113b4aa63d86713", + "zh:8e13dac46e8c2497772ed1baee701b1d1c26bcc95a63b5c4566c83468f504868", + 
"zh:c44249c6a8ba931e208a334792686b5355ab2da465cadea03c1ea8e73c02db12", + "zh:d103125a28a85c89aea0cb0c534fe3f504416c4d4fc75c37364b9ec5f66dd77d", + "zh:ed8f64e826aa9bfca95b72892271678cb78411b40d7b404a52404141e05a4ab1", + "zh:f40efad816de00b279bd1e2cbf62c76b0e5b2da150a0764f259984b318e30945", + "zh:f5e912d0873bf4ecc43feba4ceccdf158048080c76d557e47f34749139fdd452", + ] +} diff --git a/nomad/dev/Makefile b/nomad/dev/Makefile new file mode 100644 index 00000000..fb88e42d --- /dev/null +++ b/nomad/dev/Makefile @@ -0,0 +1,26 @@ +SHELL = bash + +ENV ?= dev + +env.auto.tfvars: + @echo 'hcloud_token = "$(HCLOUD_TOKEN)"' >> "$@" + +.terraform: + tofu init + +validate: .terraform + tofu validate + +up: .terraform env.auto.tfvars + tofu apply -auto-approve + $(MAKE) port-forward + +down: .terraform env.auto.tfvars + if test -f files/registry-port-forward.sh; then files/registry-port-forward.sh down; fi + tofu destroy -auto-approve + +port-forward: + files/registry-port-forward.sh up + +clean: + rm -Rf files/ .terraform/ terraform.tfstate* env.auto.tfvars diff --git a/nomad/dev/main.tf b/nomad/dev/main.tf new file mode 100644 index 00000000..95cb75ba --- /dev/null +++ b/nomad/dev/main.tf @@ -0,0 +1,5 @@ +module "dev" { + source = "github.com/hetznercloud/nomad-dev-env?ref=v0.1.0" # renovate: datasource=github-releases depName=hetznercloud/nomad-dev-env + + hcloud_token = var.hcloud_token +} diff --git a/nomad/dev/variables.tf b/nomad/dev/variables.tf new file mode 100644 index 00000000..4c0c0b86 --- /dev/null +++ b/nomad/dev/variables.tf @@ -0,0 +1,3 @@ +variable "hcloud_token" { + sensitive = true +} diff --git a/nomad/hcloud-csi-controller.hcl b/nomad/hcloud-csi-controller.hcl new file mode 100644 index 00000000..b45659be --- /dev/null +++ b/nomad/hcloud-csi-controller.hcl @@ -0,0 +1,55 @@ +job "hcloud-csi-controller" { + datacenters = ["dc1"] + namespace = "default" + type = "service" + + group "controller" { + count = 1 + + constraint { + distinct_hosts = true + } + + update { + max_parallel = 1 + canary = 1 + min_healthy_time = "10s" + healthy_deadline = "1m" + auto_revert = true + auto_promote = true + } + + task "plugin" { + driver = "docker" + + config { + image = "$SKAFFOLD_IMAGE" + command = "bin/hcloud-csi-driver-controller" + } + + env { + CSI_ENDPOINT = "unix://csi/csi.sock" + ENABLE_METRICS = true + } + + template { + data = < /dev/null; then + nomad job stop -purge hcloud-csi-controller +fi + +controller="$(mktemp)" +envsubst < "./nomad/hcloud-csi-controller.hcl" > "$controller" +sed -i 's/localhost:30666/docker-registry.service.consul:5000/' "$controller" +nomad job run "$controller" + +if nomad job inspect hcloud-csi-node > /dev/null; then + nomad job stop -purge hcloud-csi-node +fi + +node="$(mktemp)" +envsubst < "./nomad/hcloud-csi-node.hcl" > "$node" +sed -i 's/localhost:30666/docker-registry.service.consul:5000/' "$node" +nomad job run "$node" diff --git a/nomad/stop_nomad.sh b/nomad/stop_nomad.sh new file mode 100755 index 00000000..644cbf7a --- /dev/null +++ b/nomad/stop_nomad.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -euo pipefail + +nomad job stop -purge hcloud-csi-controller +echo "Deleted hcloud-csi-controller" + +nomad job stop -purge hcloud-csi-node +echo "Deleted hcloud-csi-node" diff --git a/test/e2e/nomad/e2e_test.go b/test/e2e/nomad/e2e_test.go new file mode 100644 index 00000000..7d19fa84 --- /dev/null +++ b/test/e2e/nomad/e2e_test.go @@ -0,0 +1,233 @@ +//go:build e2e + +package e2e + +import ( + "context" + "fmt" + "os" + "strconv" + "testing" + + nomad 
"github.com/hashicorp/nomad/api" + "github.com/hetznercloud/csi-driver/internal/driver" + "github.com/stretchr/testify/assert" +) + +const ( + ResizedCapacity = 11811160064 // 11GiB + ResizedCapacityGB = 11 +) + +var cluster *Cluster + +func TestMain(m *testing.M) { + var err error + cluster, err = NewCluster() + if err != nil { + fmt.Printf("%v\n", err) + os.Exit(1) + } + + exitCode := m.Run() + + if err := cluster.Cleanup(); err != nil { + fmt.Printf("%v\n", err) + os.Exit(1) + } + + os.Exit(exitCode) +} + +func TestGetPluginInfo(t *testing.T) { + plugin, _, err := cluster.nomadClient.CSIPlugins().Info(driver.PluginName, &nomad.QueryOptions{}) + if err != nil { + t.Error(err) + } + + assert.NotNil(t, plugin, "Expected plugin from Nomad to be not nil") + + assert.Equalf( + t, + plugin.Version, + driver.PluginVersion, + "Expected plugin version %s, but got %s", + driver.PluginVersion, + plugin.Version, + ) +} + +func TestVolumeLifecycle(t *testing.T) { + volReq := CreateVolumeSpec("db-vol") + + var hcloudVolID int64 + t.Run("volume creation", func(t *testing.T) { + vol, _, err := cluster.CreateVolume(volReq, &nomad.WriteOptions{}) + if err != nil { + t.Error(err) + } + assert.Len(t, vol, 1) + + hcloudVolID, err = strconv.ParseInt(vol[0].ExternalID, 10, 64) + if err != nil { + t.Error(err) + } + + hcloudVolume, _, err := cluster.hcloudClient.Volume.GetByID(context.Background(), hcloudVolID) + if err != nil { + t.Error(err) + } + + assert.NotNilf(t, hcloudVolume, "could not find volume with ID %d on hcloud", hcloudVolID) + }) + + t.Run("volume resize", func(t *testing.T) { + volReq.RequestedCapacityMin = ResizedCapacity + + _, _, err := cluster.nomadClient.CSIVolumes().Create(volReq, &nomad.WriteOptions{}) + if err != nil { + t.Error(err) + } + + hcloudVolume, _, err := cluster.hcloudClient.Volume.GetByID(context.Background(), hcloudVolID) + if err != nil { + t.Error(err) + } + + if assert.NotNilf(t, hcloudVolume, "could not find volume with ID %d on hcloud", hcloudVolID) { + assert.Equalf( + t, + hcloudVolume.Size, + ResizedCapacityGB, + "Expected vol size %d, but got %d", + ResizedCapacityGB, + hcloudVolume.Size, + ) + } + }) + + t.Run("volume deletion", func(t *testing.T) { + err := cluster.DeleteVolume(volReq.ID, &nomad.WriteOptions{}) + if err != nil { + t.Error(err) + } + + hcloudVolume, _, err := cluster.hcloudClient.Volume.GetByID(context.Background(), hcloudVolID) + if err != nil { + t.Error(err) + } + + assert.Nil(t, hcloudVolume, "hcloud volume was deleted in nomad, but still exists") + }) +} + +func TestVolumeWrite(t *testing.T) { + volID := "test-vol" + jobID := "test-writer" + volReq := CreateVolumeSpec(volID) + job := CreateBusyboxWithVolumeJobSpec(jobID, volID, "/test") + + t.Run("create volume", func(t *testing.T) { + vol, _, err := cluster.CreateVolume(volReq, &nomad.WriteOptions{}) + if err != nil { + t.Error(err) + } + assert.Len(t, vol, 1) + }) + + // Used to ensure that the job for verifying the data is scheduled on another node + var previousNodeID string + t.Run("write to volume", func(t *testing.T) { + _, _, err := cluster.nomadClient.Jobs().Register(job, &nomad.WriteOptions{}) + if err != nil { + t.Error(err) + } + defer func() { + _, _, err = cluster.nomadClient.Jobs().Deregister(*job.ID, true, &nomad.WriteOptions{}) + if err != nil { + t.Error(err) + } + }() + + allocStub, err := cluster.WaitForRunningJob(*job.ID) + if err != nil { + t.Error(err) + return + } + + previousNodeID = allocStub.NodeID + + alloc, _, err := 
cluster.nomadClient.Allocations().Info(allocStub.ID, &nomad.QueryOptions{})
+		if err != nil {
+			t.Error(err)
+			return
+		}
+
+		exitCode, err := cluster.ExecInAlloc(alloc, jobID, []string{
+			"dd",
+			"if=/dev/random",
+			"of=/test/data",
+			"bs=1M",
+			"count=1",
+		})
+		if err != nil {
+			t.Error(err)
+		}
+		assert.Equalf(t, 0, exitCode, "could not write test data - exit code: %d", exitCode)
+	})
+
+	t.Run("verify volume data", func(t *testing.T) {
+		// try to schedule the job on another node
+		affinity := &nomad.Affinity{
+			LTarget: "${node.unique.id}",
+			RTarget: previousNodeID,
+			Operand: "!=",
+		}
+		job.Affinities = append(job.Affinities, affinity)
+
+		_, _, err := cluster.nomadClient.Jobs().Register(job, &nomad.WriteOptions{})
+		if err != nil {
+			t.Error(err)
+		}
+		defer func() {
+			_, _, err = cluster.nomadClient.Jobs().Deregister(*job.ID, true, &nomad.WriteOptions{})
+			if err != nil {
+				t.Error(err)
+			}
+		}()
+
+		allocStub, err := cluster.WaitForRunningJob(*job.ID)
+		if err != nil {
+			t.Error(err)
+			return
+		}
+
+		alloc, _, err := cluster.nomadClient.Allocations().Info(allocStub.ID, &nomad.QueryOptions{})
+		if err != nil {
+			t.Error(err)
+			return
+		}
+
+		// verify that the file exists and has a size greater than zero
+		exitCode, err := cluster.ExecInAlloc(alloc, jobID, []string{
+			"test",
+			"-s",
+			"/test/data",
+		})
+		if err != nil {
+			t.Error(err)
+		}
+		assert.Equalf(t, 0, exitCode, "could not verify test data - exit code: %d", exitCode)
+	})
+
+	t.Run("delete volume", func(t *testing.T) {
+		// with retries, as the volume can still be in use for a couple of seconds after the job got deleted,
+		// which results in an internal server error
+		for i := range 10 {
+			if err := cluster.DeleteVolume(volReq.ID, &nomad.WriteOptions{}); err == nil {
+				break
+			}
+			backoffSleep(i)
+		}
+	})
+}
diff --git a/test/e2e/nomad/utils.go b/test/e2e/nomad/utils.go
new file mode 100644
index 00000000..81794381
--- /dev/null
+++ b/test/e2e/nomad/utils.go
@@ -0,0 +1,230 @@
+package e2e
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"os"
+	"sync"
+	"time"
+
+	nomad "github.com/hashicorp/nomad/api"
+
+	"github.com/hetznercloud/hcloud-go/v2/hcloud"
+)
+
+const InitialVolumeCapacity = 10737418240 // 10GiB
+
+type Cluster struct {
+	hcloudClient *hcloud.Client
+	nomadClient  *nomad.Client
+
+	volumesCreated map[string]struct{}
+	lock           sync.Mutex
+}
+
+func NewCluster() (*Cluster, error) {
+	token := os.Getenv("HCLOUD_TOKEN")
+	if token == "" {
+		return nil, fmt.Errorf("HCLOUD_TOKEN env variable is not set")
+	}
+
+	hcloudOpts := []hcloud.ClientOption{
+		hcloud.WithToken(token),
+		hcloud.WithApplication("nomad-csi-e2e", ""),
+		hcloud.WithPollOpts(hcloud.PollOpts{
+			BackoffFunc: hcloud.ExponentialBackoffWithOpts(hcloud.ExponentialBackoffOpts{
+				Base:       time.Second,
+				Multiplier: 2,
+				Cap:        10 * time.Second,
+			}),
+		}),
+	}
+
+	hcloudClient := hcloud.NewClient(hcloudOpts...)
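+	// Note: hcloud.NewClient is not documented to return nil; the check below is defensive.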
+ if hcloudClient == nil { + return nil, fmt.Errorf("hcloud client could not be initialized") + } + + nomadAddr := os.Getenv("NOMAD_ADDR") + if nomadAddr == "" { + return nil, fmt.Errorf("NOMAD_ADDR env variable is not set") + } + nomadCACert := os.Getenv("NOMAD_CACERT") + if nomadCACert == "" { + return nil, fmt.Errorf("NOMAD_CACERT env variable is not set") + } + nomadClientCert := os.Getenv("NOMAD_CLIENT_CERT") + if nomadClientCert == "" { + return nil, fmt.Errorf("NOMAD_CLIENT_CERT env variable is not set") + } + nomadClientKey := os.Getenv("NOMAD_CLIENT_KEY") + if nomadClientKey == "" { + return nil, fmt.Errorf("NOMAD_CLIENT_KEY env variable is not set") + } + + nomadConfig := nomad.DefaultConfig() + + nomadClient, err := nomad.NewClient(nomadConfig) + if err != nil { + return nil, err + } + + return &Cluster{ + hcloudClient: hcloudClient, + nomadClient: nomadClient, + volumesCreated: make(map[string]struct{}), + lock: sync.Mutex{}, + }, nil +} + +func (cluster *Cluster) Cleanup() []error { + var cleanupErrors []error + + for volName := range cluster.volumesCreated { + vol, _, err := cluster.hcloudClient.Volume.GetByName(context.Background(), volName) + if err != nil { + cleanupErrors = append(cleanupErrors, err) + continue + } + if vol == nil { + cleanupErrors = append(cleanupErrors, fmt.Errorf("volume %s not found on hcloud", volName)) + continue + } + _, err = cluster.hcloudClient.Volume.Delete(context.Background(), vol) + if err != nil { + cleanupErrors = append(cleanupErrors, err) + } + } + + return cleanupErrors +} + +func (cluster *Cluster) CreateVolume(volReq *nomad.CSIVolume, w *nomad.WriteOptions) ([]*nomad.CSIVolume, *nomad.WriteMeta, error) { + vol, meta, err := cluster.nomadClient.CSIVolumes().Create(volReq, w) + if err != nil { + return nil, nil, err + } + + cluster.lock.Lock() + defer cluster.lock.Unlock() + + cluster.volumesCreated[volReq.ID] = struct{}{} + + return vol, meta, err +} + +func (cluster *Cluster) DeleteVolume(externalVolID string, w *nomad.WriteOptions) error { + err := cluster.nomadClient.CSIVolumes().Delete(externalVolID, w) + if err != nil { + return err + } + + cluster.lock.Lock() + defer cluster.lock.Unlock() + + delete(cluster.volumesCreated, externalVolID) + + return nil +} + +func (cluster *Cluster) ExecInAlloc(alloc *nomad.Allocation, task string, command []string) (int, error) { + exitCode, err := cluster.nomadClient.Allocations().Exec( + context.Background(), + alloc, + task, + true, + command, + os.Stdin, + os.Stdout, + os.Stderr, + make(<-chan nomad.TerminalSize), + &nomad.QueryOptions{}, + ) + if err != nil { + return exitCode, err + } + return exitCode, nil +} + +func (cluster *Cluster) WaitForRunningJob(jobID string) (*nomad.AllocationListStub, error) { + for retry := range 10 { + allocs, _, err := cluster.nomadClient.Jobs().Allocations( + jobID, + false, + &nomad.QueryOptions{}, + ) + if err != nil { + return nil, err + } + + for _, alloc := range allocs { + if alloc.ClientStatus == "running" { + return alloc, nil + } + } + + backoffSleep(retry) + } + return nil, fmt.Errorf("no running allocation for job %s", jobID) +} + +func CreateVolumeSpec(id string) *nomad.CSIVolume { + return &nomad.CSIVolume{ + ID: id, + Name: id, + Namespace: "default", + PluginID: "csi.hetzner.cloud", + RequestedCapacityMin: InitialVolumeCapacity, + RequestedCapabilities: []*nomad.CSIVolumeCapability{ + { + AccessMode: "single-node-writer", + AttachmentMode: "file-system", + }, + }, + MountOptions: &nomad.CSIMountOptions{ + FSType: "ext4", + MountFlags: []string{ 
+ "discard", + "defaults", + }, + }, + } +} + +func CreateBusyboxWithVolumeJobSpec(id string, volumeID string, mountPath string) *nomad.Job { + job := nomad.NewServiceJob(id, id, "global", 50) + taskGroup := nomad.NewTaskGroup(id, 1) + + taskGroup.Volumes = map[string]*nomad.VolumeRequest{ + volumeID: { + Name: volumeID, + ReadOnly: false, + Type: "csi", + Source: volumeID, + AttachmentMode: "file-system", + AccessMode: "single-node-writer", + PerAlloc: false, + }, + } + + task := nomad.NewTask(id, "docker") + task = task.SetConfig("image", "busybox:stable") + task = task.SetConfig("command", "sleep") + task = task.SetConfig("args", []string{"3600"}) + + task.VolumeMounts = append(task.VolumeMounts, &nomad.VolumeMount{ + Volume: &volumeID, + Destination: &mountPath, + }) + + taskGroup = taskGroup.AddTask(task) + job = job.AddTaskGroup(taskGroup) + return job +} + +func backoffSleep(retry int) { + delay := math.Pow(2, float64(retry)) + delay = math.Min(delay, 16) + time.Sleep(time.Second * time.Duration(delay)) +}