From 4dbea7dfa7cf4efe561a918b33fb0ae482f41852 Mon Sep 17 00:00:00 2001 From: Lukas Metzner Date: Wed, 18 Dec 2024 11:19:56 +0100 Subject: [PATCH] feat(nomad): improved nomad support (#798) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds better support for our integration with Nomad. Although not officially supported, we would like to improve the developer experience and integrate automated e2e tests to avoid breaking Nomad integrations with future csi-driver releases. The Terraform module for creating a Nomad (+Consul) cluster on Hetzner Cloud can be found [here](https://github.com/hetznercloud/nomad-dev-env). --------- Co-authored-by: Julian Tölle Co-authored-by: Jonas L. --- .github/workflows/test.yml | 74 ++++++++++ docs/nomad/README.md | 2 +- go.mod | 9 ++ go.sum | 19 +++ nomad/README.md | 56 ++++++++ nomad/dev/.gitignore | 6 + nomad/dev/.terraform.lock.hcl | 76 +++++++++++ nomad/dev/Makefile | 26 ++++ nomad/dev/main.tf | 5 + nomad/dev/variables.tf | 3 + nomad/hcloud-csi-controller.hcl | 55 ++++++++ nomad/hcloud-csi-node.hcl | 41 ++++++ nomad/skaffold.yaml | 17 +++ nomad/start_nomad.sh | 21 +++ nomad/stop_nomad.sh | 9 ++ test/e2e/nomad/e2e_test.go | 233 ++++++++++++++++++++++++++++++++ test/e2e/nomad/utils.go | 230 +++++++++++++++++++++++++++++++ 17 files changed, 881 insertions(+), 1 deletion(-) create mode 100644 nomad/README.md create mode 100644 nomad/dev/.gitignore create mode 100644 nomad/dev/.terraform.lock.hcl create mode 100644 nomad/dev/Makefile create mode 100644 nomad/dev/main.tf create mode 100644 nomad/dev/variables.tf create mode 100644 nomad/hcloud-csi-controller.hcl create mode 100644 nomad/hcloud-csi-node.hcl create mode 100644 nomad/skaffold.yaml create mode 100755 nomad/start_nomad.sh create mode 100755 nomad/stop_nomad.sh create mode 100644 test/e2e/nomad/e2e_test.go create mode 100644 test/e2e/nomad/utils.go diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b452341b..a02a8126 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -115,3 +115,77 @@ jobs: if: always() continue-on-error: true run: make -C dev down + + nomad: + name: nomad ${{ matrix.nomad }} + runs-on: ubuntu-latest + + permissions: + id-token: write + + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.nomad }} + cancel-in-progress: true + + strategy: + fail-fast: false + matrix: + include: + - nomad: 1.9.3 # renovate: datasource=github-releases depName=hashicorp/nomad + consul: 1.20.1 # renovate: datasource=github-releases depName=hashicorp/consul + + env: + TF_VAR_nomad_version: ${{ matrix.nomad }} + TF_VAR_consul_version: ${{ matrix.consul }} + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version: "1.23" + + - uses: opentofu/setup-opentofu@v1 + with: + tofu_version: v1.8.6 # renovate: datasource=github-releases depName=opentofu/opentofu + tofu_wrapper: false + + - uses: docker/setup-buildx-action@v3 + + - uses: yokawasa/action-setup-kube-tools@v0.11.2 + with: + setup-tools: | + skaffold + skaffold: v2.13.2 # renovate: datasource=github-releases depName=GoogleContainerTools/skaffold + + - uses: hashicorp/setup-nomad@main + id: setup + with: + version: ${{ matrix.nomad }} + + # used for generating the certificates + - name: setup consul binary + run: | + curl -o consul.zip https://releases.hashicorp.com/consul/${{ matrix.consul }}/consul_${{ matrix.consul }}_linux_amd64.zip + unzip consul.zip + mv consul /usr/local/bin/ + + - uses: 
hetznercloud/tps-action@main + + - name: Setup environment + run: make -C nomad/dev up + + - name: Run skaffold + run: | + source nomad/dev/files/env.sh + skaffold -f nomad/skaffold.yaml build + + - name: Run tests + run: | + source nomad/dev/files/env.sh + go test -v -tags e2e ./test/e2e/nomad/... + + - name: Cleanup + if: always() + continue-on-error: true + run: make -C nomad/dev down diff --git a/docs/nomad/README.md b/docs/nomad/README.md index f7201f7e..68264610 100644 --- a/docs/nomad/README.md +++ b/docs/nomad/README.md @@ -2,7 +2,7 @@ ## Preconditions -- Nomad >= 1.4.x cluster installed following the [Nomad Reference Architecture for production deployments](https://developer.hashicorp.com/nomad/tutorials/enterprise/production-reference-architecture-vm-with-consul). The setup was tested on Nomad Community, version 1.5.x. +- Nomad >= 1.4.x cluster installed following the [Nomad Reference Architecture for production deployments](https://developer.hashicorp.com/nomad/tutorials/enterprise/production-reference-architecture-vm-with-consul). The setup was tested on Nomad Community, version 1.9.3. - The cluster nodes need to have the `docker` driver installed & configured with [`allow_privileged = true`](https://developer.hashicorp.com/nomad/docs/drivers/docker#allow_privileged). - The HCL resources are meant to be executed on a machine having nomad installed (with access to the Nomad API). diff --git a/go.mod b/go.mod index b408a44a..4148652a 100644 --- a/go.mod +++ b/go.mod @@ -31,8 +31,17 @@ require ( github.com/google/go-cmp v0.6.0 // indirect github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af // indirect github.com/google/uuid v1.6.0 // indirect + github.com/gorilla/websocket v1.5.0 // indirect github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 // indirect + github.com/hashicorp/cronexpr v1.1.2 // indirect + github.com/hashicorp/errwrap v1.1.0 // indirect + github.com/hashicorp/go-cleanhttp v0.5.2 // indirect + github.com/hashicorp/go-multierror v1.1.1 // indirect + github.com/hashicorp/go-rootcerts v1.0.2 // indirect + github.com/hashicorp/nomad/api v0.0.0-20241125123754-1f29a95c2413 // indirect github.com/klauspost/compress v1.17.11 // indirect + github.com/mitchellh/go-homedir v1.1.0 // indirect + github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/moby/sys/mountinfo v0.7.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/onsi/ginkgo/v2 v2.19.0 // indirect diff --git a/go.sum b/go.sum index 43c29811..a75de9f9 100644 --- a/go.sum +++ b/go.sum @@ -29,10 +29,25 @@ github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af h1:kmjWCqn2qkEml422C2 github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af/go.mod h1:K1liHPHnj73Fdn/EKuT8nrFqBihUSKXoLYU0BuatOYo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= +github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1 h1:qnpSQwGEnkcRpTqNOIR6bJbR0gAorgP9CSALpRcKoAA= github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1/go.mod h1:lXGCsh6c22WGtjr+qGHj1otzZpV/1kwTMAqkwZsnWRU= github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 h1:pRhl55Yx1eC7BZ1N+BBWwnKaMyD8uC+34TLdndZMAKk= github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0/go.mod 
h1:XKMd7iuf/RGPSMJ/U4HP0zS2Z9Fh8Ps9a+6X26m/tmI=
+github.com/hashicorp/cronexpr v1.1.2 h1:wG/ZYIKT+RT3QkOdgYc+xsKWVRgnxJ1OJtjjy84fJ9A=
+github.com/hashicorp/cronexpr v1.1.2/go.mod h1:P4wA0KBl9C5q2hABiMO7cp6jcIg96CDh1Efb3g1PWA4=
+github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
+github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I=
+github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
+github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ=
+github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48=
+github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
+github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
+github.com/hashicorp/go-rootcerts v1.0.2 h1:jzhAVGtqPKbwpyCPELlgNWhE1znq+qwJtW5Oi2viEzc=
+github.com/hashicorp/go-rootcerts v1.0.2/go.mod h1:pqUvnprVnM5bf7AOirdbb01K4ccR319Vf4pU3K5EGc8=
+github.com/hashicorp/nomad/api v0.0.0-20241125123754-1f29a95c2413 h1:k0Z3HkPTwMY51/P6gRe20+oCQq6mszc7xJnpM1A+T4w=
+github.com/hashicorp/nomad/api v0.0.0-20241125123754-1f29a95c2413/go.mod h1:svtxn6QnrQ69P23VvIWMR34tg3vmwLz4UdUzm1dSCgE=
 github.com/hetznercloud/hcloud-go/v2 v2.14.0 h1:WQW72DuOGqT486F0eNp92lDH5cwDTmyn9Mhin93m1To=
 github.com/hetznercloud/hcloud-go/v2 v2.14.0/go.mod h1:h8sHav+27Xa+48cVMAvAUMELov5h298Ilg2vflyTHgg=
 github.com/hetznercloud/hcloud-go/v2 v2.15.0 h1:6mpMJ/RuX1woZj+MCJdyKNEX9129KDkEIDeeyfr4GD4=
@@ -55,6 +70,10 @@ github.com/kubernetes-csi/csi-test/v5 v5.3.1 h1:Wiukp1In+kif+BFo6q2ExjgB+MbrAz4j
 github.com/kubernetes-csi/csi-test/v5 v5.3.1/go.mod h1:7hA2cSYJ6T8CraEZPA6zqkLZwemjBD54XAnPsPC3VpA=
 github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
+github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
+github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
+github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
+github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
 github.com/moby/buildkit v0.16.0 h1:wOVBj1o5YNVad/txPQNXUXdelm7Hs/i0PUFjzbK0VKE=
 github.com/moby/buildkit v0.16.0/go.mod h1:Xqx/5GlrqE1yIRORk0NSCVDFpQAU1WjlT6KHYZdisIQ=
 github.com/moby/buildkit v0.17.0 h1:ZA/4AxwBbve1f3ZaNNJQiCBtTV62R6YweWNwq4A+sTc=
diff --git a/nomad/README.md b/nomad/README.md
new file mode 100644
index 00000000..801eb3be
--- /dev/null
+++ b/nomad/README.md
@@ -0,0 +1,56 @@
+# Nomad
+
+Hetzner Cloud does not provide official support for running the CSI driver in Nomad. Nonetheless, we want to offer a good developer experience and run automated e2e tests to avoid accidentally breaking the Nomad integration.
+
+## Nomad Development Environment
+
+Developing the Nomad integration requires a deployment of the [`nomad-dev-env`](https://github.com/hetznercloud/nomad-dev-env); the corresponding Terraform configuration lives in `nomad/dev`.
+
+1. Set up the `HCLOUD_TOKEN` environment variable.
+2. Deploy the development cluster:
+
+```bash
+make -C nomad/dev up
+```
+
+3. Load the generated configuration to access the development cluster:
+
+```bash
+source nomad/dev/files/env.sh
+```
+
+4. Check that the cluster is healthy:
+
+```bash
+nomad node status
+```
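+
+The controller and node jobs template `HCLOUD_TOKEN` into their environment from a Nomad Variable. If your cluster does not have it yet, store the token before deploying the plugin; a minimal sketch, assuming the variable path matches the `nomadVar` lookup in `nomad/hcloud-csi-controller.hcl`:
+
+```bash
+# Store the hcloud token as a Nomad Variable so the job templates can read it.
+# The path "secrets/hcloud" is an assumption; it must match the job specs.
+nomad var put secrets/hcloud hcloud_token="$HCLOUD_TOKEN"
+```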
+
+## Skaffold
+
+Skaffold commands should be executed from the `csi-driver` root directory, with the `-f` flag pointing at the Nomad-specific `skaffold.yaml`:
+
+```bash
+skaffold -f nomad/skaffold.yaml build
+```
+
+Skaffold has no native support for Nomad, so we use post-build hooks in `nomad/skaffold.yaml` to deploy or redeploy the CSI plugin after every build. Deleting the CSI plugin requires running `stop_nomad.sh` manually:
+
+```bash
+bash ./nomad/stop_nomad.sh
+```
+
+## E2E Tests
+
+The Nomad e2e tests are located in `test/e2e/nomad` and require a running development environment.
+
+1. Deploy the csi-driver:
+
+```bash
+skaffold -f nomad/skaffold.yaml build
+```
+
+2. Run the e2e tests:
+
+```bash
+go test -v -tags e2e ./test/e2e/nomad/...
+```
diff --git a/nomad/dev/.gitignore b/nomad/dev/.gitignore
new file mode 100644
index 00000000..3aa31043
--- /dev/null
+++ b/nomad/dev/.gitignore
@@ -0,0 +1,6 @@
+.terraform/
+terraform.tfstate*
+.terraform.tfstate*
+*.auto.tfvars
+
+files/
diff --git a/nomad/dev/.terraform.lock.hcl b/nomad/dev/.terraform.lock.hcl
new file mode 100644
index 00000000..90045e04
--- /dev/null
+++ b/nomad/dev/.terraform.lock.hcl
@@ -0,0 +1,76 @@
+# This file is maintained automatically by "tofu init".
+# Manual edits may be lost in future updates.
+
+provider "registry.opentofu.org/hashicorp/external" {
+  version     = "2.3.4"
+  constraints = "2.3.4"
+  hashes = [
+    "h1:saRbzhRhYh4urj+ARe8vIB0mlRspy6E/fPSyvwqjUW8=",
+    "zh:0e5eb3513d6ad5cc3196799a6e413c6a9c0b642ba6d8f84fc11efa48f58358a4",
+    "zh:1658beae42b4614d4009a3191710c86571ccf1dc526c4bac520a87ab701dd2e9",
+    "zh:28d937c13e90c170fc1e4b726a9bcf113aade53b95b3eccd335cd9eaba8acff5",
+    "zh:2ac19917bb83233f24391d4cbaf301bb6ec14013d3b7f93bdf64823280622daa",
+    "zh:3263d1808fc5252d586a9aa98d89086c912f53e1a3dc43bc5306364b358f04fa",
+    "zh:463469836637342495e22a936ef9ab3c8ab2fb47eb0fae09c346d63f3331af59",
+    "zh:53398a27492cd11f61b2f24c2601c12f50c39da32b90fd25aed7011a1e25a225",
+    "zh:5a44cfbcef52fd3c970144a69a934cab320bd3bb57939ae4682fc516783e2996",
+    "zh:65edb579d9d0dac42e77728d81da6e85ea30d3fe8f2cb6e5db82602ee8afa17e",
+    "zh:f2edd3027b7ae0d31a690fd5dcdcd22b467b4f1e045f84f2bc88289353ef9a5b",
+  ]
+}
+
+provider "registry.opentofu.org/hashicorp/local" {
+  version = "2.5.2"
+  hashes = [
+    "h1:6lS+5A/4WFAqY3/RHWFRBSiFVLPRjvLaUgxPQvjXLHU=",
+    "zh:25b95b76ceaa62b5c95f6de2fa6e6242edbf51e7fc6c057b7f7101aa4081f64f",
+    "zh:3c974fdf6b42ca6f93309cf50951f345bfc5726ec6013b8832bcd3be0eb3429e",
+    "zh:5de843bf6d903f5cca97ce1061e2e06b6441985c68d013eabd738a9e4b828278",
+    "zh:86beead37c7b4f149a54d2ae633c99ff92159c748acea93ff0f3603d6b4c9f4f",
+    "zh:8e52e81d3dc50c3f79305d257da7fde7af634fed65e6ab5b8e214166784a720e",
+    "zh:9882f444c087c69559873b2d72eec406a40ede21acb5ac334d6563bf3a2387df",
+    "zh:a4484193d110da4a06c7bffc44cc6b61d3b5e881cd51df2a83fdda1a36ea25d2",
+    "zh:a53342426d173e29d8ee3106cb68abecdf4be301a3f6589e4e8d42015befa7da",
+    "zh:d25ef2aef6a9004363fc6db80305d30673fc1f7dd0b980d41d863b12dacd382a",
+    "zh:fa2d522fb323e2121f65b79709fd596514b293d816a1d969af8f72d108888e4c",
+  ]
+}
+
+provider "registry.opentofu.org/hashicorp/tls" {
+  version = "4.0.6"
+  hashes = [
+    "h1:EJoUGDo7L52Iu22cA1KCndJ9B1Rrfd75wyZzsScEnc0=",
+    "zh:4b53b372767e5068d9bbfc89199201c1ae4283dde2f0c301974f8abb4215791f",
+    "zh:5b4c308bd074c6d0bd560220e6ee10a9859ca9a1f29a59367b0477a740ff265e",
+    "zh:674dd6bc85597677e160ee601d88b21c5a974759a658769812d2904bd94bc042",
+    
"zh:6ccc1c448349b56677ba66112aec7e0a58eb827f66209ca5f4077b81cce240fb", + "zh:8aa6e13a5d722b74230937ea21e8b4994e53340d95b5691cf6cf3518b9f38e6e", + "zh:8b27e55e4c7fa887774860113b95c8f7f68804b002fa47f0eb8e3a485997287e", + "zh:a430b5a3e8753d8f61784de49e538ac4abed19fb665fccd8a10b55402fe9f076", + "zh:b07c978c335ae9fc12f9c221629610775e4ae36691ed4e7ba258d275dd58a243", + "zh:bbec8cb1efc84ee3026c793956a4a4cd0ece20b89d2d4f7d954c68e7f6d596d0", + "zh:e684e247424188dc3b500a543b1a8046d1c0ec08c2a90aedca0c4f6bb56bedbd", + ] +} + +provider "registry.opentofu.org/hetznercloud/hcloud" { + version = "1.49.1" + constraints = "~> 1.45" + hashes = [ + "h1:FKGRNHVbcfQJd8EWrb8Ze5QHkaGr8zI+ZKxBMjvOwPk=", + "zh:3d5f9773da4f8203cf625d04a5a0e4ff7e202684c010a801a945756140c61cde", + "zh:446305d492017cda91e5c15122ec16ff15bfe3ef4d3fd6bcea0cdf7742ab1b86", + "zh:44d4f9156ed8b4f0444bd4dc456825940be49048828565964a192286d28c9f20", + "zh:492ad893d2f89bb17c9beb877c8ceb4a16caf39db1a79030fefeada6c7aa217f", + "zh:68dc552c19ad9d209ec6018445df6e06fb77a637513a53cc66ddce1b024082be", + "zh:7492495ffda6f6c49ab38b539bd2eb965b1150a63fb6b191a27dec07d17601cb", + "zh:850fe92005981ea00db86c3e49ba5b49732fdf1f7bd5530a68f6e272847059fc", + "zh:8cb67f744c233acfb1d68a6c27686315439d944edf733b95f113b4aa63d86713", + "zh:8e13dac46e8c2497772ed1baee701b1d1c26bcc95a63b5c4566c83468f504868", + "zh:c44249c6a8ba931e208a334792686b5355ab2da465cadea03c1ea8e73c02db12", + "zh:d103125a28a85c89aea0cb0c534fe3f504416c4d4fc75c37364b9ec5f66dd77d", + "zh:ed8f64e826aa9bfca95b72892271678cb78411b40d7b404a52404141e05a4ab1", + "zh:f40efad816de00b279bd1e2cbf62c76b0e5b2da150a0764f259984b318e30945", + "zh:f5e912d0873bf4ecc43feba4ceccdf158048080c76d557e47f34749139fdd452", + ] +} diff --git a/nomad/dev/Makefile b/nomad/dev/Makefile new file mode 100644 index 00000000..fb88e42d --- /dev/null +++ b/nomad/dev/Makefile @@ -0,0 +1,26 @@ +SHELL = bash + +ENV ?= dev + +env.auto.tfvars: + @echo 'hcloud_token = "$(HCLOUD_TOKEN)"' >> "$@" + +.terraform: + tofu init + +validate: .terraform + tofu validate + +up: .terraform env.auto.tfvars + tofu apply -auto-approve + $(MAKE) port-forward + +down: .terraform env.auto.tfvars + if test -f files/registry-port-forward.sh; then files/registry-port-forward.sh down; fi + tofu destroy -auto-approve + +port-forward: + files/registry-port-forward.sh up + +clean: + rm -Rf files/ .terraform/ terraform.tfstate* env.auto.tfvars diff --git a/nomad/dev/main.tf b/nomad/dev/main.tf new file mode 100644 index 00000000..95cb75ba --- /dev/null +++ b/nomad/dev/main.tf @@ -0,0 +1,5 @@ +module "dev" { + source = "github.com/hetznercloud/nomad-dev-env?ref=v0.1.0" # renovate: datasource=github-releases depName=hetznercloud/nomad-dev-env + + hcloud_token = var.hcloud_token +} diff --git a/nomad/dev/variables.tf b/nomad/dev/variables.tf new file mode 100644 index 00000000..4c0c0b86 --- /dev/null +++ b/nomad/dev/variables.tf @@ -0,0 +1,3 @@ +variable "hcloud_token" { + sensitive = true +} diff --git a/nomad/hcloud-csi-controller.hcl b/nomad/hcloud-csi-controller.hcl new file mode 100644 index 00000000..b45659be --- /dev/null +++ b/nomad/hcloud-csi-controller.hcl @@ -0,0 +1,55 @@ +job "hcloud-csi-controller" { + datacenters = ["dc1"] + namespace = "default" + type = "service" + + group "controller" { + count = 1 + + constraint { + distinct_hosts = true + } + + update { + max_parallel = 1 + canary = 1 + min_healthy_time = "10s" + healthy_deadline = "1m" + auto_revert = true + auto_promote = true + } + + task "plugin" { + driver = "docker" + + config { + image = 
"$SKAFFOLD_IMAGE" + command = "bin/hcloud-csi-driver-controller" + } + + env { + CSI_ENDPOINT = "unix://csi/csi.sock" + ENABLE_METRICS = true + } + + template { + data = < /dev/null; then + nomad job stop -purge hcloud-csi-controller +fi + +controller="$(mktemp)" +envsubst < "./nomad/hcloud-csi-controller.hcl" > "$controller" +sed -i 's/localhost:30666/docker-registry.service.consul:5000/' "$controller" +nomad job run "$controller" + +if nomad job inspect hcloud-csi-node > /dev/null; then + nomad job stop -purge hcloud-csi-node +fi + +node="$(mktemp)" +envsubst < "./nomad/hcloud-csi-node.hcl" > "$node" +sed -i 's/localhost:30666/docker-registry.service.consul:5000/' "$node" +nomad job run "$node" diff --git a/nomad/stop_nomad.sh b/nomad/stop_nomad.sh new file mode 100755 index 00000000..644cbf7a --- /dev/null +++ b/nomad/stop_nomad.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -euo pipefail + +nomad job stop -purge hcloud-csi-controller +echo "Deleted hcloud-csi-controller" + +nomad job stop -purge hcloud-csi-node +echo "Deleted hcloud-csi-node" diff --git a/test/e2e/nomad/e2e_test.go b/test/e2e/nomad/e2e_test.go new file mode 100644 index 00000000..7d19fa84 --- /dev/null +++ b/test/e2e/nomad/e2e_test.go @@ -0,0 +1,233 @@ +//go:build e2e + +package e2e + +import ( + "context" + "fmt" + "os" + "strconv" + "testing" + + nomad "github.com/hashicorp/nomad/api" + "github.com/hetznercloud/csi-driver/internal/driver" + "github.com/stretchr/testify/assert" +) + +const ( + ResizedCapacity = 11811160064 // 11GiB + ResizedCapacityGB = 11 +) + +var cluster *Cluster + +func TestMain(m *testing.M) { + var err error + cluster, err = NewCluster() + if err != nil { + fmt.Printf("%v\n", err) + os.Exit(1) + } + + exitCode := m.Run() + + if err := cluster.Cleanup(); err != nil { + fmt.Printf("%v\n", err) + os.Exit(1) + } + + os.Exit(exitCode) +} + +func TestGetPluginInfo(t *testing.T) { + plugin, _, err := cluster.nomadClient.CSIPlugins().Info(driver.PluginName, &nomad.QueryOptions{}) + if err != nil { + t.Error(err) + } + + assert.NotNil(t, plugin, "Expected plugin from Nomad to be not nil") + + assert.Equalf( + t, + plugin.Version, + driver.PluginVersion, + "Expected plugin version %s, but got %s", + driver.PluginVersion, + plugin.Version, + ) +} + +func TestVolumeLifecycle(t *testing.T) { + volReq := CreateVolumeSpec("db-vol") + + var hcloudVolID int64 + t.Run("volume creation", func(t *testing.T) { + vol, _, err := cluster.CreateVolume(volReq, &nomad.WriteOptions{}) + if err != nil { + t.Error(err) + } + assert.Len(t, vol, 1) + + hcloudVolID, err = strconv.ParseInt(vol[0].ExternalID, 10, 64) + if err != nil { + t.Error(err) + } + + hcloudVolume, _, err := cluster.hcloudClient.Volume.GetByID(context.Background(), hcloudVolID) + if err != nil { + t.Error(err) + } + + assert.NotNilf(t, hcloudVolume, "could not find volume with ID %d on hcloud", hcloudVolID) + }) + + t.Run("volume resize", func(t *testing.T) { + volReq.RequestedCapacityMin = ResizedCapacity + + _, _, err := cluster.nomadClient.CSIVolumes().Create(volReq, &nomad.WriteOptions{}) + if err != nil { + t.Error(err) + } + + hcloudVolume, _, err := cluster.hcloudClient.Volume.GetByID(context.Background(), hcloudVolID) + if err != nil { + t.Error(err) + } + + if assert.NotNilf(t, hcloudVolume, "could not find volume with ID %d on hcloud", hcloudVolID) { + assert.Equalf( + t, + hcloudVolume.Size, + ResizedCapacityGB, + "Expected vol size %d, but got %d", + ResizedCapacityGB, + hcloudVolume.Size, + ) + } + }) + + t.Run("volume deletion", func(t 
diff --git a/test/e2e/nomad/e2e_test.go b/test/e2e/nomad/e2e_test.go
new file mode 100644
index 00000000..7d19fa84
--- /dev/null
+++ b/test/e2e/nomad/e2e_test.go
@@ -0,0 +1,233 @@
+//go:build e2e
+
+package e2e
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"strconv"
+	"testing"
+
+	nomad "github.com/hashicorp/nomad/api"
+	"github.com/hetznercloud/csi-driver/internal/driver"
+	"github.com/stretchr/testify/assert"
+)
+
+const (
+	ResizedCapacity   = 11811160064 // 11GiB
+	ResizedCapacityGB = 11
+)
+
+var cluster *Cluster
+
+func TestMain(m *testing.M) {
+	var err error
+	cluster, err = NewCluster()
+	if err != nil {
+		fmt.Printf("%v\n", err)
+		os.Exit(1)
+	}
+
+	exitCode := m.Run()
+
+	if err := cluster.Cleanup(); err != nil {
+		fmt.Printf("%v\n", err)
+		os.Exit(1)
+	}
+
+	os.Exit(exitCode)
+}
+
+func TestGetPluginInfo(t *testing.T) {
+	plugin, _, err := cluster.nomadClient.CSIPlugins().Info(driver.PluginName, &nomad.QueryOptions{})
+	if err != nil {
+		t.Error(err)
+	}
+
+	if !assert.NotNil(t, plugin, "Expected plugin from Nomad to be not nil") {
+		return
+	}
+
+	assert.Equalf(
+		t,
+		driver.PluginVersion,
+		plugin.Version,
+		"Expected plugin version %s, but got %s",
+		driver.PluginVersion,
+		plugin.Version,
+	)
+}
+
+func TestVolumeLifecycle(t *testing.T) {
+	volReq := CreateVolumeSpec("db-vol")
+
+	var hcloudVolID int64
+	t.Run("volume creation", func(t *testing.T) {
+		vol, _, err := cluster.CreateVolume(volReq, &nomad.WriteOptions{})
+		if err != nil {
+			t.Error(err)
+		}
+		if !assert.Len(t, vol, 1) {
+			return
+		}
+
+		hcloudVolID, err = strconv.ParseInt(vol[0].ExternalID, 10, 64)
+		if err != nil {
+			t.Error(err)
+		}
+
+		hcloudVolume, _, err := cluster.hcloudClient.Volume.GetByID(context.Background(), hcloudVolID)
+		if err != nil {
+			t.Error(err)
+		}
+
+		assert.NotNilf(t, hcloudVolume, "could not find volume with ID %d on hcloud", hcloudVolID)
+	})
+
+	t.Run("volume resize", func(t *testing.T) {
+		volReq.RequestedCapacityMin = ResizedCapacity
+
+		_, _, err := cluster.nomadClient.CSIVolumes().Create(volReq, &nomad.WriteOptions{})
+		if err != nil {
+			t.Error(err)
+		}
+
+		hcloudVolume, _, err := cluster.hcloudClient.Volume.GetByID(context.Background(), hcloudVolID)
+		if err != nil {
+			t.Error(err)
+		}
+
+		if assert.NotNilf(t, hcloudVolume, "could not find volume with ID %d on hcloud", hcloudVolID) {
+			assert.Equalf(
+				t,
+				ResizedCapacityGB,
+				hcloudVolume.Size,
+				"Expected vol size %d, but got %d",
+				ResizedCapacityGB,
+				hcloudVolume.Size,
+			)
+		}
+	})
+
+	t.Run("volume deletion", func(t *testing.T) {
+		err := cluster.DeleteVolume(volReq.ID, &nomad.WriteOptions{})
+		if err != nil {
+			t.Error(err)
+		}
+
+		hcloudVolume, _, err := cluster.hcloudClient.Volume.GetByID(context.Background(), hcloudVolID)
+		if err != nil {
+			t.Error(err)
+		}
+
+		assert.Nil(t, hcloudVolume, "hcloud volume was deleted in nomad, but still exists")
+	})
+}
+
+func TestVolumeWrite(t *testing.T) {
+	volID := "test-vol"
+	jobID := "test-writer"
+	volReq := CreateVolumeSpec(volID)
+	job := CreateBusyboxWithVolumeJobSpec(jobID, volID, "/test")
+
+	t.Run("create volume", func(t *testing.T) {
+		vol, _, err := cluster.CreateVolume(volReq, &nomad.WriteOptions{})
+		if err != nil {
+			t.Error(err)
+		}
+		assert.Len(t, vol, 1)
+	})
+
+	// Used to ensure that the job for verifying the data is scheduled on another node
+	var previousNodeID string
+	t.Run("write to volume", func(t *testing.T) {
+		_, _, err := cluster.nomadClient.Jobs().Register(job, &nomad.WriteOptions{})
+		if err != nil {
+			t.Error(err)
+		}
+		defer func() {
+			_, _, err = cluster.nomadClient.Jobs().Deregister(*job.ID, true, &nomad.WriteOptions{})
+			if err != nil {
+				t.Error(err)
+			}
+		}()
+
+		allocStub, err := cluster.WaitForRunningJob(*job.ID)
+		if err != nil {
+			t.Error(err)
+			return
+		}
+
+		previousNodeID = allocStub.NodeID
+
+		alloc, _, err := cluster.nomadClient.Allocations().Info(allocStub.ID, &nomad.QueryOptions{})
+		if err != nil {
+			t.Error(err)
+			return
+		}
+
+		exitCode, err := cluster.ExecInAlloc(alloc, jobID, []string{
+			"dd",
+			"if=/dev/random",
+			"of=/test/data",
+			"bs=1M",
+			"count=1",
+		})
+		if err != nil {
+			t.Error(err)
+		}
+		assert.Equalf(t, 0, exitCode, "could not write test data - exit code: %d", exitCode)
+	})
+
+	t.Run("verify volume data", func(t *testing.T) {
+		// Try to schedule the verification job on a different node than the writer.
+		affinity := &nomad.Affinity{
+			LTarget: "${node.unique.id}",
+			RTarget: previousNodeID,
+			Operand: "!=",
+		}
+		job.Affinities = append(job.Affinities, affinity)
+
+		_, _, err := cluster.nomadClient.Jobs().Register(job, &nomad.WriteOptions{})
+		if err != nil {
+			t.Error(err)
+		}
+		defer func() {
+			_, _, err = cluster.nomadClient.Jobs().Deregister(*job.ID, true, &nomad.WriteOptions{})
+			if err != nil {
+				t.Error(err)
+			}
+		}()
+
+		allocStub, err := cluster.WaitForRunningJob(*job.ID)
+		if err != nil {
+			t.Error(err)
+			return
+		}
+
+		alloc, _, err := cluster.nomadClient.Allocations().Info(allocStub.ID, &nomad.QueryOptions{})
+		if err != nil {
+			t.Error(err)
+			return
+		}
+
+		// verify that the file exists and has a size greater than zero
+		exitCode, err := cluster.ExecInAlloc(alloc, jobID, []string{
+			"test",
+			"-s",
+			"/test/data",
+		})
+		if err != nil {
+			t.Error(err)
+		}
+		assert.Equalf(t, 0, exitCode, "could not verify test data - exit code: %d", exitCode)
+	})
+
+	t.Run("delete volume", func(t *testing.T) {
+		// Retry the deletion, as the volume can still be in use for a couple of
+		// seconds after the job was deleted, which results in an internal server error.
+		var err error
+		for i := range 10 {
+			if err = cluster.DeleteVolume(volReq.ID, &nomad.WriteOptions{}); err == nil {
+				break
+			}
+			backoffSleep(i)
+		}
+		if err != nil {
+			t.Error(err)
+		}
+	})
+}
diff --git a/test/e2e/nomad/utils.go b/test/e2e/nomad/utils.go
new file mode 100644
index 00000000..81794381
--- /dev/null
+++ b/test/e2e/nomad/utils.go
@@ -0,0 +1,230 @@
+package e2e
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"os"
+	"sync"
+	"time"
+
+	nomad "github.com/hashicorp/nomad/api"
+
+	"github.com/hetznercloud/hcloud-go/v2/hcloud"
+)
+
+const InitialVolumeCapacity = 10737418240 // 10GiB
+
+type Cluster struct {
+	hcloudClient 
*hcloud.Client + nomadClient *nomad.Client + + volumesCreated map[string]struct{} + lock sync.Mutex +} + +func NewCluster() (*Cluster, error) { + token := os.Getenv("HCLOUD_TOKEN") + if token == "" { + return nil, fmt.Errorf("HCLOUD_TOKEN env variable is not set") + } + + hcloudOpts := []hcloud.ClientOption{ + hcloud.WithToken(token), + hcloud.WithApplication("nomad-csi-e2e", ""), + hcloud.WithPollOpts(hcloud.PollOpts{ + BackoffFunc: hcloud.ExponentialBackoffWithOpts(hcloud.ExponentialBackoffOpts{ + Base: time.Second, + Multiplier: 2, + Cap: 10 * time.Second, + }), + }), + } + + hcloudClient := hcloud.NewClient(hcloudOpts...) + if hcloudClient == nil { + return nil, fmt.Errorf("hcloud client could not be initialized") + } + + nomadAddr := os.Getenv("NOMAD_ADDR") + if nomadAddr == "" { + return nil, fmt.Errorf("NOMAD_ADDR env variable is not set") + } + nomadCACert := os.Getenv("NOMAD_CACERT") + if nomadCACert == "" { + return nil, fmt.Errorf("NOMAD_CACERT env variable is not set") + } + nomadClientCert := os.Getenv("NOMAD_CLIENT_CERT") + if nomadClientCert == "" { + return nil, fmt.Errorf("NOMAD_CLIENT_CERT env variable is not set") + } + nomadClientKey := os.Getenv("NOMAD_CLIENT_KEY") + if nomadClientKey == "" { + return nil, fmt.Errorf("NOMAD_CLIENT_KEY env variable is not set") + } + + nomadConfig := nomad.DefaultConfig() + + nomadClient, err := nomad.NewClient(nomadConfig) + if err != nil { + return nil, err + } + + return &Cluster{ + hcloudClient: hcloudClient, + nomadClient: nomadClient, + volumesCreated: make(map[string]struct{}), + lock: sync.Mutex{}, + }, nil +} + +func (cluster *Cluster) Cleanup() []error { + var cleanupErrors []error + + for volName := range cluster.volumesCreated { + vol, _, err := cluster.hcloudClient.Volume.GetByName(context.Background(), volName) + if err != nil { + cleanupErrors = append(cleanupErrors, err) + continue + } + if vol == nil { + cleanupErrors = append(cleanupErrors, fmt.Errorf("volume %s not found on hcloud", volName)) + continue + } + _, err = cluster.hcloudClient.Volume.Delete(context.Background(), vol) + if err != nil { + cleanupErrors = append(cleanupErrors, err) + } + } + + return cleanupErrors +} + +func (cluster *Cluster) CreateVolume(volReq *nomad.CSIVolume, w *nomad.WriteOptions) ([]*nomad.CSIVolume, *nomad.WriteMeta, error) { + vol, meta, err := cluster.nomadClient.CSIVolumes().Create(volReq, w) + if err != nil { + return nil, nil, err + } + + cluster.lock.Lock() + defer cluster.lock.Unlock() + + cluster.volumesCreated[volReq.ID] = struct{}{} + + return vol, meta, err +} + +func (cluster *Cluster) DeleteVolume(externalVolID string, w *nomad.WriteOptions) error { + err := cluster.nomadClient.CSIVolumes().Delete(externalVolID, w) + if err != nil { + return err + } + + cluster.lock.Lock() + defer cluster.lock.Unlock() + + delete(cluster.volumesCreated, externalVolID) + + return nil +} + +func (cluster *Cluster) ExecInAlloc(alloc *nomad.Allocation, task string, command []string) (int, error) { + exitCode, err := cluster.nomadClient.Allocations().Exec( + context.Background(), + alloc, + task, + true, + command, + os.Stdin, + os.Stdout, + os.Stderr, + make(<-chan nomad.TerminalSize), + &nomad.QueryOptions{}, + ) + if err != nil { + return exitCode, err + } + return exitCode, nil +} + +func (cluster *Cluster) WaitForRunningJob(jobID string) (*nomad.AllocationListStub, error) { + for retry := range 10 { + allocs, _, err := cluster.nomadClient.Jobs().Allocations( + jobID, + false, + &nomad.QueryOptions{}, + ) + if err != nil { + return nil, 
err + } + + for _, alloc := range allocs { + if alloc.ClientStatus == "running" { + return alloc, nil + } + } + + backoffSleep(retry) + } + return nil, fmt.Errorf("no running allocation for job %s", jobID) +} + +func CreateVolumeSpec(id string) *nomad.CSIVolume { + return &nomad.CSIVolume{ + ID: id, + Name: id, + Namespace: "default", + PluginID: "csi.hetzner.cloud", + RequestedCapacityMin: InitialVolumeCapacity, + RequestedCapabilities: []*nomad.CSIVolumeCapability{ + { + AccessMode: "single-node-writer", + AttachmentMode: "file-system", + }, + }, + MountOptions: &nomad.CSIMountOptions{ + FSType: "ext4", + MountFlags: []string{ + "discard", + "defaults", + }, + }, + } +} + +func CreateBusyboxWithVolumeJobSpec(id string, volumeID string, mountPath string) *nomad.Job { + job := nomad.NewServiceJob(id, id, "global", 50) + taskGroup := nomad.NewTaskGroup(id, 1) + + taskGroup.Volumes = map[string]*nomad.VolumeRequest{ + volumeID: { + Name: volumeID, + ReadOnly: false, + Type: "csi", + Source: volumeID, + AttachmentMode: "file-system", + AccessMode: "single-node-writer", + PerAlloc: false, + }, + } + + task := nomad.NewTask(id, "docker") + task = task.SetConfig("image", "busybox:stable") + task = task.SetConfig("command", "sleep") + task = task.SetConfig("args", []string{"3600"}) + + task.VolumeMounts = append(task.VolumeMounts, &nomad.VolumeMount{ + Volume: &volumeID, + Destination: &mountPath, + }) + + taskGroup = taskGroup.AddTask(task) + job = job.AddTaskGroup(taskGroup) + return job +} + +func backoffSleep(retry int) { + delay := math.Pow(2, float64(retry)) + delay = math.Min(delay, 16) + time.Sleep(time.Second * time.Duration(delay)) +}
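
For local debugging it is often handy to run a single test instead of the whole suite. `NewCluster()` fails fast unless `HCLOUD_TOKEN` and the `NOMAD_*` connection variables are set, so load the dev environment configuration first; a sketch:

```bash
# Provided by the dev environment (NOMAD_ADDR, NOMAD_CACERT,
# NOMAD_CLIENT_CERT, NOMAD_CLIENT_KEY).
source nomad/dev/files/env.sh

# Must be set explicitly; NewCluster() refuses to start without it.
export HCLOUD_TOKEN="<your-token>"

# Standard `go test -run` filtering narrows the run to one test.
go test -v -tags e2e -run TestVolumeLifecycle ./test/e2e/nomad/...
```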